From 909200cb473a55ca9f0fb4416c9dc8e47d6a8ceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Sat, 16 Nov 2019 09:35:44 +0100 Subject: [PATCH 1/4] BUG: resolved problem with DataFrame.equals() wrongly returning True (#28839) The function was returning True in case shown in added test. The cause of the problem was sorting Blocks of DataFrame by type, and then mgr_locs before comparison. It resulted in arranging the identical blocks in the same way, which resulted in having the same two lists of blocks. Changing sorting order to (mgr_locs, type) resolves the problem, while not interrupting the other aspects of comparison. --- pandas/core/internals/managers.py | 2 +- pandas/tests/internals/test_internals.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8a9410c076f9b..1be41298d8764 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1399,7 +1399,7 @@ def equals(self, other): # blocks (say, Categorical) which can only be distinguished by # the iteration order def canonicalize(block): - return (block.dtype.name, block.mgr_locs.as_array.tolist()) + return (block.mgr_locs.as_array.tolist(), block.dtype.name) self_blocks = sorted(self.blocks, key=canonicalize) other_blocks = sorted(other.blocks, key=canonicalize) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index dbd84f15d143c..fc08d6cef3ff8 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1297,3 +1297,9 @@ def test_make_block_no_pandas_array(): result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype) assert result.is_integer is True assert result.is_extension is False + + +def test_dataframe_not_equal(): + df1 = pd.DataFrame({"a": [1, 2], "b": ["s", "d"]}) + df2 = pd.DataFrame({"a": ["s", "d"], "b": [1, 2]}) + assert df1.equals(df2) is False From 7f9fc44f68fecd7b776fe80f4c03d6f0e63f418a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Mon, 18 Nov 2019 15:30:20 +0100 Subject: [PATCH 2/4] Added issue number to test and changed comment in equals() function --- pandas/core/internals/managers.py | 4 ++-- pandas/tests/internals/test_internals.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1be41298d8764..0e6ba8a2c2a6a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1394,8 +1394,8 @@ def equals(self, other): if len(self.blocks) != len(other.blocks): return False - # canonicalize block order, using a tuple combining the type - # name and then mgr_locs because there might be unconsolidated + # canonicalize block order, using a tuple combining the mgr_locs + # then type name because there might be unconsolidated # blocks (say, Categorical) which can only be distinguished by # the iteration order def canonicalize(block): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index fc08d6cef3ff8..c98bdab0df766 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1300,6 +1300,7 @@ def test_make_block_no_pandas_array(): def test_dataframe_not_equal(): + # see GH28839 df1 = pd.DataFrame({"a": [1, 2], "b": ["s", "d"]}) df2 = pd.DataFrame({"a": ["s", "d"], "b": [1, 2]}) assert df1.equals(df2) is False From 434d6563b7077485fbda6a7436ff28ab46c8f8bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Mon, 18 Nov 2019 15:59:22 +0100 Subject: [PATCH 3/4] Added whatsnew entry --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 30a828064f812..f33c507fd3d1c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -451,6 +451,7 @@ Reshaping - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). - Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) - Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`) +- Bug where :meth:`DataFrame.equals` returned True incorrectly in some circumstances(:issue:`28839`) Sparse ^^^^^^ From b7087b6d092f259a1748872a62050110f3379f65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Mon, 18 Nov 2019 19:59:38 +0100 Subject: [PATCH 4/4] Minor change to whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f33c507fd3d1c..bf30f2d356b44 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -451,7 +451,7 @@ Reshaping - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). - Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) - Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`) -- Bug where :meth:`DataFrame.equals` returned True incorrectly in some circumstances(:issue:`28839`) +- Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`) Sparse ^^^^^^