From 5970d4111a54975d8c5e519f4d4f90673e9b6708 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Tue, 1 Dec 2020 11:02:43 +0700 Subject: [PATCH 1/9] PERF: fix assert_frame_equal can be very slow --- pandas/_testing.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index bfff4301c2220..a6839f8364da0 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1289,6 +1289,7 @@ def assert_series_equal( rtol=1.0e-5, atol=1.0e-8, obj="Series", + check_index=True, ): """ Check that left and right Series are equal. @@ -1348,6 +1349,10 @@ def assert_series_equal( obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. + check_index : bool, default True + Whether to check index equivalence. If False, then compare only values. + + .. versionadded:: 1.2.0 Examples -------- @@ -1383,18 +1388,20 @@ def assert_series_equal( if check_flags: assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.index", - ) + if check_index: + # GH #38183 + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.index", + ) + if check_freq and isinstance(left.index, (pd.DatetimeIndex, pd.TimedeltaIndex)): lidx = left.index ridx = right.index @@ -1703,6 +1710,7 @@ def assert_frame_equal( obj=f'{obj}.iloc[:, {i}] (column name="{col}")', rtol=rtol, atol=atol, + check_index=False, ) From fd5dc6545a0a331db9c1828d15d9fa94c90e59b1 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Dec 2020 00:43:46 +0700 Subject: [PATCH 2/9] REF: extract function _assert_series_values_equal --- pandas/_testing.py | 72 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index a6839f8364da0..8ecc2d316d7de 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1289,7 +1289,6 @@ def assert_series_equal( rtol=1.0e-5, atol=1.0e-8, obj="Series", - check_index=True, ): """ Check that left and right Series are equal. @@ -1349,10 +1348,6 @@ def assert_series_equal( obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. - check_index : bool, default True - Whether to check index equivalence. If False, then compare only values. - - .. versionadded:: 1.2.0 Examples -------- @@ -1363,6 +1358,55 @@ def assert_series_equal( """ __tracebackhide__ = True + _assert_series_values_equal( + left, + right, + check_dtype=check_dtype, + check_series_type=check_series_type, + check_less_precise=check_less_precise, + check_names=check_names, + check_exact=check_exact, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + check_category_order=check_category_order, + check_freq=check_freq, + check_flags=check_flags, + rtol=rtol, + atol=atol, + obj=obj, + ) + + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.index", + ) + + +def _assert_series_values_equal( + left, + right, + *, + check_dtype=True, + check_series_type=True, + check_less_precise=no_default, + check_names=True, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + check_category_order=True, + check_freq=True, + check_flags=True, + rtol=1.0e-5, + atol=1.0e-8, + obj="Series", +): if check_less_precise is not no_default: warnings.warn( "The 'check_less_precise' keyword in testing.assert_*_equal " @@ -1388,20 +1432,6 @@ def assert_series_equal( if check_flags: assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" - if check_index: - # GH #38183 - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.index", - ) - if check_freq and isinstance(left.index, (pd.DatetimeIndex, pd.TimedeltaIndex)): lidx = left.index ridx = right.index @@ -1697,11 +1727,10 @@ def assert_frame_equal( assert col in right lcol = left.iloc[:, i] rcol = right.iloc[:, i] - assert_series_equal( + _assert_series_values_equal( lcol, rcol, check_dtype=check_dtype, - check_index_type=check_index_type, check_exact=check_exact, check_names=check_names, check_datetimelike_compat=check_datetimelike_compat, @@ -1710,7 +1739,6 @@ def assert_frame_equal( obj=f'{obj}.iloc[:, {i}] (column name="{col}")', rtol=rtol, atol=atol, - check_index=False, ) From e8fe687879e02468dde5b89c3e0cb5e5ace74e40 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Dec 2020 01:06:28 +0700 Subject: [PATCH 3/9] TST: add test with mock --- pandas/tests/util/test_assert_frame_equal.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 40d2763a13489..5989c663ce41c 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + import pytest import pandas as pd @@ -285,3 +287,15 @@ def test_allows_duplicate_labels(): with pytest.raises(AssertionError, match=" Date: Wed, 2 Dec 2020 11:05:18 +0700 Subject: [PATCH 4/9] Revert "REF: extract function _assert_series_values_equal" This reverts commit fd5dc6545a0a331db9c1828d15d9fa94c90e59b1. --- pandas/_testing.py | 72 ++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 50 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 8ecc2d316d7de..a6839f8364da0 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1289,6 +1289,7 @@ def assert_series_equal( rtol=1.0e-5, atol=1.0e-8, obj="Series", + check_index=True, ): """ Check that left and right Series are equal. @@ -1348,6 +1349,10 @@ def assert_series_equal( obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. + check_index : bool, default True + Whether to check index equivalence. If False, then compare only values. + + .. versionadded:: 1.2.0 Examples -------- @@ -1358,55 +1363,6 @@ def assert_series_equal( """ __tracebackhide__ = True - _assert_series_values_equal( - left, - right, - check_dtype=check_dtype, - check_series_type=check_series_type, - check_less_precise=check_less_precise, - check_names=check_names, - check_exact=check_exact, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - check_category_order=check_category_order, - check_freq=check_freq, - check_flags=check_flags, - rtol=rtol, - atol=atol, - obj=obj, - ) - - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.index", - ) - - -def _assert_series_values_equal( - left, - right, - *, - check_dtype=True, - check_series_type=True, - check_less_precise=no_default, - check_names=True, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - check_category_order=True, - check_freq=True, - check_flags=True, - rtol=1.0e-5, - atol=1.0e-8, - obj="Series", -): if check_less_precise is not no_default: warnings.warn( "The 'check_less_precise' keyword in testing.assert_*_equal " @@ -1432,6 +1388,20 @@ def _assert_series_values_equal( if check_flags: assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + if check_index: + # GH #38183 + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.index", + ) + if check_freq and isinstance(left.index, (pd.DatetimeIndex, pd.TimedeltaIndex)): lidx = left.index ridx = right.index @@ -1727,10 +1697,11 @@ def assert_frame_equal( assert col in right lcol = left.iloc[:, i] rcol = right.iloc[:, i] - _assert_series_values_equal( + assert_series_equal( lcol, rcol, check_dtype=check_dtype, + check_index_type=check_index_type, check_exact=check_exact, check_names=check_names, check_datetimelike_compat=check_datetimelike_compat, @@ -1739,6 +1710,7 @@ def assert_frame_equal( obj=f'{obj}.iloc[:, {i}] (column name="{col}")', rtol=rtol, atol=atol, + check_index=False, ) From f778a7d013f7a2a86d9608982e79e9e21ba37eb3 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 2 Dec 2020 11:12:26 +0700 Subject: [PATCH 5/9] Revert "TST: add test with mock" This reverts commit e8fe687879e02468dde5b89c3e0cb5e5ace74e40. --- pandas/tests/util/test_assert_frame_equal.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 5989c663ce41c..40d2763a13489 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -1,5 +1,3 @@ -from unittest.mock import patch - import pytest import pandas as pd @@ -287,15 +285,3 @@ def test_allows_duplicate_labels(): with pytest.raises(AssertionError, match=" Date: Wed, 2 Dec 2020 11:18:07 +0700 Subject: [PATCH 6/9] PERF: replace iloc with ixs --- pandas/_testing.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index a6839f8364da0..1c5e3d713fb4d 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1695,8 +1695,12 @@ def assert_frame_equal( else: for i, col in enumerate(left.columns): assert col in right - lcol = left.iloc[:, i] - rcol = right.iloc[:, i] + lcol = left._ixs(i, axis=1) + rcol = right._ixs(i, axis=1) + # GH #38183 + # use check_index=False, because we do not want to run + # assert_index_equal for each column, + # as we already checked it for the whole dataframe before. assert_series_equal( lcol, rcol, From 82cb7a39cf1e1495b1cadf27ae16c5ac9bf09665 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 3 Dec 2020 01:00:02 +0700 Subject: [PATCH 7/9] REF: make check_index keyword-only --- pandas/_testing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_testing.py b/pandas/_testing.py index 1c5e3d713fb4d..991dfe209cdd4 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1289,6 +1289,7 @@ def assert_series_equal( rtol=1.0e-5, atol=1.0e-8, obj="Series", + *, check_index=True, ): """ From 59bc277f819034cd1e34f99ff204f093e84aec06 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 24 Dec 2020 16:59:35 +0700 Subject: [PATCH 8/9] REF: revert to iloc --- pandas/_testing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 22789c1e73419..2f5bda2cf9513 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1710,8 +1710,8 @@ def assert_frame_equal( else: for i, col in enumerate(left.columns): assert col in right - lcol = left._ixs(i, axis=1) - rcol = right._ixs(i, axis=1) + lcol = left.iloc[:, i] + rcol = right.iloc[:, i] # GH #38183 # use check_index=False, because we do not want to run # assert_index_equal for each column, From 62058bf0284db00df8bad7b4e36bb8a5bb1a946e Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 24 Dec 2020 17:03:41 +0700 Subject: [PATCH 9/9] DOC: update versionadded to 1.3 --- pandas/_testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 2f5bda2cf9513..f96645b3805f0 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1358,7 +1358,7 @@ def assert_series_equal( check_index : bool, default True Whether to check index equivalence. If False, then compare only values. - .. versionadded:: 1.2.0 + .. versionadded:: 1.3.0 Examples --------