From bbb7425b36a0e98117f77dbae1e5537b90aa9b38 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Sun, 19 Jan 2025 00:48:12 -0800 Subject: [PATCH 1/5] BUG: Fixed TypeError for Series.isin() when large series and values contains NA (#60678) --- pandas/core/algorithms.py | 13 +++++++++++++ pandas/tests/series/methods/test_isin.py | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index eefe08859c1e9..471a0b0a1f984 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -23,6 +23,9 @@ iNaT, lib, ) + +from pandas._libs.missing import NA + from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -544,10 +547,20 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: # Ensure np.isin doesn't get object types or it *may* throw an exception # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), # isin is faster for small sizes + + # GH60678 + # Ensure values don't contain , otherwise it throws exception with np.in1d + values_contains_NA = False + + if values.size != 0: + vectorized_check = np.vectorize(lambda v: v is NA) + values_contains_NA = vectorized_check(values).any() + if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 and comps_array.dtype != object + and values_contains_NA == False ): # If the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index e997ae32cf2e2..5b1ff210d134c 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -211,6 +211,24 @@ def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype, data, values, expected", [ + ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]), + ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]), + 
("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]) +]) +def test_isin_large_series_and_pdNA(dtype, data, values, expected, monkeypatch): + # https://github.com/pandas-dev/pandas/issues/60678 + # combination of large series (> _MINIMUM_COMP_ARR_LEN elements) and + # values contains pdNA + min_isin_comp = 2 + ser = Series(data, dtype=dtype) + expected = pd.Series(expected, dtype="boolean") + + with monkeypatch.context() as m: + m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp) + result = ser.isin(values) + tm.assert_series_equal(result, expected) + def test_isin_complex_numbers(): # GH 17927 array = [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j] From 672575fb401f4ee3ea111404c2e834ec5df52b13 Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Sun, 19 Jan 2025 01:35:29 -0800 Subject: [PATCH 2/5] Add entry to whatsnew/v3.0.0.rst for bug fixing --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bf1b52d3a0957..125ac5fbaa30c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -804,6 +804,7 @@ Other - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all NA values. 
(:issue:`60688`) - Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`) +- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`) - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) From dbe3673c2cbbe98627d155e456762cc8bbe9aeac Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Sun, 19 Jan 2025 02:59:17 -0800 Subject: [PATCH 3/5] Replaced np.vectorize() with any() for minor performance improvement and add new test cases --- pandas/core/algorithms.py | 5 ++--- pandas/tests/series/methods/test_isin.py | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 471a0b0a1f984..c70044b3e52c8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -552,9 +552,8 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: # Ensure values don't contain , otherwise it throws exception with np.in1d values_contains_NA = False - if values.size != 0: - vectorized_check = np.vectorize(lambda v: v is NA) - values_contains_NA = vectorized_check(values).any() + if comps_array.dtype != object and len(values) <= 26: + values_contains_NA = any(v is NA for v in values) if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 5b1ff210d134c..a4d08c227cd24 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -214,7 +214,9 @@ def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch): 
@pytest.mark.parametrize("dtype, data, values, expected", [ ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]), ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]), - ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]) + ("boolean", [pd.NA, False, True], [pd.NA, True, 'a', 20], [True, False, True]), + ("boolean", [pd.NA, False, True], [], [False, False, False]), + ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]), ]) def test_isin_large_series_and_pdNA(dtype, data, values, expected, monkeypatch): # https://github.com/pandas-dev/pandas/issues/60678 From cb168261a88a0c038fffaf5b46253d11ba7810cc Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Sun, 19 Jan 2025 16:25:23 -0800 Subject: [PATCH 4/5] Fixed failed pre-commit.ci hooks : Formatting errors in algorithms.py, inconsistent-namespace-usage in test_isin.py, sorted whatsnew entry --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/algorithms.py | 19 +++++++++---------- pandas/tests/series/methods/test_isin.py | 24 ++++++++++++++---------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 125ac5fbaa30c..e0f9c1c479d7a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -800,11 +800,11 @@ Other - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`) - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) +- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. 
(:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all NA values. (:issue:`60688`) - Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`) -- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`) - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c70044b3e52c8..469434400d81f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -23,9 +23,7 @@ iNaT, lib, ) - from pandas._libs.missing import NA - from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -543,23 +541,24 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: elif isinstance(values.dtype, ExtensionDtype): return isin(np.asarray(comps_array), np.asarray(values)) - # GH16012 - # Ensure np.isin doesn't get object types or it *may* throw an exception - # Albeit hashmap has O(1) look-up (vs. 
O(logn) in sorted array), - # isin is faster for small sizes - # GH60678 # Ensure values don't contain , otherwise it throws exception with np.in1d + values_contains_NA = False - - if comps_array.dtype != object and len(values) <= 26: + + if comps_array.dtype != object and len(values) <= 26: values_contains_NA = any(v is NA for v in values) + # GH16012 + # Ensure np.isin doesn't get object types or it *may* throw an exception + # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), + # isin is faster for small sizes + if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 and comps_array.dtype != object - and values_contains_NA == False + and not values_contains_NA ): # If the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index a4d08c227cd24..4f8484252ba8f 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -211,26 +211,30 @@ def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("dtype, data, values, expected", [ - ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]), - ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]), - ("boolean", [pd.NA, False, True], [pd.NA, True, 'a', 20], [True, False, True]), - ("boolean", [pd.NA, False, True], [], [False, False, False]), - ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]), -]) +@pytest.mark.parametrize( + "dtype, data, values, expected", + [ + ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]), + ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]), + ("boolean", [pd.NA, False, True], [pd.NA, True, "a", 20], [True, False, True]), + ("boolean", [pd.NA, False, True], [], [False, False, False]), + ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]), + ], +) def 
test_isin_large_series_and_pdNA(dtype, data, values, expected, monkeypatch): # https://github.com/pandas-dev/pandas/issues/60678 - # combination of large series (> _MINIMUM_COMP_ARR_LEN elements) and - # values contains pdNA + # combination of large series (> _MINIMUM_COMP_ARR_LEN elements) and + # values contains pdNA min_isin_comp = 2 ser = Series(data, dtype=dtype) - expected = pd.Series(expected, dtype="boolean") + expected = Series(expected, dtype="boolean") with monkeypatch.context() as m: m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp) result = ser.isin(values) tm.assert_series_equal(result, expected) + def test_isin_complex_numbers(): # GH 17927 array = [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j] From c14e08d363714e463b46f87d12c98cc11d29a4fd Mon Sep 17 00:00:00 2001 From: Akshay Jain Date: Tue, 21 Jan 2025 15:22:37 -0800 Subject: [PATCH 5/5] Combined redundant if-statements to improve readability and performance --- pandas/core/algorithms.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 469434400d81f..aafd802b827a5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -541,24 +541,19 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: elif isinstance(values.dtype, ExtensionDtype): return isin(np.asarray(comps_array), np.asarray(values)) - # GH60678 - # Ensure values don't contain , otherwise it throws exception with np.in1d - - values_contains_NA = False - - if comps_array.dtype != object and len(values) <= 26: - values_contains_NA = any(v is NA for v in values) - # GH16012 # Ensure np.isin doesn't get object types or it *may* throw an exception # Albeit hashmap has O(1) look-up (vs. 
O(logn) in sorted array), # isin is faster for small sizes + # GH60678 + # Ensure values don't contain NA, otherwise it throws exception with np.in1d + if ( len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 and comps_array.dtype != object - and not values_contains_NA + and not any(v is NA for v in values) ): # If the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan