Merge remote-tracking branch 'upstream/main' into bitmask-backed

WillAyd · WillAyd · commit e987dafd6326 · 2023-08-28T09:25:58.000-04:00
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
@@ -124,7 +124,7 @@ jobs:
       run: |
         cd asv_bench
         asv machine --yes
-        asv run --quick --dry-run --strict --durations=30 --python=same
+        asv run --quick --dry-run --durations=30 --python=same
 
   build_docker_dev_environment:
     name: Build Docker Dev Environment
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -816,6 +816,7 @@ Reshaping
 - Bug in :func:`merge_asof` raising ``KeyError`` for extension dtypes (:issue:`52904`)
 - Bug in :func:`merge_asof` raising ``ValueError`` for data backed by read-only ndarrays (:issue:`53513`)
 - Bug in :func:`merge_asof` with ``left_index=True`` or ``right_index=True`` with mismatched index dtypes giving incorrect results in some cases instead of raising ``MergeError`` (:issue:`53870`)
+- Bug in :func:`merge` when merging on integer ``ExtensionDtype`` and float NumPy dtype raising ``TypeError`` (:issue:`46178`)
 - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when dist-like argument passed in (:issue:`51099`)
 - Bug in :meth:`DataFrame.combine_first` ignoring other's columns if ``other`` is empty (:issue:`53792`)
 - Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -153,7 +153,7 @@ Deprecations
 
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
--
+- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/meson.build b/meson.build
@@ -2,24 +2,19 @@
 project(
     'pandas',
     'c', 'cpp', 'cython',
-    version: run_command(['python', 'generate_version.py', '--print'], check: true).stdout().strip(),
+    version: run_command(['python3', 'generate_version.py', '--print'], check: true).stdout().strip(),
     license: 'BSD-3',
     meson_version: '>=1.0.1',
     default_options: [
-        # TODO: investigate, does meson try to compile against debug Python
-        # when buildtype = debug, this seems to be causing problems on CI
-        # where provided Python is not compiled in debug mode
         'buildtype=release',
         # TODO: Reactivate werror, some warnings on Windows
         #'werror=true',
         'c_std=c99'
     ]
 )
 
-py_mod = import('python')
 fs = import('fs')
-py = py_mod.find_installation('python')
-py_dep = py.dependency()
+py = import('python').find_installation()
 tempita = files('generate_pxi.py')
 versioneer = files('generate_version.py')
 
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -375,7 +375,7 @@ cdef class IndexEngine:
         # map each starget to its position in the index
         if (
                 stargets and
-                len(stargets) < 5 and
+                len(stargets) < (n / (2 * n.bit_length())) and
                 not na_in_stargets and
                 self.is_monotonic_increasing
         ):
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -196,6 +196,7 @@ def array_equivalent_object(
     right: npt.NDArray[np.object_],
 ) -> bool: ...
 def has_infs(arr: np.ndarray) -> bool: ...  # const floating[:]
+def has_only_ints_or_nan(arr: np.ndarray) -> bool: ...  # const floating[:]
 def get_reverse_indexer(
     indexer: np.ndarray,  # const intp_t[:]
     length: int,
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -532,6 +532,22 @@ def has_infs(floating[:] arr) -> bool:
     return ret
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def has_only_ints_or_nan(floating[:] arr) -> bool:
+    cdef:
+        floating val
+        intp_t i
+
+    for i in range(len(arr)):
+        val = arr[i]
+        if (val != val) or (val == <int64_t>val):
+            continue
+        else:
+            return False
+    return True
+
+
 def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len):
     cdef:
         Py_ssize_t i, n = len(indices)
diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build
@@ -3,7 +3,6 @@ py.extension_module(
     ['aggregations.pyx'],
     cython_args: ['-X always_allow_keywords=true'],
     include_directories: [inc_np, inc_pd],
-    dependencies: [py_dep],
     subdir: 'pandas/_libs/window',
     override_options : ['cython_language=cpp'],
     install: true
@@ -14,7 +13,6 @@ py.extension_module(
     ['indexers.pyx'],
     cython_args: ['-X always_allow_keywords=true'],
     include_directories: [inc_np, inc_pd],
-    dependencies: [py_dep],
     subdir: 'pandas/_libs/window',
     install: true
 )
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -15,6 +15,7 @@
     missing as libmissing,
 )
 from pandas._libs.arrays import NDArrayBacked
+from pandas._libs.lib import ensure_string_array
 from pandas.compat import pa_version_under7p0
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import doc
@@ -224,7 +225,7 @@ def __from_arrow__(
             arr = np.array([], dtype=object)
         else:
             arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
-            arr = lib.convert_nans_to_NA(arr)
+            arr = ensure_string_array(arr, na_value=libmissing.NA)
         # Bypass validation inside StringArray constructor, see GH#47781
         new_string_array = StringArray.__new__(StringArray)
         NDArrayBacked.__init__(
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -554,3 +554,16 @@ def value_counts(self, dropna: bool = True):
         return Series(
             result._values.to_numpy(), index=result.index, name=result.name, copy=False
         )
+
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        if name in ["any", "all"]:
+            arr = pc.and_kleene(
+                pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "")
+            )
+            return ArrowExtensionArray(arr)._reduce(
+                name, skipna=skipna, keepdims=keepdims, **kwargs
+            )
+        else:
+            return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7279,7 +7279,7 @@ def value_counts(
             subset = self.columns.tolist()
 
         name = "proportion" if normalize else "count"
-        counts = self.groupby(subset, dropna=dropna).grouper.size()
+        counts = self.groupby(subset, dropna=dropna, observed=False).grouper.size()
         counts.name = name
 
         if sort:
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -18,6 +18,7 @@
 from pandas._config import using_copy_on_write
 
 from pandas._libs import (
+    NaT,
     internals as libinternals,
     lib,
     writers,
@@ -59,7 +60,10 @@
 from pandas.core.dtypes.common import (
     ensure_platform_int,
     is_1d_only_ea_dtype,
+    is_float_dtype,
+    is_integer_dtype,
     is_list_like,
+    is_scalar,
     is_string_dtype,
 )
 from pandas.core.dtypes.dtypes import (
@@ -453,6 +457,25 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
         and will receive the same block
         """
         new_dtype = find_result_type(self.values.dtype, other)
+
+        # In a future version of pandas, the default will be that
+        # setting `nan` into an integer series won't raise.
+        if (
+            is_scalar(other)
+            and is_integer_dtype(self.values.dtype)
+            and isna(other)
+            and other is not NaT
+        ):
+            warn_on_upcast = False
+        elif (
+            isinstance(other, np.ndarray)
+            and other.ndim == 1
+            and is_integer_dtype(self.values.dtype)
+            and is_float_dtype(other.dtype)
+            and lib.has_only_ints_or_nan(other)
+        ):
+            warn_on_upcast = False
+
         if warn_on_upcast:
             warnings.warn(
                 f"Setting an item of incompatible dtype is deprecated "
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -53,6 +53,7 @@
     ensure_object,
     is_bool,
     is_bool_dtype,
+    is_extension_array_dtype,
     is_float_dtype,
     is_integer,
     is_integer_dtype,
@@ -1385,6 +1386,21 @@ def _maybe_coerce_merge_keys(self) -> None:
                 if lk.dtype.kind == rk.dtype.kind:
                     continue
 
+                if is_extension_array_dtype(lk.dtype) and not is_extension_array_dtype(
+                    rk.dtype
+                ):
+                    ct = find_common_type([lk.dtype, rk.dtype])
+                    if is_extension_array_dtype(ct):
+                        rk = ct.construct_array_type()._from_sequence(rk)  # type: ignore[union-attr]  # noqa: E501
+                    else:
+                        rk = rk.astype(ct)  # type: ignore[arg-type]
+                elif is_extension_array_dtype(rk.dtype):
+                    ct = find_common_type([lk.dtype, rk.dtype])
+                    if is_extension_array_dtype(ct):
+                        lk = ct.construct_array_type()._from_sequence(lk)  # type: ignore[union-attr]  # noqa: E501
+                    else:
+                        lk = lk.astype(ct)  # type: ignore[arg-type]
+
                 # check whether ints and floats
                 if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                     # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -158,7 +158,11 @@ def test_fillna_no_op_returns_copy(self, data):
 
 class TestReduce(base.BaseReduceTests):
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
-        return op_name in ["min", "max"]
+        return (
+            op_name in ["min", "max"]
+            or ser.dtype.storage == "pyarrow_numpy"  # type: ignore[union-attr]
+            and op_name in ("any", "all")
+        )
 
 
 class TestMethods(base.BaseMethodsTests):
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -337,18 +337,12 @@ def test_setitem(self, float_frame, using_copy_on_write):
     def test_setitem2(self):
         # dtype changing GH4204
         df = DataFrame([[0, 0]])
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype"
-        ):
-            df.iloc[0] = np.nan
+        df.iloc[0] = np.nan
         expected = DataFrame([[np.nan, np.nan]])
         tm.assert_frame_equal(df, expected)
 
         df = DataFrame([[0, 0]])
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype"
-        ):
-            df.loc[0] = np.nan
+        df.loc[0] = np.nan
         tm.assert_frame_equal(df, expected)
 
     def test_setitem_boolean(self, float_frame):
@@ -1579,9 +1573,7 @@ def test_setitem(self, uint64_frame):
         # With NaN: because uint64 has no NaN element,
         # the column should be cast to object.
         df2 = df.copy()
-        with tm.assert_produces_warning(
-            FutureWarning, match="Setting an item of incompatible dtype"
-        ):
+        with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
             df2.iloc[1, 1] = pd.NaT
             df2.iloc[1, 2] = pd.NaT
         result = df2["B"]
@@ -1901,19 +1893,19 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key):
 class TestSetitemValidation:
     # This is adapted from pandas/tests/arrays/masked/test_indexing.py
     # but checks for warnings instead of errors.
-    def _check_setitem_invalid(self, df, invalid, indexer):
+    def _check_setitem_invalid(self, df, invalid, indexer, warn):
         msg = "Setting an item of incompatible dtype is deprecated"
         msg = re.escape(msg)
 
         orig_df = df.copy()
 
         # iloc
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(warn, match=msg):
             df.iloc[indexer, 0] = invalid
             df = orig_df.copy()
 
         # loc
-        with tm.assert_produces_warning(FutureWarning, match=msg):
+        with tm.assert_produces_warning(warn, match=msg):
             df.loc[indexer, "a"] = invalid
             df = orig_df.copy()
 
@@ -1934,16 +1926,20 @@ def _check_setitem_invalid(self, df, invalid, indexer):
     @pytest.mark.parametrize("indexer", _indexers)
     def test_setitem_validation_scalar_bool(self, invalid, indexer):
         df = DataFrame({"a": [True, False, False]}, dtype="bool")
-        self._check_setitem_invalid(df, invalid, indexer)
+        self._check_setitem_invalid(df, invalid, indexer, FutureWarning)
 
     @pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)])
     @pytest.mark.parametrize("indexer", _indexers)
     def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer):
         df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype)
-        self._check_setitem_invalid(df, invalid, indexer)
+        if isna(invalid) and invalid is not pd.NaT:
+            warn = None
+        else:
+            warn = FutureWarning
+        self._check_setitem_invalid(df, invalid, indexer, warn)
 
     @pytest.mark.parametrize("invalid", _invalid_scalars + [True])
     @pytest.mark.parametrize("indexer", _indexers)
     def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer):
         df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype)
-        self._check_setitem_invalid(df, invalid, indexer)
+        self._check_setitem_invalid(df, invalid, indexer, FutureWarning)
diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py
@@ -175,3 +175,17 @@ def test_data_frame_value_counts_subset(nulls_fixture, columns):
     )
 
     tm.assert_series_equal(result, expected)
+
+
+def test_value_counts_categorical_future_warning():
+    # GH#54775
+    df = pd.DataFrame({"a": [1, 2, 3]}, dtype="category")
+    result = df.value_counts()
+    expected = pd.Series(
+        1,
+        index=pd.MultiIndex.from_arrays(
+            [pd.Index([1, 2, 3], name="a", dtype="category")]
+        ),
+        name="count",
+    )
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
@@ -830,8 +830,7 @@ def test_coercion_with_loc(self, expected):
         start_data, expected_result, warn = expected
 
         start_dataframe = DataFrame({"foo": start_data})
-        with tm.assert_produces_warning(warn, match="incompatible dtype"):
-            start_dataframe.loc[0, ["foo"]] = None
+        start_dataframe.loc[0, ["foo"]] = None
 
         expected_dataframe = DataFrame({"foo": expected_result})
         tm.assert_frame_equal(start_dataframe, expected_dataframe)
@@ -841,8 +840,7 @@ def test_coercion_with_setitem_and_dataframe(self, expected):
         start_data, expected_result, warn = expected
 
         start_dataframe = DataFrame({"foo": start_data})
-        with tm.assert_produces_warning(warn, match="incompatible dtype"):
-            start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
+        start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
 
         expected_dataframe = DataFrame({"foo": expected_result})
         tm.assert_frame_equal(start_dataframe, expected_dataframe)
@@ -852,10 +850,7 @@ def test_none_coercion_loc_and_dataframe(self, expected):
         start_data, expected_result, warn = expected
 
         start_dataframe = DataFrame({"foo": start_data})
-        with tm.assert_produces_warning(warn, match="incompatible dtype"):
-            start_dataframe.loc[
-                start_dataframe["foo"] == start_dataframe["foo"][0]
-            ] = None
+        start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
 
         expected_dataframe = DataFrame({"foo": expected_result})
         tm.assert_frame_equal(start_dataframe, expected_dataframe)
@@ -869,10 +864,7 @@ def test_none_coercion_mixed_dtypes(self):
                 "d": ["a", "b", "c"],
             }
         )
-        with tm.assert_produces_warning(
-            FutureWarning, match="item of incompatible dtype"
-        ):
-            start_dataframe.iloc[0] = None
+        start_dataframe.iloc[0] = None
 
         exp = DataFrame(
             {
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py

Original file line number	Diff line number	Diff line change
`@@ -153,7 +153,7 @@ Deprecations`
`153`	`153`
`154`	`154`	`Performance improvements`
`155`	`155`	`~~~~~~~~~~~~~~~~~~~~~~~~`
`156`		`--`
	`156`	+- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
`157`	`157`	`-`
`158`	`158`
`159`	`159`	`.. ---------------------------------------------------------------------------`