pandas-dev
diff --git a/‎.github/workflows/wheels.yml
Lines changed: 4 additions & 2 deletions b/‎.github/workflows/wheels.yml
Lines changed: 4 additions & 2 deletions
diff --git a/‎asv_bench/benchmarks/indexing.py
Lines changed: 32 additions & 0 deletions b/‎asv_bench/benchmarks/indexing.py
Lines changed: 32 additions & 0 deletions
diff --git a/‎asv_bench/benchmarks/indexing_engines.py
Lines changed: 81 additions & 1 deletion b/‎asv_bench/benchmarks/indexing_engines.py
Lines changed: 81 additions & 1 deletion
diff --git a/‎doc/source/development/contributing_docstring.rst
Lines changed: 0 additions & 2 deletions b/‎doc/source/development/contributing_docstring.rst
Lines changed: 0 additions & 2 deletions
diff --git a/‎doc/source/development/maintaining.rst
Lines changed: 2 additions & 1 deletion b/‎doc/source/development/maintaining.rst
Lines changed: 2 additions & 1 deletion
diff --git a/‎doc/source/whatsnew/v1.1.4.rst
Lines changed: 1 addition & 1 deletion b/‎doc/source/whatsnew/v1.1.4.rst
Lines changed: 1 addition & 1 deletion
diff --git a/‎doc/source/whatsnew/v2.0.0.rst
Lines changed: 8 additions & 1 deletion b/‎doc/source/whatsnew/v2.0.0.rst
Lines changed: 8 additions & 1 deletion
diff --git a/‎pandas/_libs/hashtable.pyi
Lines changed: 2 additions & 0 deletions b/‎pandas/_libs/hashtable.pyi
Lines changed: 2 additions & 0 deletions
diff --git a/‎pandas/_libs/hashtable_class_helper.pxi.in
Lines changed: 8 additions & 4 deletions b/‎pandas/_libs/hashtable_class_helper.pxi.in
Lines changed: 8 additions & 4 deletions
diff --git a/‎pandas/_libs/index.pyi
Lines changed: 20 additions & 4 deletions b/‎pandas/_libs/index.pyi
Lines changed: 20 additions & 4 deletions
@@ -86,7 +86,8 @@ jobs:
           activate-environment: test
           channels: conda-forge, anaconda
           channel-priority: true
-          mamba-version: "*"
+          # mamba fails to solve, also we really don't need this since we're just installing python
+          # mamba-version: "*"
 
       - name: Test wheels (Windows 64-bit only)
         if: ${{ matrix.buildplat[1] == 'win_amd64' }}
@@ -154,7 +155,8 @@ jobs:
           python-version: '3.8'
           channels: conda-forge
           channel-priority: true
-          mamba-version: "*"
+          # mamba fails to solve, also we really don't need this since we're just installing python
+          # mamba-version: "*"
 
       - name: Build sdist
         run: |
 
@@ -8,6 +8,7 @@
 import numpy as np
 
 from pandas import (
+    NA,
     CategoricalIndex,
     DataFrame,
     Index,
@@ -83,6 +84,37 @@ def time_loc_slice(self, index, index_structure):
         self.data.loc[:800000]
 
 
+class NumericMaskedIndexing:
+    """
+    Benchmark ``get_indexer``/``get_indexer_for`` on masked numeric indexes.
+
+    Parametrized over the index dtype ("Int64", "UInt64", "Float64") and over
+    whether the index is monotonic. The non-monotonic variant additionally
+    ends with an ``NA`` entry.
+    """
+
+    monotonic_list = list(range(10**6))
+    non_monotonic_list = (
+        list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
+    )
+
+    params = [
+        ("Int64", "UInt64", "Float64"),
+        (True, False),
+    ]
+    param_names = ["dtype", "monotonic"]
+
+    def setup(self, dtype, monotonic):
+        """Build the index under test, its duplicated variant and the keys."""
+        # Only construct the index this parameter combination needs.
+        if monotonic:
+            self.data = Index(self.monotonic_list, dtype=dtype)
+        else:
+            self.data = Index(self.non_monotonic_list, dtype=dtype).append(
+                Index([NA], dtype=dtype)
+            )
+        self.indexer = np.arange(300, 1_000)
+        # Appending the index to itself makes every label occur twice.
+        self.data_dups = self.data.append(self.data)
+
+    def time_get_indexer(self, dtype, monotonic):
+        self.data.get_indexer(self.indexer)
+
+    def time_get_indexer_dups(self, dtype, monotonic):
+        # Run against the duplicated index; timing ``self.data`` here would
+        # repeat the unique-index lookup and leave ``data_dups`` unused.
+        self.data_dups.get_indexer_for(self.indexer)
+
+
 class NonNumericSeriesIndexing:
 
     params = [
 
@@ -1,5 +1,8 @@
 """
-Benchmarks in this file depend exclusively on code in _libs/
+Benchmarks in this file depend mostly on code in _libs/
+
+We have to create masked arrays to test the masked engine though. The
+array is unpacked on the Cython level.
 
 If a PR does not edit anything in _libs, it is very unlikely that benchmarks
 in this file will be affected.
@@ -9,6 +12,8 @@
 
 from pandas._libs import index as libindex
 
+from pandas.core.arrays import BaseMaskedArray
+
 
 def _get_numeric_engines():
     engine_names = [
@@ -30,6 +35,26 @@ def _get_numeric_engines():
     ]
 
 
+def _get_masked_engines():
+    """
+    Return ``(engine_class, dtype_name)`` pairs for the masked engines.
+
+    Names that ``libindex`` does not define are skipped rather than raising,
+    so a misspelled engine name silently drops that engine from the
+    benchmarks.
+    """
+    engine_names = [
+        ("MaskedInt64Engine", "Int64"),
+        ("MaskedInt32Engine", "Int32"),
+        ("MaskedInt16Engine", "Int16"),
+        ("MaskedInt8Engine", "Int8"),
+        ("MaskedUInt64Engine", "UInt64"),
+        ("MaskedUInt32Engine", "UInt32"),
+        ("MaskedUInt16Engine", "UInt16"),
+        ("MaskedUInt8Engine", "UInt8"),
+        ("MaskedFloat64Engine", "Float64"),
+        ("MaskedFloat32Engine", "Float32"),
+    ]
+    return [
+        (getattr(libindex, engine_name), dtype)
+        for engine_name, dtype in engine_names
+        if hasattr(libindex, engine_name)
+    ]
+
+
 class NumericEngineIndexing:
 
     params = [
@@ -80,6 +105,61 @@ def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
         self.data.get_loc(self.key_middle)
 
 
+class MaskedNumericEngineIndexing:
+    """
+    Benchmark ``get_loc`` on the masked index engines.
+
+    Each case builds an engine over ``3 * N`` values in the requested layout
+    (monotonic increasing, monotonic decreasing or non-monotonic), with or
+    without duplicates. Only the non-monotonic layout masks its last element.
+    """
+
+    params = [
+        _get_masked_engines(),
+        ["monotonic_incr", "monotonic_decr", "non_monotonic"],
+        [True, False],
+        [10**5, 2 * 10**6],  # 2e6 is above SIZE_CUTOFF
+    ]
+    param_names = ["engine_and_dtype", "index_type", "unique", "N"]
+
+    def setup(self, engine_and_dtype, index_type, unique, N):
+        engine, dtype = engine_and_dtype
+        # The lower-cased masked dtype name ("Int64" -> "int64") is the dtype
+        # string handed to numpy for the underlying values.
+        np_dtype = dtype.lower()
+        size = N * 3
+        mask = np.zeros(size, dtype=np.bool_)
+
+        if index_type in ("monotonic_incr", "monotonic_decr"):
+            if unique:
+                arr = np.arange(size, dtype=np_dtype)
+            else:
+                arr = np.array([1] * N + [2] * N + [3] * N, dtype=np_dtype)
+            if index_type == "monotonic_decr":
+                # The decreasing layout is the increasing one reversed.
+                arr = arr[::-1]
+        else:
+            assert index_type == "non_monotonic"
+            if unique:
+                head = np.arange(N * 2, size, dtype=np_dtype)
+                tail = np.arange(N * 2, dtype=np_dtype)
+                arr = np.concatenate([head, tail])
+            else:
+                arr = np.array([1, 2, 3] * N, dtype=np_dtype)
+            mask[-1] = True
+
+        self.data = engine(BaseMaskedArray(arr, mask))
+        # The call below avoids populating the mapping etc. while timing.
+        self.data.get_loc(2)
+
+        self.key_middle = arr[len(arr) // 2]
+        self.key_early = arr[2]
+
+    def time_get_loc(self, engine_and_dtype, index_type, unique, N):
+        self.data.get_loc(self.key_early)
+
+    def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
+        # searchsorted performance may be different near the middle of a range
+        #  vs near an endpoint
+        self.data.get_loc(self.key_middle)
+
+
 class ObjectEngineIndexing:
 
     params = [("monotonic_incr", "monotonic_decr", "non_monotonic")]
 
@@ -67,8 +67,6 @@ case of pandas, the NumPy docstring convention is followed. These conventions ar
 explained in this document:
 
 * `numpydoc docstring guide <https://numpydoc.readthedocs.io/en/latest/format.html>`_
-  (which is based in the original `Guide to NumPy/SciPy documentation
-  <https://github.com/numpy/numpy/blob/main/doc/HOWTO_DOCUMENT.rst.txt>`_)
 
 numpydoc is a Sphinx extension to support the NumPy docstring convention.
 
 
@@ -465,7 +465,8 @@ which will be triggered when the tag is pushed.
 
 7. Download all wheels from the Anaconda repository where MacPython uploads them:
    https://anaconda.org/multibuild-wheels-staging/pandas/files?version=<version>
-   to the ``dist/`` directory in the local pandas copy.
+   to the ``dist/`` directory in the local pandas copy. You can use the script
+   ``scripts/download_wheels.sh`` to download all wheels at once.
 
 8. Upload wheels to PyPI:
 
 
@@ -31,7 +31,7 @@ Fixed regressions
 - Fixed regression in setitem with :meth:`DataFrame.iloc` which raised error when trying to set a value while filtering with a boolean list (:issue:`36741`)
 - Fixed regression in setitem with a Series getting aligned before setting the values (:issue:`37427`)
 - Fixed regression in :attr:`MultiIndex.is_monotonic_increasing` returning wrong results with ``NaN`` in at least one of the levels (:issue:`37220`)
-- Fixed regression in inplace arithmetic operation on a Series not updating the parent DataFrame (:issue:`36373`)
+- Fixed regression in inplace arithmetic operation (``+=``) on a Series not updating the parent DataFrame/Series (:issue:`36373`)
 
 .. ---------------------------------------------------------------------------
 
 
@@ -630,13 +630,16 @@ Other API changes
 Deprecations
 ~~~~~~~~~~~~
 - Deprecated argument ``infer_datetime_format`` in :func:`to_datetime` and :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
+- Deprecated behavior of :func:`to_datetime` with ``unit`` when parsing strings, in a future version these will be parsed as datetimes (matching unit-less behavior) instead of cast to floats. To retain the old behavior, cast strings to numeric types before calling :func:`to_datetime` (:issue:`50735`)
 - Deprecated :func:`pandas.io.sql.execute` (:issue:`50185`)
 - :meth:`Index.is_boolean` has been deprecated. Use :func:`pandas.api.types.is_bool_dtype` instead (:issue:`50042`)
 - :meth:`Index.is_integer` has been deprecated. Use :func:`pandas.api.types.is_integer_dtype` instead (:issue:`50042`)
 - :meth:`Index.is_floating` has been deprecated. Use :func:`pandas.api.types.is_float_dtype` instead (:issue:`50042`)
 - :meth:`Index.holds_integer` has been deprecated. Use :func:`pandas.api.types.infer_dtype` instead (:issue:`50243`)
 - :meth:`Index.is_categorical` has been deprecated. Use :func:`pandas.api.types.is_categorical_dtype` instead (:issue:`50042`)
+- :meth:`Index.is_object` has been deprecated. Use :func:`pandas.api.types.is_object_dtype` instead (:issue:`50042`)
 - :meth:`Index.is_interval` has been deprecated. Use :func:`pandas.api.types.is_interval_dtype` instead (:issue:`50042`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.prior_deprecations:
@@ -904,6 +907,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
 - Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
+- Performance improvement for indexing operations with nullable dtypes (:issue:`49420`)
 - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
 - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
 - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
@@ -1021,7 +1025,7 @@ Conversion
 
 Strings
 ^^^^^^^
-- Bug in :func:`pandas.api.dtypes.is_string_dtype` that would not return ``True`` for :class:`StringDtype` (:issue:`15585`)
+- Bug in :func:`pandas.api.types.is_string_dtype` that would not return ``True`` for :class:`StringDtype` or :class:`ArrowDtype` with ``pyarrow.string()`` (:issue:`15585`)
 - Bug in converting string dtypes to "datetime64[ns]" or "timedelta64[ns]" incorrectly raising ``TypeError`` (:issue:`36153`)
 -
 
@@ -1100,6 +1104,8 @@ Period
 - Bug in :meth:`Period.strftime` and :meth:`PeriodIndex.strftime`, raising ``UnicodeDecodeError`` when a locale-specific directive was passed (:issue:`46319`)
 - Bug in adding a :class:`Period` object to an array of :class:`DateOffset` objects incorrectly raising ``TypeError`` (:issue:`50162`)
 - Bug in :class:`Period` where passing a string with finer resolution than nanosecond would result in a ``KeyError`` instead of dropping the extra precision (:issue:`50417`)
+- Bug in parsing strings representing Week-periods e.g. "2017-01-23/2017-01-29" as minute-frequency instead of week-frequency (:issue:`50803`)
+-
 
 Plotting
 ^^^^^^^^
@@ -1129,6 +1135,7 @@ Groupby/resample/rolling
 - Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`)
 - Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
 - Bug in :meth:`.DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list when resampling on time index (:issue:`50840`)
+- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"ngroup"`` argument (:issue:`45986`)
 -
 
 Reshaping
 
@@ -165,10 +165,12 @@ class HashTable:
     def map_locations(
         self,
         values: np.ndarray,  # np.ndarray[subclass-specific]
+        mask: npt.NDArray[np.bool_] | None = ...,  # optional boolean mask
     ) -> None: ...
     def lookup(
         self,
         values: np.ndarray,  # np.ndarray[subclass-specific]
+        mask: npt.NDArray[np.bool_] | None = ...,  # optional boolean mask
     ) -> npt.NDArray[np.intp]: ...
     def get_labels(
         self,
 
@@ -1005,8 +1005,9 @@ cdef class StringHashTable(HashTable):
         return labels
 
     @cython.boundscheck(False)
-    def lookup(self, ndarray[object] values) -> ndarray:
+    def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
         # -> np.ndarray[np.intp]
+        # mask not yet implemented
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
@@ -1041,7 +1042,8 @@ cdef class StringHashTable(HashTable):
         return np.asarray(locs)
 
     @cython.boundscheck(False)
-    def map_locations(self, ndarray[object] values) -> None:
+    def map_locations(self, ndarray[object] values, object mask = None) -> None:
+        # mask not yet implemented
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
@@ -1314,7 +1316,8 @@ cdef class PyObjectHashTable(HashTable):
         else:
             raise KeyError(key)
 
-    def map_locations(self, ndarray[object] values) -> None:
+    def map_locations(self, ndarray[object] values, object mask = None) -> None:
+        # mask not yet implemented
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
@@ -1328,8 +1331,9 @@ cdef class PyObjectHashTable(HashTable):
             k = kh_put_pymap(self.table, <PyObject*>val, &ret)
             self.table.vals[k] = i
 
-    def lookup(self, ndarray[object] values) -> ndarray:
+    def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
         # -> np.ndarray[np.intp]
+        # mask not yet implemented
         cdef:
             Py_ssize_t i, n = len(values)
             int ret = 0
 
@@ -29,6 +29,12 @@ class IndexEngine:
         targets: np.ndarray,
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
 
+class MaskedIndexEngine(IndexEngine):  # base of the Masked*Engine stubs
+    def __init__(self, values: object) -> None: ...
+    def get_indexer_non_unique(
+        self, targets: object  # any object is accepted per this stub
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+
 class Float64Engine(IndexEngine): ...
 class Float32Engine(IndexEngine): ...
 class Complex128Engine(IndexEngine): ...
@@ -46,6 +52,19 @@ class DatetimeEngine(Int64Engine): ...
 class TimedeltaEngine(DatetimeEngine): ...
 class PeriodEngine(Int64Engine): ...
 class BoolEngine(UInt8Engine): ...
+class MaskedBoolEngine(MaskedUInt8Engine): ...  # base is declared last below
+class MaskedFloat64Engine(MaskedIndexEngine): ...
+class MaskedFloat32Engine(MaskedIndexEngine): ...
+class MaskedComplex128Engine(MaskedIndexEngine): ...
+class MaskedComplex64Engine(MaskedIndexEngine): ...
+class MaskedInt64Engine(MaskedIndexEngine): ...
+class MaskedInt32Engine(MaskedIndexEngine): ...
+class MaskedInt16Engine(MaskedIndexEngine): ...
+class MaskedInt8Engine(MaskedIndexEngine): ...
+class MaskedUInt64Engine(MaskedIndexEngine): ...
+class MaskedUInt32Engine(MaskedIndexEngine): ...
+class MaskedUInt16Engine(MaskedIndexEngine): ...
+class MaskedUInt8Engine(MaskedIndexEngine): ...
 
 class BaseMultiIndexCodesEngine:
     levels: list[np.ndarray]
@@ -57,10 +76,7 @@ class BaseMultiIndexCodesEngine:
         labels: list[np.ndarray],  # all entries integer-dtyped
         offsets: np.ndarray,  # np.ndarray[np.uint64, ndim=1]
     ) -> None: ...
-    def get_indexer(
-        self,
-        target: npt.NDArray[np.object_],
-    ) -> npt.NDArray[np.intp]: ...
+    def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
     def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
     def get_indexer_with_fill(
         self,