pandas-dev
diff --git a/‎asv_bench/benchmarks/algorithms.py
Lines changed: 23 additions & 0 deletions b/‎asv_bench/benchmarks/algorithms.py
Lines changed: 23 additions & 0 deletions
diff --git a/‎doc/source/reference/arrays.rst
Lines changed: 52 additions & 16 deletions b/‎doc/source/reference/arrays.rst
Lines changed: 52 additions & 16 deletions
diff --git a/‎doc/source/reference/general_functions.rst
Lines changed: 1 addition & 0 deletions b/‎doc/source/reference/general_functions.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/source/reference/groupby.rst
Lines changed: 7 additions & 0 deletions b/‎doc/source/reference/groupby.rst
Lines changed: 7 additions & 0 deletions
diff --git a/‎doc/source/reference/options.rst
Lines changed: 7 additions & 0 deletions b/‎doc/source/reference/options.rst
Lines changed: 7 additions & 0 deletions
diff --git a/‎doc/source/whatsnew/v1.6.0.rst
Lines changed: 5 additions & 2 deletions b/‎doc/source/whatsnew/v1.6.0.rst
Lines changed: 5 additions & 2 deletions
diff --git a/‎pandas/_libs/hashtable.pyi
Lines changed: 1 addition & 0 deletions b/‎pandas/_libs/hashtable.pyi
Lines changed: 1 addition & 0 deletions
diff --git a/‎pandas/_libs/hashtable_func_helper.pxi.in
Lines changed: 60 additions & 31 deletions b/‎pandas/_libs/hashtable_func_helper.pxi.in
Lines changed: 60 additions & 31 deletions
diff --git a/‎pandas/_libs/lib.pyi
Lines changed: 1 addition & 1 deletion b/‎pandas/_libs/lib.pyi
Lines changed: 1 addition & 1 deletion
@@ -95,6 +95,29 @@ def time_duplicated(self, unique, keep, dtype):
         self.idx.duplicated(keep=keep)
 
 
+class DuplicatedMaskedArray:
+    # asv benchmark: times Series.duplicated on "Int64"/"Float64" Series that contain pd.NA entries.
+    params = [
+        [True, False],  # unique: when False, setup() calls data.repeat(5) so values recur
+        ["first", "last", False],  # keep: forwarded to Series.duplicated(keep=...)
+        ["Int64", "Float64"],  # dtype: dtype string handed to pd.Series
+    ]
+    param_names = ["unique", "keep", "dtype"]
+
+    def setup(self, unique, keep, dtype):  # builds self.ser; ``keep`` is not used here
+        N = 10**5
+        data = pd.Series(np.arange(N), dtype=dtype)
+        data[list(range(1, N, 100))] = pd.NA  # NA at entries 1, 101, 201, ...
+        if not unique:
+            data = data.repeat(5)
+        self.ser = data
+        # cache is_unique (accessed in setup, i.e. outside the timed method)
+        self.ser.is_unique
+
+    def time_duplicated(self, unique, keep, dtype):  # only ``keep`` is read here
+        self.ser.duplicated(keep=keep)
+
+
 class Hashing:
     def setup_cache(self):
         N = 10**5
 
@@ -19,20 +19,21 @@ objects contained with a :class:`Index`, :class:`Series`, or
 For some data types, pandas extends NumPy's type system. String aliases for these types
 can be found at :ref:`basics.dtypes`.
 
-=================== ========================= ============================= =============================
-Kind of Data        pandas Data Type          Scalar                        Array
-=================== ========================= ============================= =============================
-TZ-aware datetime   :class:`DatetimeTZDtype`  :class:`Timestamp`            :ref:`api.arrays.datetime`
-Timedeltas          (none)                    :class:`Timedelta`            :ref:`api.arrays.timedelta`
-Period (time spans) :class:`PeriodDtype`      :class:`Period`               :ref:`api.arrays.period`
-Intervals           :class:`IntervalDtype`    :class:`Interval`             :ref:`api.arrays.interval`
-Nullable Integer    :class:`Int64Dtype`, ...  (none)                        :ref:`api.arrays.integer_na`
-Categorical         :class:`CategoricalDtype` (none)                        :ref:`api.arrays.categorical`
-Sparse              :class:`SparseDtype`      (none)                        :ref:`api.arrays.sparse`
-Strings             :class:`StringDtype`      :class:`str`                  :ref:`api.arrays.string`
-Boolean (with NA)   :class:`BooleanDtype`     :class:`bool`                 :ref:`api.arrays.bool`
-PyArrow             :class:`ArrowDtype`       Python Scalars or :class:`NA` :ref:`api.arrays.arrow`
-=================== ========================= ============================= =============================
+=================== ========================== ============================= =============================
+Kind of Data        pandas Data Type           Scalar                        Array
+=================== ========================== ============================= =============================
+TZ-aware datetime   :class:`DatetimeTZDtype`   :class:`Timestamp`            :ref:`api.arrays.datetime`
+Timedeltas          (none)                     :class:`Timedelta`            :ref:`api.arrays.timedelta`
+Period (time spans) :class:`PeriodDtype`       :class:`Period`               :ref:`api.arrays.period`
+Intervals           :class:`IntervalDtype`     :class:`Interval`             :ref:`api.arrays.interval`
+Nullable Integer    :class:`Int64Dtype`, ...   (none)                        :ref:`api.arrays.integer_na`
+Nullable Float      :class:`Float64Dtype`, ... (none)                        :ref:`api.arrays.float_na`
+Categorical         :class:`CategoricalDtype`  (none)                        :ref:`api.arrays.categorical`
+Sparse              :class:`SparseDtype`       (none)                        :ref:`api.arrays.sparse`
+Strings             :class:`StringDtype`       :class:`str`                  :ref:`api.arrays.string`
+Nullable Boolean    :class:`BooleanDtype`      :class:`bool`                 :ref:`api.arrays.bool`
+PyArrow             :class:`ArrowDtype`        Python Scalars or :class:`NA` :ref:`api.arrays.arrow`
+=================== ========================== ============================= =============================
 
 pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`).
 The top-level :meth:`array` method can be used to create a new array, which may be
@@ -91,13 +92,20 @@ with the :class:`arrays.DatetimeArray` extension array, which can hold timezone-
 or timezone-aware values.
 
 :class:`Timestamp`, a subclass of :class:`datetime.datetime`, is pandas'
-scalar type for timezone-naive or timezone-aware datetime data.
+scalar type for timezone-naive or timezone-aware datetime data. :class:`NaT`
+is the missing value for datetime data.
 
 .. autosummary::
    :toctree: api/
 
    Timestamp
 
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   NaT
+
 Properties
 ~~~~~~~~~~
 .. autosummary::
@@ -208,13 +216,20 @@ Timedeltas
 ----------
 
 NumPy can natively represent timedeltas. pandas provides :class:`Timedelta`
-for symmetry with :class:`Timestamp`.
+for symmetry with :class:`Timestamp`. :class:`NaT`
+is the missing value for timedelta data.
 
 .. autosummary::
    :toctree: api/
 
    Timedelta
 
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   NaT
+
 Properties
 ~~~~~~~~~~
 .. autosummary::
@@ -419,6 +434,26 @@ pandas provides this through :class:`arrays.IntegerArray`.
    UInt16Dtype
    UInt32Dtype
    UInt64Dtype
+   NA
+
+.. _api.arrays.float_na:
+
+Nullable float
+--------------
+
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   arrays.FloatingArray
+
+.. autosummary::
+   :toctree: api/
+   :template: autosummary/class_without_autosummary.rst
+
+   Float32Dtype
+   Float64Dtype
+   NA
 
 .. _api.arrays.categorical:
 
@@ -555,6 +590,7 @@ with a bool :class:`numpy.ndarray`.
    :template: autosummary/class_without_autosummary.rst
 
    BooleanDtype
+   NA
 
 
 .. Dtype attributes which are manually listed in their docstrings: including
 
@@ -26,6 +26,7 @@ Data manipulations
    from_dummies
    factorize
    unique
+   lreshape
    wide_to_long
 
 Top-level missing data
 
@@ -27,6 +27,13 @@ Indexing, iteration
 
    Grouper
 
+Function application helper
+---------------------------
+.. autosummary::
+   :toctree: api/
+
+   NamedAgg
+
 .. currentmodule:: pandas.core.groupby
 
 Function application
 
@@ -19,3 +19,10 @@ Working with options
    get_option
    set_option
    option_context
+
+Numeric formatting
+------------------
+.. autosummary::
+   :toctree: api/
+
+   set_eng_float_format
@@ -31,6 +31,7 @@ Other enhancements
 - :meth:`.GroupBy.quantile` now preserves nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
 - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
+- Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
 - :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`)
 
 .. ---------------------------------------------------------------------------
@@ -104,10 +105,12 @@ Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
 - Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
+- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`)
 - Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
 - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
 - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
+- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`)
 - Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`)
 -
 
@@ -124,7 +127,7 @@ Categorical
 
 Datetimelike
 ^^^^^^^^^^^^
--
+- Bug in :func:`pandas.infer_freq` raising ``TypeError`` when called on a :class:`RangeIndex` (:issue:`47084`)
 -
 
 Timedelta
@@ -171,7 +174,7 @@ Missing
 MultiIndex
 ^^^^^^^^^^
 - Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`)
-- Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`)
+- Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`, :issue:`48505`)
 - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`)
 -
 
 
@@ -183,6 +183,7 @@ class IntpHashTable(HashTable): ...
 def duplicated(
     values: np.ndarray,
     keep: Literal["last", "first", False] = ...,
+    mask: npt.NDArray[np.bool_] | None = ...,  # optional mask; positions where it is set are handled as NA rather than hashed by value
 ) -> npt.NDArray[np.bool_]: ...
 def mode(
     values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ...
 
@@ -118,9 +118,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
 @cython.wraparound(False)
 @cython.boundscheck(False)
 {{if dtype == 'object'}}
-cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
+cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
 {{else}}
-cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
+cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
 {{endif}}
     cdef:
         int ret = 0
@@ -129,10 +129,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
         {{else}}
         PyObject* value
         {{endif}}
-        Py_ssize_t i, n = len(values)
+        Py_ssize_t i, n = len(values), first_na = -1
         khiter_t k
         kh_{{ttype}}_t *table = kh_init_{{ttype}}()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+        bint seen_na = False, uses_mask = mask is not None
+        bint seen_multiple_na = False
 
     kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
 
@@ -147,9 +149,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
         {{endif}}
             for i in range(n - 1, -1, -1):
                 # equivalent: range(n)[::-1], which cython doesn't like in nogil
-                value = {{to_c_type}}(values[i])
-                kh_put_{{ttype}}(table, value, &ret)
-                out[i] = ret == 0
+                if uses_mask and mask[i]:
+                    if seen_na:
+                        out[i] = True
+                    else:
+                        out[i] = False
+                        seen_na = True
+                else:
+                    value = {{to_c_type}}(values[i])
+                    kh_put_{{ttype}}(table, value, &ret)
+                    out[i] = ret == 0
 
     elif keep == 'first':
         {{if dtype == 'object'}}
@@ -158,9 +167,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
         with nogil:
         {{endif}}
             for i in range(n):
-                value = {{to_c_type}}(values[i])
-                kh_put_{{ttype}}(table, value, &ret)
-                out[i] = ret == 0
+                if uses_mask and mask[i]:
+                    if seen_na:
+                        out[i] = True
+                    else:
+                        out[i] = False
+                        seen_na = True
+                else:
+                    value = {{to_c_type}}(values[i])
+                    kh_put_{{ttype}}(table, value, &ret)
+                    out[i] = ret == 0
 
     else:
         {{if dtype == 'object'}}
@@ -169,15 +185,28 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
         with nogil:
         {{endif}}
             for i in range(n):
-                value = {{to_c_type}}(values[i])
-                k = kh_get_{{ttype}}(table, value)
-                if k != table.n_buckets:
-                    out[table.vals[k]] = 1
-                    out[i] = 1
+                if uses_mask and mask[i]:
+                    if not seen_na:
+                        first_na = i
+                        seen_na = True
+                        out[i] = 0
+                    elif not seen_multiple_na:
+                        out[i] = 1
+                        out[first_na] = 1
+                        seen_multiple_na = True
+                    else:
+                        out[i] = 1
+
                 else:
-                    k = kh_put_{{ttype}}(table, value, &ret)
-                    table.vals[k] = i
-                    out[i] = 0
+                    value = {{to_c_type}}(values[i])
+                    k = kh_get_{{ttype}}(table, value)
+                    if k != table.n_buckets:
+                        out[table.vals[k]] = 1
+                        out[i] = 1
+                    else:
+                        k = kh_put_{{ttype}}(table, value, &ret)
+                        table.vals[k] = i
+                        out[i] = 0
 
     kh_destroy_{{ttype}}(table)
     return out
@@ -301,37 +330,37 @@ cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=N
         raise TypeError(values.dtype)
 
 
-cpdef duplicated(ndarray[htfunc_t] values, object keep="first"):
+cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None):  # mask=None means no mask is applied
     if htfunc_t is object:
-        return duplicated_object(values, keep)
+        return duplicated_object(values, keep, mask=mask)  # dispatch on htfunc_t; every branch forwards keep and mask unchanged
 
     elif htfunc_t is int8_t:
-        return duplicated_int8(values, keep)
+        return duplicated_int8(values, keep, mask=mask)
     elif htfunc_t is int16_t:
-        return duplicated_int16(values, keep)
+        return duplicated_int16(values, keep, mask=mask)
     elif htfunc_t is int32_t:
-        return duplicated_int32(values, keep)
+        return duplicated_int32(values, keep, mask=mask)
     elif htfunc_t is int64_t:
-        return duplicated_int64(values, keep)
+        return duplicated_int64(values, keep, mask=mask)
 
     elif htfunc_t is uint8_t:
-        return duplicated_uint8(values, keep)
+        return duplicated_uint8(values, keep, mask=mask)
     elif htfunc_t is uint16_t:
-        return duplicated_uint16(values, keep)
+        return duplicated_uint16(values, keep, mask=mask)
     elif htfunc_t is uint32_t:
-        return duplicated_uint32(values, keep)
+        return duplicated_uint32(values, keep, mask=mask)
     elif htfunc_t is uint64_t:
-        return duplicated_uint64(values, keep)
+        return duplicated_uint64(values, keep, mask=mask)
 
     elif htfunc_t is float64_t:
-        return duplicated_float64(values, keep)
+        return duplicated_float64(values, keep, mask=mask)
     elif htfunc_t is float32_t:
-        return duplicated_float32(values, keep)
+        return duplicated_float32(values, keep, mask=mask)
 
     elif htfunc_t is complex128_t:
-        return duplicated_complex128(values, keep)
+        return duplicated_complex128(values, keep, mask=mask)
     elif htfunc_t is complex64_t:
-        return duplicated_complex64(values, keep)
+        return duplicated_complex64(values, keep, mask=mask)
 
     else:
         raise TypeError(values.dtype)
 
@@ -59,7 +59,7 @@ def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
 def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ...
 def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
 def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ...
-def fast_unique_multiple(arrays: list, sort: bool = ...) -> list: ...
+def fast_unique_multiple(left: np.ndarray, right: np.ndarray) -> list: ...
 def map_infer(
     arr: np.ndarray,
     f: Callable[[Any], Any],