pandas-dev
diff --git a/‎.pre-commit-config.yaml
Lines changed: 12 additions & 15 deletions b/‎.pre-commit-config.yaml
Lines changed: 12 additions & 15 deletions
diff --git a/‎ci/code_checks.sh
Lines changed: 0 additions & 3 deletions b/‎ci/code_checks.sh
Lines changed: 0 additions & 3 deletions
diff --git a/‎doc/source/development/contributing.rst
Lines changed: 20 additions & 0 deletions b/‎doc/source/development/contributing.rst
Lines changed: 20 additions & 0 deletions
diff --git a/‎doc/source/development/contributing_codebase.rst
Lines changed: 6 additions & 0 deletions b/‎doc/source/development/contributing_codebase.rst
Lines changed: 6 additions & 0 deletions
diff --git a/‎doc/source/development/contributing_environment.rst
Lines changed: 2 additions & 0 deletions b/‎doc/source/development/contributing_environment.rst
Lines changed: 2 additions & 0 deletions
diff --git a/‎doc/source/user_guide/io.rst
Lines changed: 3 additions & 3 deletions b/‎doc/source/user_guide/io.rst
Lines changed: 3 additions & 3 deletions
diff --git a/‎doc/source/whatsnew/v2.0.0.rst
Lines changed: 3 additions & 2 deletions b/‎doc/source/whatsnew/v2.0.0.rst
Lines changed: 3 additions & 2 deletions
diff --git a/‎pandas/_config/config.py
Lines changed: 1 addition & 0 deletions b/‎pandas/_config/config.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎pandas/core/arrays/arrow/array.py
Lines changed: 20 additions & 5 deletions b/‎pandas/core/arrays/arrow/array.py
Lines changed: 20 additions & 5 deletions
diff --git a/‎pandas/core/arrays/arrow/dtype.py
Lines changed: 44 additions & 2 deletions b/‎pandas/core/arrays/arrow/dtype.py
Lines changed: 44 additions & 2 deletions
diff --git a/‎pandas/core/frame.py
Lines changed: 6 additions & 0 deletions b/‎pandas/core/frame.py
Lines changed: 6 additions & 0 deletions
diff --git a/‎pandas/core/generic.py
Lines changed: 1 addition & 1 deletion b/‎pandas/core/generic.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎pandas/core/groupby/generic.py
Lines changed: 8 additions & 8 deletions b/‎pandas/core/groupby/generic.py
Lines changed: 8 additions & 8 deletions
diff --git a/‎pandas/core/indexes/base.py
Lines changed: 11 additions & 2 deletions b/‎pandas/core/indexes/base.py
Lines changed: 11 additions & 2 deletions
@@ -15,15 +15,22 @@ default_stages: [
 ci:
     autofix_prs: false
 repos:
+-   repo: local
+    hooks:
+    # NOTE: we make `black` a local hook because if it's installed from
+    # PyPI (rather than from source) then it'll run twice as fast thanks to mypyc
+    -   id: black
+        name: black
+        description: "Black: The uncompromising Python code formatter"
+        entry: black
+        language: python
+        require_serial: true
+        types_or: [python, pyi]
+        additional_dependencies: [black==23.1.0]
 -   repo: https://github.com/charliermarsh/ruff-pre-commit
     rev: v0.0.244
     hooks:
     -   id: ruff
--   repo: https://github.com/MarcoGorelli/absolufy-imports
-    rev: v0.3.1
-    hooks:
-    -   id: absolufy-imports
-        files: ^pandas/
 -   repo: https://github.com/jendrikseipp/vulture
     rev: 'v2.7'
     hooks:
@@ -116,16 +123,6 @@ repos:
     - id: sphinx-lint
 -   repo: local
     hooks:
-    # NOTE: we make `black` a local hook because if it's installed from
-    # PyPI (rather than from source) then it'll run twice as fast thanks to mypyc
-    -   id: black
-        name: black
-        description: "Black: The uncompromising Python code formatter"
-        entry: black
-        language: python
-        require_serial: true
-        types_or: [python, pyi]
-        additional_dependencies: [black==23.1.0]
     -   id: pyright
         # note: assumes python env is setup and activated
         name: pyright
 
@@ -591,14 +591,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         pandas.api.types.is_timedelta64_ns_dtype \
         pandas.api.types.is_unsigned_integer_dtype \
         pandas.core.groupby.DataFrameGroupBy.take \
-        pandas.core.groupby.SeriesGroupBy.take \
         pandas.io.formats.style.Styler.concat \
         pandas.io.formats.style.Styler.export \
         pandas.io.formats.style.Styler.set_td_classes \
         pandas.io.formats.style.Styler.use \
         pandas.io.json.build_table_schema \
-        pandas.merge_ordered \
-        pandas.option_context \
         pandas.plotting.andrews_curves \
         pandas.plotting.autocorrelation_plot \
         pandas.plotting.lag_plot \
 
@@ -331,6 +331,26 @@ To automatically fix formatting errors on each commit you make, you can
 set up pre-commit yourself. First, create a Python :ref:`environment
 <contributing_environment>` and then set up :ref:`pre-commit <contributing.pre-commit>`.
 
+.. _contributing.update-dev:
+
+Updating the development environment
+------------------------------------
+
+After updating your branch to merge in main from upstream, you may need to update
+your development environment to reflect any changes to the various packages that
+are used during development.
+
+If using :ref:`mamba <contributing.mamba>`, do::
+
+    mamba deactivate
+    mamba env update -f environment.yml
+    mamba activate pandas-dev
+
+If using :ref:`pip <contributing.pip>`, do::
+
+    # activate the virtual environment based on your platform
+    python -m pip install --upgrade -r requirements-dev.txt
+
 Tips for a successful pull request
 ==================================
 
 
@@ -89,6 +89,12 @@ without needing to have done ``pre-commit install`` beforehand.
     you may run into issues if you're using conda. To solve this, you can downgrade
     ``virtualenv`` to version ``20.0.33``.
 
+.. note::
+
+    If you have recently merged in main from the upstream branch, some of the
+    dependencies used by ``pre-commit`` may have changed.  Make sure to
+    :ref:`update your development environment <contributing.update-dev>`.
+
 Optional dependencies
 ---------------------
 
 
@@ -95,6 +95,8 @@ Option 1: using mamba (recommended)
    mamba env create --file environment.yml
    mamba activate pandas-dev
 
+.. _contributing.pip:
+
 Option 2: using pip
 ~~~~~~~~~~~~~~~~~~~
 
 
@@ -294,9 +294,9 @@ date_parser : function, default ``None``
   .. deprecated:: 2.0.0
    Use ``date_format`` instead, or read in as ``object`` and then apply
    :func:`to_datetime` as-needed.
-date_format : str, default ``None``
+date_format : str or dict of column -> format, default ``None``
    If used in conjunction with ``parse_dates``, will parse dates according to this
-   format. For anything more complex (e.g. different formats for different columns),
+   format. For anything more complex,
    please read in as ``object`` and then apply :func:`to_datetime` as-needed.
 
     .. versionadded:: 2.0.0
@@ -912,7 +912,7 @@ Finally, the parser allows you to specify a custom ``date_format``.
 Performance-wise, you should try these methods of parsing dates in order:
 
 1. If you know the format, use ``date_format``, e.g.:
-   ``date_format="%d/%m/%Y"``.
+   ``date_format="%d/%m/%Y"`` or ``date_format={column_name: "%d/%m/%Y"}``.
 
 2. If you have different formats for different columns, or want to pass any extra options (such
    as ``utc``) to ``to_datetime``, then you should read in your data as ``object`` dtype, and
 
@@ -222,6 +222,7 @@ Copy-on-Write improvements
   - :meth:`DataFrame.to_timestamp` / :meth:`Series.to_timestamp`
   - :meth:`DataFrame.to_period` / :meth:`Series.to_period`
   - :meth:`DataFrame.truncate`
+  - :meth:`DataFrame.iterrows`
   - :meth:`DataFrame.tz_convert` / :meth:`Series.tz_localize`
   - :meth:`DataFrame.fillna` / :meth:`Series.fillna`
   - :meth:`DataFrame.interpolate` / :meth:`Series.interpolate`
@@ -1107,7 +1108,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`)
 - Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`)
 - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`)
-- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`)
+- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked and arrow dtypes when :class:`Index` is monotonic (:issue:`50310`, :issue:`51365`)
 - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
 - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
 - Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`)
@@ -1125,7 +1126,7 @@ Performance improvements
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` (:issue:`50248`, :issue:`50632`)
 - Performance improvement in :class:`~arrays.ArrowExtensionArray` comparison methods when array contains NA (:issue:`50524`)
-- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
+- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`, :issue:`51227`)
 - Performance improvement when parsing strings to :class:`BooleanDtype` (:issue:`50613`)
 - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
 - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
 
@@ -424,6 +424,7 @@ class option_context(ContextDecorator):
 
     Examples
     --------
+    >>> from pandas import option_context
     >>> with option_context('display.max_rows', 10, 'display.max_columns', 5):
     ...     pass
     """
 
@@ -510,6 +510,18 @@ def __len__(self) -> int:
         """
         return len(self._data)
 
+    def __contains__(self, key) -> bool:
+        # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
+        if isna(key) and key is not self.dtype.na_value:
+            if self.dtype.kind == "f" and lib.is_float(key):
+                return bool(pc.any(pc.is_nan(self._data)).as_py())
+
+            # NA-like keys other than this dtype's own na_value (e.g. None for
+            # date or timestamp types) are rejected; TODO: maybe complex? object?
+            return False
+
+        return bool(super().__contains__(key))
+
     @property
     def _hasna(self) -> bool:
         return self._data.null_count > 0
@@ -868,12 +880,15 @@ def to_numpy(
             na_value = self.dtype.na_value
 
         pa_type = self._data.type
-        if (
-            is_object_dtype(dtype)
-            or pa.types.is_timestamp(pa_type)
-            or pa.types.is_duration(pa_type)
-        ):
+        if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type):
+            # temporal types with units and/or timezones currently
+            #  require pandas/python scalars to pass all tests
+            # TODO: improve performance (this is slow)
             result = np.array(list(self), dtype=dtype)
+        elif is_object_dtype(dtype) and self._hasna:
+            result = np.empty(len(self), dtype=object)
+            mask = ~self.isna()
+            result[mask] = np.asarray(self[mask]._data)
         else:
             result = np.asarray(self._data, dtype=dtype)
             if copy or self._hasna:
 
@@ -1,9 +1,20 @@
 from __future__ import annotations
 
+from datetime import (
+    date,
+    datetime,
+    time,
+    timedelta,
+)
+from decimal import Decimal
 import re
 
 import numpy as np
 
+from pandas._libs.tslibs import (
+    Timedelta,
+    Timestamp,
+)
 from pandas._typing import (
     TYPE_CHECKING,
     DtypeObj,
@@ -88,9 +99,40 @@ def __repr__(self) -> str:
     @property
     def type(self):
         """
-        Returns pyarrow.DataType.
+        Returns associated scalar type.
         """
-        return type(self.pyarrow_dtype)
+        pa_type = self.pyarrow_dtype
+        if pa.types.is_integer(pa_type):
+            return int
+        elif pa.types.is_floating(pa_type):
+            return float
+        elif pa.types.is_string(pa_type):
+            return str
+        elif pa.types.is_binary(pa_type):
+            return bytes
+        elif pa.types.is_boolean(pa_type):
+            return bool
+        elif pa.types.is_duration(pa_type):
+            if pa_type.unit == "ns":
+                return Timedelta
+            else:
+                return timedelta
+        elif pa.types.is_timestamp(pa_type):
+            if pa_type.unit == "ns":
+                return Timestamp
+            else:
+                return datetime
+        elif pa.types.is_date(pa_type):
+            return date
+        elif pa.types.is_time(pa_type):
+            return time
+        elif pa.types.is_decimal(pa_type):
+            return Decimal
+        elif pa.types.is_null(pa_type):
+            # TODO: None? pd.NA? pa.null?
+            return type(pa_type)
+        else:
+            raise NotImplementedError(pa_type)
 
     @property
     def name(self) -> str:  # type: ignore[override]
 
@@ -1392,8 +1392,14 @@ def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
         """
         columns = self.columns
         klass = self._constructor_sliced
+        using_cow = using_copy_on_write()
         for k, v in zip(self.index, self.values):
             s = klass(v, index=columns, name=k).__finalize__(self)
+            if using_cow and self._mgr.is_single_block:
+                s._mgr.blocks[0].refs = self._mgr.blocks[0].refs  # type: ignore[union-attr]  # noqa
+                s._mgr.blocks[0].refs.add_reference(  # type: ignore[union-attr]
+                    s._mgr.blocks[0]  # type: ignore[arg-type, union-attr]
+                )
             yield k, s
 
     def itertuples(
 
@@ -3329,7 +3329,7 @@ def to_latex(
         >>> print(df.to_latex(index=False,
         ...                   formatters={"name": str.upper},
         ...                   float_format="{:.1f}".format,
-        ... )  # doctest: +SKIP
+        ... ))  # doctest: +SKIP
         \begin{tabular}{lrr}
         \toprule
         name & age & height \\
 
@@ -933,7 +933,7 @@ def take(
 
         Examples
         --------
-        >>> df = DataFrame([('falcon', 'bird', 389.0),
+        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
         ...                    ('parrot', 'bird', 24.0),
         ...                    ('lion', 'mammal', 80.5),
         ...                    ('monkey', 'mammal', np.nan),
@@ -2358,7 +2358,7 @@ def take(
 
         Examples
         --------
-        >>> df = DataFrame([('falcon', 'bird', 389.0),
+        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
         ...                    ('parrot', 'bird', 24.0),
         ...                    ('lion', 'mammal', 80.5),
         ...                    ('monkey', 'mammal', np.nan),
@@ -2387,15 +2387,15 @@ def take(
         2 2    lion  mammal       80.5
           1  monkey  mammal        NaN
 
-        The order of the specified indices influnces the order in the result.
+        The order of the specified indices influences the order in the result.
         Here, the order is swapped from the previous example.
 
-        >>> gb.take([0, 1])
+        >>> gb.take([1, 0])
                name   class  max_speed
-        1 4  falcon    bird      389.0
-          3  parrot    bird       24.0
-        2 2    lion  mammal       80.5
-          1  monkey  mammal        NaN
+        1 3  parrot    bird       24.0
+          4  falcon    bird      389.0
+        2 1  monkey  mammal        NaN
+          2    lion  mammal       80.5
 
         Take elements at indices 1 and 2 along the axis 1 (column selection).
 
 
@@ -144,6 +144,7 @@
     validate_putmask,
 )
 from pandas.core.arrays import (
+    ArrowExtensionArray,
     BaseMaskedArray,
     Categorical,
     ExtensionArray,
@@ -4850,8 +4851,10 @@ def _can_use_libjoin(self) -> bool:
         if type(self) is Index:
             # excludes EAs, but include masks, we get here with monotonic
             # values only, meaning no NA
-            return isinstance(self.dtype, np.dtype) or isinstance(
-                self.values, BaseMaskedArray
+            return (
+                isinstance(self.dtype, np.dtype)
+                or isinstance(self.values, BaseMaskedArray)
+                or isinstance(self._values, ArrowExtensionArray)
             )
         return not is_interval_dtype(self.dtype)
 
@@ -4942,6 +4945,10 @@ def _get_join_target(self) -> ArrayLike:
         if isinstance(self._values, BaseMaskedArray):
             # This is only used if our array is monotonic, so no NAs present
             return self._values._data
+        elif isinstance(self._values, ArrowExtensionArray):
+            # This is only used if our array is monotonic, so no missing values
+            # present
+            return self._values.to_numpy()
         return self._get_engine_target()
 
     def _from_join_target(self, result: np.ndarray) -> ArrayLike:
@@ -4951,6 +4958,8 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike:
         """
         if isinstance(self.values, BaseMaskedArray):
             return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_))
+        elif isinstance(self.values, ArrowExtensionArray):
+            return type(self.values)._from_sequence(result)
         return result
 
     @doc(IndexOpsMixin._memory_usage)