diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 09bfda1755e03..5308c98e96937 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-minimum_pre_commit_version: 2.15.0
+minimum_pre_commit_version: 4.0.0
exclude: ^LICENSES/|\.(html|csv|svg)$
# reserve "manual" for relatively slow hooks which we still want to run in CI
default_stages: [
@@ -19,13 +19,13 @@ ci:
skip: [pyright, mypy]
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.9.9
+ rev: v0.11.4
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
exclude: ^pandas/tests/frame/test_query_eval.py
- id: ruff
- # TODO: remove autofixe-only rules when they are checked by ruff
+ # TODO: remove autofix-only rules when they are checked by ruff
name: ruff-selected-autofixes
alias: ruff-selected-autofixes
files: ^pandas
@@ -34,7 +34,7 @@ repos:
- id: ruff-format
exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
- repo: https://github.com/jendrikseipp/vulture
- rev: 'v2.14'
+ rev: v2.14
hooks:
- id: vulture
entry: python scripts/run_vulture.py
@@ -95,14 +95,14 @@ repos:
- id: sphinx-lint
args: ["--enable", "all", "--disable", "line-too-long"]
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v19.1.7
+ rev: v20.1.0
hooks:
- id: clang-format
files: ^pandas/_libs/src|^pandas/_libs/include
args: [-i]
types_or: [c, c++]
- repo: https://github.com/trim21/pre-commit-mirror-meson
- rev: v1.7.0
+ rev: v1.7.2
hooks:
- id: meson-fmt
args: ['--inplace']
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 6a2ab24df26fe..cd7851acae3f2 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -517,7 +517,7 @@ def setup(self):
self.df = DataFrame(np.random.randn(1000, 100))
self.s = Series(np.arange(1028.0))
- self.df2 = DataFrame({i: self.s for i in range(1028)})
+ self.df2 = DataFrame(dict.fromkeys(range(1028), self.s))
self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
def time_apply_user_func(self):
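A note on the ``dict.fromkeys`` rewrites that recur throughout this patch: ``dict.fromkeys(keys, value)`` binds every key to the same value object, so it is interchangeable with the ``{k: value for k in keys}`` comprehension only when the shared value is never mutated per key, as is the case here. A minimal standalone sketch:

    import pandas as pd

    s = pd.Series(range(3))
    d = dict.fromkeys(range(1028), s)
    # Every entry is the *same* Series object, matching {i: s for i in range(1028)}
    assert all(v is s for v in d.values())
    df = pd.DataFrame(d)  # the constructor copies, so sharing one object is safe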
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2c32eb4f0c584..a0d23aa0478d2 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -72,9 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
-i "pandas.Period.freq GL08" \
-i "pandas.Period.ordinal GL08" \
- -i "pandas.Timestamp.max PR02" \
- -i "pandas.Timestamp.min PR02" \
- -i "pandas.Timestamp.resolution PR02" \
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-i "pandas.core.resample.Resampler.quantile PR01,PR07" \
diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
index c7c72828db481..2aadf42a510eb 100644
--- a/ci/deps/actions-310-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -52,7 +52,7 @@ dependencies:
- pyxlsb=1.0.10
- s3fs=2022.11.0
- scipy=1.10.0
- - sqlalchemy=2.0.0
+ - sqlalchemy=1.4.36
- tabulate=0.9.0
- xarray=2022.12.0
- xlrd=2.0.1
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 74cab4e0970dc..5688d3143e621 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -50,7 +50,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 092ca18d61259..7713ae0232623 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -51,7 +51,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index b6f515dceaea9..c160eae364ba2 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -50,7 +50,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
index bc66f8a5382c9..034653d207c0b 100644
--- a/ci/deps/actions-312.yaml
+++ b/ci/deps/actions-312.yaml
@@ -50,7 +50,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index bda959f380e8a..5d11e9574091e 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -287,7 +287,7 @@ Traditional drivers are installable with ``pip install "pandas[postgresql, mysql
================================================================== ================== =============== ============================================
Dependency Minimum Version pip extra Notes
================================================================== ================== =============== ============================================
-`SQLAlchemy `__ 2.0.0 postgresql, SQL support for databases other than sqlite
+`SQLAlchemy `__ 1.4.36 postgresql, SQL support for databases other than sqlite
mysql,
sql-other
`psycopg2 `__ 2.9.6 postgresql PostgreSQL engine for sqlalchemy
diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
index fc180c8161a7e..004651ac0074f 100644
--- a/doc/source/reference/groupby.rst
+++ b/doc/source/reference/groupby.rst
@@ -79,6 +79,8 @@ Function application
DataFrameGroupBy.cumsum
DataFrameGroupBy.describe
DataFrameGroupBy.diff
+ DataFrameGroupBy.ewm
+ DataFrameGroupBy.expanding
DataFrameGroupBy.ffill
DataFrameGroupBy.first
DataFrameGroupBy.head
@@ -130,6 +132,8 @@ Function application
SeriesGroupBy.cumsum
SeriesGroupBy.describe
SeriesGroupBy.diff
+ SeriesGroupBy.ewm
+ SeriesGroupBy.expanding
SeriesGroupBy.ffill
SeriesGroupBy.first
SeriesGroupBy.head
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 230332319e0ac..d830dd8277ea9 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -38,6 +38,7 @@ Other enhancements
- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
- Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`)
+- Reverted the minimum version of the ``sqlalchemy`` optional dependency to ``1.4.36`` (:issue:`57049`)
- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index e6fafc8b1b14c..2d74be6f503a2 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -61,6 +61,7 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :meth:`Series.nlargest` now uses a stable sort internally and preserves the original ordering of tied values
- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
@@ -68,6 +69,7 @@ Other enhancements
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
- :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
+- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
@@ -420,6 +422,7 @@ Other Deprecations
- Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`)
- Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
- Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`)
+- Deprecated the ``arg`` parameter of :meth:`Series.map` in favour of the new ``func`` parameter (:issue:`61260`)
- Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`)
.. ---------------------------------------------------------------------------
@@ -591,6 +594,7 @@ Performance improvements
- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
+- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
- :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`)
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
@@ -621,6 +625,7 @@ Performance improvements
- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
- Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
- Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
+- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`)
- Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
- Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
@@ -636,6 +641,7 @@ Bug fixes
Categorical
^^^^^^^^^^^
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
+- Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`)
- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`)
-
@@ -648,6 +654,7 @@ Datetimelike
- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
+- Bug in :meth:`DataFrame.fillna` raising an ``AssertionError`` instead of ``OutOfBoundsDatetime`` when filling a ``datetime64[ns]`` column with an out-of-bounds timestamp (:issue:`61208`)
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`)
- Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`)
- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`)
@@ -762,6 +769,7 @@ Plotting
- Bug in :meth:`DataFrame.plot.bar` with ``stacked=True`` where labels on stacked bars with zero-height segments were incorrectly positioned at the base instead of the label position of the previous segment (:issue:`59429`)
- Bug in :meth:`DataFrame.plot.line` raising ``ValueError`` when set both color and a ``dict`` style (:issue:`59461`)
- Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`)
+- Bug in :meth:`Series.plot` preventing a line and scatter plot from being aligned (:issue:`61005`)
- Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`)
Groupby/resample/rolling
@@ -773,6 +781,7 @@ Groupby/resample/rolling
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where the index name was not preserved when the index had :class:`ArrowDtype` timestamp dtype (:issue:`61222`)
- Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`)
- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
diff --git a/environment.yml b/environment.yml
index ca8f1996c61cf..704bf5d767b86 100644
--- a/environment.yml
+++ b/environment.yml
@@ -54,7 +54,7 @@ dependencies:
- pyxlsb>=1.0.10
- s3fs>=2022.11.0
- scipy>=1.10.0
- - sqlalchemy>=2.0.0
+ - sqlalchemy>=1.4.36
- tabulate>=0.9.0
- xarray>=2022.12.0, <=2024.9.0
- xlrd>=2.0.1
@@ -80,7 +80,7 @@ dependencies:
- flake8=7.1.0 # run in subprocess over docstring examples
- mypy=1.13.0 # pre-commit uses locally installed mypy
- tokenize-rt # scripts/check_for_inconsistent_pandas_namespace.py
- - pre-commit>=4.0.1
+ - pre-commit>=4.2.0
# documentation
- gitpython # obtain contributors from git for whatsnew
diff --git a/pandas/__init__.py b/pandas/__init__.py
index c570fb8d70204..5dc6a8c3bc50c 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -4,19 +4,17 @@
# Let users know if they're missing any of our hard dependencies
_hard_dependencies = ("numpy", "dateutil")
-_missing_dependencies = []
for _dependency in _hard_dependencies:
try:
__import__(_dependency)
except ImportError as _e: # pragma: no cover
- _missing_dependencies.append(f"{_dependency}: {_e}")
+ raise ImportError(
+ f"Unable to import required dependency {_dependency}. "
+ "Please see the traceback for details."
+ ) from _e
-if _missing_dependencies: # pragma: no cover
- raise ImportError(
- "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies)
- )
-del _hard_dependencies, _dependency, _missing_dependencies
+del _hard_dependencies, _dependency
try:
# numpy compat
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index 979a5666661b2..c885543b2fc6d 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -3,7 +3,6 @@ from typing import (
ClassVar,
Literal,
TypeAlias,
- TypeVar,
overload,
)
@@ -60,7 +59,6 @@ UnitChoices: TypeAlias = Literal[
"nanos",
"nanosecond",
]
-_S = TypeVar("_S", bound=timedelta)
def get_unit_for_round(freq, creso: int) -> int: ...
def disallow_ambiguous_unit(unit: str | None) -> None: ...
@@ -95,11 +93,11 @@ class Timedelta(timedelta):
_value: int # np.int64
# error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]")
def __new__( # type: ignore[misc]
- cls: type[_S],
+ cls: type[Self],
value=...,
unit: str | None = ...,
**kwargs: float | np.integer | np.floating,
- ) -> _S | NaTType: ...
+ ) -> Self | NaTType: ...
@classmethod
def _from_value_and_reso(cls, value: np.int64, reso: int) -> Timedelta: ...
@property
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 23197b9a55afc..390267db8267f 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -200,8 +200,9 @@ class MinMaxReso:
See also: timedeltas.MinMaxReso
"""
- def __init__(self, name):
+ def __init__(self, name, docstring):
self._name = name
+ self.__doc__ = docstring
def __get__(self, obj, type=None):
cls = Timestamp
@@ -216,11 +217,15 @@ class MinMaxReso:
if obj is None:
# i.e. this is on the class, default to nanos
- return cls(val)
+ result = cls(val)
elif self._name == "resolution":
- return Timedelta._from_value_and_reso(val, obj._creso)
+ result = Timedelta._from_value_and_reso(val, obj._creso)
else:
- return Timestamp._from_value_and_reso(val, obj._creso, tz=None)
+ result = Timestamp._from_value_and_reso(val, obj._creso, tz=None)
+
+ result.__doc__ = self.__doc__
+
+ return result
def __set__(self, obj, value):
raise AttributeError(f"{self._name} is not settable.")
@@ -235,9 +240,74 @@ cdef class _Timestamp(ABCTimestamp):
dayofweek = _Timestamp.day_of_week
dayofyear = _Timestamp.day_of_year
- min = MinMaxReso("min")
- max = MinMaxReso("max")
- resolution = MinMaxReso("resolution") # GH#21336, GH#21365
+ _docstring_min = """
+ Returns the minimum bound possible for Timestamp.
+
+ This property provides access to the smallest possible value that
+ can be represented by a Timestamp object.
+
+ Returns
+ -------
+ Timestamp
+
+ See Also
+ --------
+ Timestamp.max: Returns the maximum bound possible for Timestamp.
+ Timestamp.resolution: Returns the smallest possible difference between
+ non-equal Timestamp objects.
+
+ Examples
+ --------
+ >>> pd.Timestamp.min
+ Timestamp('1677-09-21 00:12:43.145224193')
+ """
+
+ _docstring_max = """
+ Returns the maximum bound possible for Timestamp.
+
+ This property provides access to the largest possible value that
+ can be represented by a Timestamp object.
+
+ Returns
+ -------
+ Timestamp
+
+ See Also
+ --------
+ Timestamp.min: Returns the minimum bound possible for Timestamp.
+ Timestamp.resolution: Returns the smallest possible difference between
+ non-equal Timestamp objects.
+
+ Examples
+ --------
+ >>> pd.Timestamp.max
+ Timestamp('2262-04-11 23:47:16.854775807')
+ """
+
+ _docstring_reso = """
+ Returns the smallest possible difference between non-equal Timestamp objects.
+
+ The resolution value is determined by the underlying representation of time
+ units and is equivalent to Timedelta(nanoseconds=1).
+
+ Returns
+ -------
+ Timedelta
+
+ See Also
+ --------
+ Timestamp.max: Returns the maximum bound possible for Timestamp.
+ Timestamp.min: Returns the minimum bound possible for Timestamp.
+
+ Examples
+ --------
+ >>> pd.Timestamp.resolution
+ Timedelta('0 days 00:00:00.000000001')
+ """
+
+ min = MinMaxReso("min", _docstring_min)
+ max = MinMaxReso("max", _docstring_max)
+ resolution = MinMaxReso("resolution", _docstring_reso) # GH#21336, GH#21365
@property
def value(self) -> int:
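The ``MinMaxReso`` rework above follows a standard descriptor pattern: the descriptor carries the docstring and copies it onto whatever it returns, so ``help()`` works both on the class attribute and on instance access. A minimal pure-Python sketch of the pattern (names are illustrative, not pandas API):

    class _Value:
        pass

    class DocumentedConstant:
        """Descriptor that stores a docstring and copies it onto the returned value."""

        def __init__(self, value, docstring):
            self._value = value
            self.__doc__ = docstring

        def __get__(self, obj, objtype=None):
            result = self._value
            result.__doc__ = self.__doc__  # propagate docs to the returned object
            return result

    class Bounds:
        max = DocumentedConstant(_Value(), "The largest representable value.")

    assert Bounds.max.__doc__ == "The largest representable value."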
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 6b90389a62056..1d5a68c8f5d8a 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -47,7 +47,7 @@
"pyxlsb": "1.0.10",
"s3fs": "2022.11.0",
"scipy": "1.10.0",
- "sqlalchemy": "2.0.0",
+ "sqlalchemy": "1.4.36",
"tables": "3.8.0",
"tabulate": "0.9.0",
"xarray": "2022.12.0",
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 76f2fdad591ff..e6847b380a7e8 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -215,7 +215,7 @@ def _reconstruct_data(
values = cls._from_sequence(values, dtype=dtype) # type: ignore[assignment]
else:
- values = values.astype(dtype, copy=False)
+ values = values.astype(dtype, copy=False) # type: ignore[assignment]
return values
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index da6124307e3f1..2c96f1ef020ac 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -327,7 +327,7 @@ def transform(self) -> DataFrame | Series:
if is_series:
func = {com.get_callable_name(v) or v: v for v in func}
else:
- func = {col: func for col in obj}
+ func = dict.fromkeys(obj, func)
if is_dict_like(func):
func = cast(AggFuncTypeDict, func)
diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
index 8a920d1849bb3..eb5026454552c 100644
--- a/pandas/core/array_algos/quantile.py
+++ b/pandas/core/array_algos/quantile.py
@@ -102,7 +102,7 @@ def quantile_with_mask(
interpolation=interpolation,
)
- result = np.asarray(result)
+ result = np.asarray(result) # type: ignore[assignment]
result = result.T
return result
@@ -196,7 +196,7 @@ def _nanquantile(
# Caller is responsible for ensuring mask shape match
assert mask.shape == values.shape
result = [
- _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation)
+ _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation) # type: ignore[arg-type]
for (val, m) in zip(list(values), list(mask))
]
if values.dtype.kind == "f":
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 4e6f20e6ad3dd..26585e7bab8e3 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -142,18 +142,12 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike:
dt64_values = arr.view(dtype)
return DatetimeArray._simple_new(dt64_values, dtype=dtype)
-
elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
from pandas.core.arrays import TimedeltaArray
td64_values = arr.view(dtype)
return TimedeltaArray._simple_new(td64_values, dtype=dtype)
-
- # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
- # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
- # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
- # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
- return arr.view(dtype=dtype) # type: ignore[arg-type]
+ return arr.view(dtype=dtype)
def take(
self,
diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py
index 285c3fd465ffc..7da83e2257e30 100644
--- a/pandas/core/arrays/arrow/_arrow_utils.py
+++ b/pandas/core/arrays/arrow/_arrow_utils.py
@@ -44,7 +44,7 @@ def pyarrow_array_to_numpy_and_mask(
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
)
- mask = np.asarray(mask)
+ mask = np.asarray(mask) # type: ignore[assignment]
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 9295cf7873d98..d7187b57a69e4 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2540,7 +2540,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
dummies_dtype = np.bool_
dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype)
dummies[indices] = True
- dummies = dummies.reshape((n_rows, n_cols))
+ dummies = dummies.reshape((n_rows, n_cols)) # type: ignore[assignment]
result = type(self)(pa.array(list(dummies)))
return result, uniques_sorted.to_pylist()
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 42be07e03bad8..d0048e122051a 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -596,7 +596,7 @@ def to_numpy(
if copy or na_value is not lib.no_default:
result = result.copy()
if na_value is not lib.no_default:
- result[self.isna()] = na_value
+ result[self.isna()] = na_value # type: ignore[index]
return result
# ------------------------------------------------------------------------
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 647530151d5f6..df1aa21e9203c 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -452,7 +452,7 @@ def __init__(
if isinstance(values, Index):
arr = values._data._pa_array.combine_chunks()
else:
- arr = values._pa_array.combine_chunks()
+ arr = extract_array(values)._pa_array.combine_chunks()
categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype)
codes = arr.indices.to_numpy()
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
@@ -1853,7 +1853,7 @@ def value_counts(self, dropna: bool = True) -> Series:
count = np.bincount(obs, minlength=ncat or 0)
else:
count = np.bincount(np.where(mask, code, ncat))
- ix = np.append(ix, -1)
+ ix = np.append(ix, -1) # type: ignore[assignment]
ix = coerce_indexer_dtype(ix, self.dtype.categories)
ix_categorical = self._from_backing_data(ix)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index b27bf19f2f593..994d7b1d0081c 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -2394,7 +2394,7 @@ def take(
)
indices = np.asarray(indices, dtype=np.intp)
- maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
+ maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) # type: ignore[arg-type]
if isinstance(maybe_slice, slice):
freq = self._get_getitem_freq(maybe_slice)
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index df40c9c11b117..b31c543188282 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -331,7 +331,7 @@ def _simple_new( # type: ignore[override]
else:
# DatetimeTZDtype. If we have e.g. DatetimeTZDtype[us, UTC],
# then values.dtype should be M8[us].
- assert dtype._creso == get_unit_from_dtype(values.dtype)
+ assert dtype._creso == get_unit_from_dtype(values.dtype) # type: ignore[union-attr]
result = super()._simple_new(values, dtype)
result._freq = freq
@@ -542,7 +542,7 @@ def _unbox_scalar(self, value) -> np.datetime64:
raise ValueError("'value' should be a Timestamp.")
self._check_compatible_with(value)
if value is NaT:
- return np.datetime64(value._value, self.unit)
+ return np.datetime64(value._value, self.unit) # type: ignore[call-overload]
else:
return value.as_unit(self.unit, round_ok=False).asm8
@@ -813,10 +813,7 @@ def _add_offset(self, offset: BaseOffset) -> Self:
try:
res_values = offset._apply_array(values._ndarray)
if res_values.dtype.kind == "i":
- # error: Argument 1 to "view" of "ndarray" has incompatible type
- # "dtype[datetime64] | DatetimeTZDtype"; expected
- # "dtype[Any] | type[Any] | _SupportsDType[dtype[Any]]"
- res_values = res_values.view(values.dtype) # type: ignore[arg-type]
+ res_values = res_values.view(values.dtype)
except NotImplementedError:
if get_option("performance_warnings"):
warnings.warn(
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 07c875337e4f6..62e6119204bd5 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -515,7 +515,7 @@ def tolist(self) -> list:
if self.ndim > 1:
return [x.tolist() for x in self]
dtype = None if self._hasna else self._data.dtype
- return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist()
+ return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist() # type: ignore[return-value]
@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ...
@@ -1497,10 +1497,10 @@ def all(
result = values.all(axis=axis)
if skipna:
- return result
+ return result # type: ignore[return-value]
else:
if not result or len(self) == 0 or not self._mask.any():
- return result
+ return result # type: ignore[return-value]
else:
return self.dtype.na_value
diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py
index cc9fd2d5fb8b0..d4ef3003583c3 100644
--- a/pandas/core/arrays/sparse/scipy_sparse.py
+++ b/pandas/core/arrays/sparse/scipy_sparse.py
@@ -79,7 +79,7 @@ def _levels_to_axis(
ax_coords = codes[valid_ilocs]
ax_labels = ax_labels.tolist()
- return ax_coords, ax_labels
+ return ax_coords, ax_labels # pyright: ignore[reportReturnType]
def _to_ijv(
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index d35083fd892a8..a39d64429d162 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -281,7 +281,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
]
# short-circuit to return all False array.
- if not len(value_set):
+ if not value_set:
return np.zeros(len(self), dtype=bool)
result = pc.is_in(
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index c5b3129c506c8..9012b9f36348a 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -325,7 +325,7 @@ def _unbox_scalar(self, value) -> np.timedelta64:
raise ValueError("'value' should be a Timedelta.")
self._check_compatible_with(value)
if value is NaT:
- return np.timedelta64(value._value, self.unit)
+ return np.timedelta64(value._value, self.unit) # type: ignore[call-overload]
else:
return value.as_unit(self.unit, round_ok=False).asm8
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 6cc28d4e46634..8304af48c39ac 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -875,7 +875,7 @@ def tolist(self) -> list:
>>> idx.to_list()
[1, 2, 3]
"""
- return self._values.tolist()
+ return self._values.tolist() # type: ignore[return-value]
to_list = tolist
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6a45ef9325bec..884107d4bc6af 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -9705,7 +9705,7 @@ def _where(
# CoW: Make sure reference is not kept alive
if cond.ndim == 1 and self.ndim == 2:
cond = cond._constructor_expanddim(
- {i: cond for i in range(len(self.columns))},
+ dict.fromkeys(range(len(self.columns)), cond),
copy=False,
)
cond.columns = self.columns
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 1251403db6ff3..a1c1163435611 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -2142,7 +2142,7 @@ def _wrap_applied_output_series(
if stacked_values.dtype == object:
# We'll have the DataFrame constructor do inference
- stacked_values = stacked_values.tolist()
+ stacked_values = stacked_values.tolist() # type: ignore[assignment]
result = self.obj._constructor(stacked_values, index=index, columns=columns)
if not self.as_index:
@@ -2505,7 +2505,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
)
results = [func(sgb) for sgb in sgbs]
- if not len(results):
+ if not results:
# concat would raise
res_df = DataFrame([], columns=columns, index=self._grouper.result_index)
else:
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index f9438b348c140..7d58d8f867c12 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1878,7 +1878,7 @@ def _apply_filter(self, indices, dropna):
mask.fill(False)
mask[indices.astype(int)] = True
# mask fails to broadcast when passed to where; broadcast manually.
- mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
+ mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T # type: ignore[assignment]
filtered = self._selected_obj.where(mask) # Fill with NaNs.
return filtered
@@ -3803,16 +3803,58 @@ def rolling(
)
@final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
def expanding(self, *args, **kwargs) -> ExpandingGroupby:
"""
- Return an expanding grouper, providing expanding
- functionality per group.
+ Return an expanding grouper, providing expanding functionality per group.
+
+ Arguments are the same as :meth:`DataFrame.rolling` except that ``step`` cannot
+ be specified.
+
+ Parameters
+ ----------
+ *args : tuple
+ Positional arguments passed to the expanding window constructor.
+ **kwargs : dict
+ Keyword arguments passed to the expanding window constructor.
Returns
-------
pandas.api.typing.ExpandingGroupby
+ An object that supports expanding transformations over each group.
+
+ See Also
+ --------
+ Series.expanding : Expanding transformations for Series.
+ DataFrame.expanding : Expanding transformations for DataFrames.
+ Series.groupby : Apply a function groupby to a Series.
+ DataFrame.groupby : Apply a function groupby.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(
+ ... {
+ ... "Class": ["A", "A", "A", "B", "B", "B"],
+ ... "Value": [10, 20, 30, 40, 50, 60],
+ ... }
+ ... )
+ >>> df
+ Class Value
+ 0 A 10
+ 1 A 20
+ 2 A 30
+ 3 B 40
+ 4 B 50
+ 5 B 60
+
+ >>> df.groupby("Class").expanding().mean()
+ Value
+ Class
+ A 0 10.0
+ 1 15.0
+ 2 20.0
+ B 3 40.0
+ 4 45.0
+ 5 50.0
"""
from pandas.core.window import ExpandingGroupby
@@ -3824,15 +3866,79 @@ def expanding(self, *args, **kwargs) -> ExpandingGroupby:
)
@final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
"""
Return an ewm grouper, providing ewm functionality per group.
+ Parameters
+ ----------
+ *args : tuple
+ Positional arguments passed to the EWM window constructor.
+ **kwargs : dict
+ Keyword arguments passed to the EWM window constructor, such as:
+
+ com : float, optional
+ Specify decay in terms of center of mass.
+ ``span``, ``halflife``, and ``alpha`` are alternative ways to specify
+ decay.
+ span : float, optional
+ Specify decay in terms of span.
+ halflife : float, optional
+ Specify decay in terms of half-life.
+ alpha : float, optional
+ Specify smoothing factor directly.
+ min_periods : int, default 0
+ Minimum number of observations in the window required to have a value;
+ otherwise, result is ``np.nan``.
+ adjust : bool, default True
+ Divide by decaying adjustment factor to account for imbalance in
+ relative weights.
+ ignore_na : bool, default False
+ Ignore missing values when calculating weights.
+ times : str or array-like of datetime64, optional
+ Times corresponding to the observations.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ Axis along which the EWM function is applied.
+
Returns
-------
pandas.api.typing.ExponentialMovingWindowGroupby
+ An object that supports exponentially weighted moving transformations over
+ each group.
+
+ See Also
+ --------
+ Series.ewm : EWM transformations for Series.
+ DataFrame.ewm : EWM transformations for DataFrames.
+ Series.groupby : Apply a function groupby to a Series.
+ DataFrame.groupby : Apply a function groupby.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(
+ ... {
+ ... "Class": ["A", "A", "A", "B", "B", "B"],
+ ... "Value": [10, 20, 30, 40, 50, 60],
+ ... }
+ ... )
+ >>> df
+ Class Value
+ 0 A 10
+ 1 A 20
+ 2 A 30
+ 3 B 40
+ 4 B 50
+ 5 B 60
+
+ >>> df.groupby("Class").ewm(com=0.5).mean()
+ Value
+ Class
+ A 0 10.000000
+ 1 17.500000
+ 2 26.153846
+ B 3 40.000000
+ 4 47.500000
+ 5 56.153846
"""
from pandas.core.window import ExponentialMovingWindowGroupby
@@ -4441,11 +4547,11 @@ def blk_func(values: ArrayLike) -> ArrayLike:
)
if vals.ndim == 1:
- out = out.ravel("K")
+ out = out.ravel("K") # type: ignore[assignment]
if result_mask is not None:
- result_mask = result_mask.ravel("K")
+ result_mask = result_mask.ravel("K") # type: ignore[assignment]
else:
- out = out.reshape(ncols, ngroups * nqs)
+ out = out.reshape(ncols, ngroups * nqs) # type: ignore[assignment]
return post_processor(out, inference, result_mask, orig_vals)
@@ -5175,8 +5281,8 @@ def diff(
shifted = shifted.astype("float32")
else:
to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
- if len(to_coerce):
- shifted = shifted.astype({c: "float32" for c in to_coerce})
+ if to_coerce:
+ shifted = shifted.astype(dict.fromkeys(to_coerce, "float32"))
return obj - shifted
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c4c7f73ee166c..75f3495041917 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -1131,7 +1131,7 @@ def get_iterator(self, data: NDFrame):
"""
slicer = lambda start, edge: data.iloc[start:edge]
- start = 0
+ start: np.int64 | int = 0
for edge, label in zip(self.bins, self.binlabels):
if label is not NaT:
yield label, slicer(start, edge)
@@ -1144,7 +1144,7 @@ def get_iterator(self, data: NDFrame):
def indices(self):
indices = collections.defaultdict(list)
- i = 0
+ i: np.int64 | int = 0
for label, bin in zip(self.binlabels, self.bins):
if i < bin:
if label is not NaT:
diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py
index 88379164534f2..6fc638e85bc5e 100644
--- a/pandas/core/indexers/objects.py
+++ b/pandas/core/indexers/objects.py
@@ -131,8 +131,8 @@ def get_window_bounds(
if closed in ["left", "neither"]:
end -= 1
- end = np.clip(end, 0, num_values)
- start = np.clip(start, 0, num_values)
+ end = np.clip(end, 0, num_values) # type: ignore[assignment]
+ start = np.clip(start, 0, num_values) # type: ignore[assignment]
return start, end
@@ -402,7 +402,7 @@ def get_window_bounds(
start = np.arange(0, num_values, step, dtype="int64")
end = start + self.window_size
if self.window_size:
- end = np.clip(end, 0, num_values)
+ end = np.clip(end, 0, num_values) # type: ignore[assignment]
return start, end
@@ -488,7 +488,7 @@ def get_window_bounds(
)
window_indices_start += len(indices)
# Extend as we'll be slicing window like [start, end)
- window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype(
+ window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype( # type: ignore[assignment]
np.int64, copy=False
)
start_arrays.append(window_indices.take(ensure_platform_int(start)))
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 13811c28e6c1e..8c40b630e8cfd 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -1279,14 +1279,7 @@ def interval_range(
breaks = np.linspace(start, end, periods)
if all(is_integer(x) for x in com.not_none(start, end, freq)):
# np.linspace always produces float output
-
- # error: Argument 1 to "maybe_downcast_numeric" has incompatible type
- # "Union[ndarray[Any, Any], TimedeltaIndex, DatetimeIndex]";
- # expected "ndarray[Any, Any]" [
- breaks = maybe_downcast_numeric(
- breaks, # type: ignore[arg-type]
- dtype,
- )
+ breaks = maybe_downcast_numeric(breaks, dtype)
else:
# delegate to the appropriate range function
if isinstance(endpoint, Timestamp):
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index bbbcc4da9fb39..34a437ba40bd8 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1582,11 +1582,7 @@ def _validate_key(self, key, axis: AxisInt) -> None:
if com.is_bool_indexer(key):
if hasattr(key, "index") and isinstance(key.index, Index):
if key.index.inferred_type == "integer":
- raise NotImplementedError(
- "iLocation based boolean "
- "indexing on an integer type "
- "is not available"
- )
+ return
raise ValueError(
"iLocation based boolean indexing cannot use an indexable as a mask"
)
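This ``_validate_key`` change is what backs the v3.0.0 whatsnew entry on boolean masks in ``iloc``: a boolean ``Series`` whose index happens to be integer-typed no longer raises ``NotImplementedError`` in ``__getitem__``. A usage sketch of the newly allowed pattern:

    import pandas as pd

    s = pd.Series([10, 20, 30], index=[2, 1, 0])
    mask = pd.Series([True, False, True], index=[2, 1, 0])
    s.iloc[mask.to_numpy()]  # plain ndarray masks have always worked
    s.iloc[mask]             # previously NotImplementedError, now selects positionally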
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index dc64da35e9725..6aa5062b8ed86 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -805,7 +805,7 @@ def replace_list(
for x, y in zip(src_list, dest_list)
if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x)))
]
- if not len(pairs):
+ if not pairs:
return [self.copy(deep=False)]
src_len = len(pairs) - 1
@@ -1679,6 +1679,8 @@ def where(self, other, cond) -> list[Block]:
try:
res_values = arr._where(cond, other).T
+ except OutOfBoundsDatetime:
+ raise
except (ValueError, TypeError):
if self.ndim == 1 or self.shape[0] == 1:
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
@@ -1746,6 +1748,8 @@ def putmask(self, mask, new) -> list[Block]:
try:
# Caller is responsible for ensuring matching lengths
values._putmask(mask, new)
+ except OutOfBoundsDatetime:
+ raise
except (TypeError, ValueError):
if self.ndim == 1 or self.shape[0] == 1:
if isinstance(self.dtype, IntervalDtype):
@@ -2094,7 +2098,7 @@ def _unstack(
self.values.take(
indices, allow_fill=needs_masking[i], fill_value=fill_value
),
- BlockPlacement(place),
+ BlockPlacement(place), # type: ignore[arg-type]
ndim=2,
)
for i, (indices, place) in enumerate(zip(new_values, new_placement))
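The two new ``except OutOfBoundsDatetime: raise`` clauses above let that exception surface directly instead of falling into the generic coercion fallback, which previously ended in an ``AssertionError`` (see the v3.0.0 whatsnew note for :issue:`61208`). A usage sketch of the corrected behavior:

    import pandas as pd
    from pandas.errors import OutOfBoundsDatetime

    df = pd.DataFrame({"ts": pd.to_datetime(["2020-01-01", None])})  # datetime64[ns]
    try:
        df.fillna(pd.Timestamp("3000-01-01"))  # not representable in nanoseconds
    except OutOfBoundsDatetime as err:
        print(err)  # raised directly rather than an AssertionError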
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 69da2be0306f6..35de97d570bd3 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -634,7 +634,7 @@ def reorder_arrays(
arr = np.empty(length, dtype=object)
arr.fill(np.nan)
else:
- arr = arrays[k]
+ arr = arrays[k] # type: ignore[assignment]
new_arrays.append(arr)
arrays = new_arrays
@@ -864,7 +864,7 @@ def _finalize_columns_and_data(
# GH#26429 do not raise user-facing AssertionError
raise ValueError(err) from err
- if len(contents) and contents[0].dtype == np.object_:
+ if contents and contents[0].dtype == np.object_:
contents = convert_object_array(contents, dtype=dtype)
return contents, columns
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index a3738bb25f56c..e238bb78bbdfa 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1298,7 +1298,7 @@ def value_getitem(placement):
# Defer setting the new values to enable consolidation
self._iset_split_block(blkno_l, blk_locs, refs=refs)
- if len(removed_blknos):
+ if removed_blknos:
# Remove blocks & update blknos accordingly
is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
is_deleted[removed_blknos] = True
diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py
index 02e7445f1d275..59516b16905dc 100644
--- a/pandas/core/methods/selectn.py
+++ b/pandas/core/methods/selectn.py
@@ -11,6 +11,7 @@
from typing import (
TYPE_CHECKING,
Generic,
+ Literal,
cast,
final,
)
@@ -54,7 +55,9 @@
class SelectN(Generic[NDFrameT]):
- def __init__(self, obj: NDFrameT, n: int, keep: str) -> None:
+ def __init__(
+ self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"]
+ ) -> None:
self.obj = obj
self.n = n
self.keep = keep
@@ -111,15 +114,25 @@ def compute(self, method: str) -> Series:
if n <= 0:
return self.obj[[]]
- dropped = self.obj.dropna()
- nan_index = self.obj.drop(dropped.index)
+ # Save the original index and reset to a default index to avoid the
+ # performance penalty incurred when the index contains duplicates
+ original_index: Index = self.obj.index
+ default_index = self.obj.reset_index(drop=True)
- # slow method
- if n >= len(self.obj):
+ # Slower method used when taking the full length of the series
+ # In this case, it is equivalent to a sort.
+ if n >= len(default_index):
ascending = method == "nsmallest"
- return self.obj.sort_values(ascending=ascending).head(n)
+ result = default_index.sort_values(ascending=ascending, kind="stable").head(
+ n
+ )
+ result.index = original_index.take(result.index)
+ return result
+
+ # Fast method used in the general case
+ dropped = default_index.dropna()
+ nan_index = default_index.drop(dropped.index)
- # fast method
new_dtype = dropped.dtype
# Similar to algorithms._ensure_data
@@ -158,7 +171,7 @@ def compute(self, method: str) -> Series:
else:
kth_val = np.nan
(ns,) = np.nonzero(arr <= kth_val)
- inds = ns[arr[ns].argsort(kind="mergesort")]
+ inds = ns[arr[ns].argsort(kind="stable")]
if self.keep != "all":
inds = inds[:n]
@@ -173,7 +186,9 @@ def compute(self, method: str) -> Series:
# reverse indices
inds = narr - 1 - inds
- return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
+ result = concat([dropped.iloc[inds], nan_index]).iloc[:findex]
+ result.index = original_index.take(result.index)
+ return result
class SelectNFrame(SelectN[DataFrame]):
@@ -192,7 +207,13 @@ class SelectNFrame(SelectN[DataFrame]):
nordered : DataFrame
"""
- def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
+ def __init__(
+ self,
+ obj: DataFrame,
+ n: int,
+ keep: Literal["first", "last", "all"],
+ columns: IndexLabel,
+ ) -> None:
super().__init__(obj, n, keep)
if not is_list_like(columns) or isinstance(columns, tuple):
columns = [columns]
@@ -277,4 +298,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index:
ascending = method == "nsmallest"
- return frame.sort_values(columns, ascending=ascending, kind="mergesort")
+ return frame.sort_values(columns, ascending=ascending, kind="stable")
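Behaviorally, switching to ``kind="stable"`` together with the reset-index trick means :meth:`Series.nlargest` keeps tied values in their original row order and no longer degrades when the index contains duplicates. A small usage sketch:

    import pandas as pd

    s = pd.Series([3, 3, 3, 1], index=["a", "a", "b", "b"])
    # The tied 3s come back in their original order; duplicate index labels are fine.
    print(s.nlargest(2))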
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index e2fb3b9a6fc0b..66609fa870f14 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -241,7 +241,8 @@ def find_valid_index(how: str, is_valid: npt.NDArray[np.bool_]) -> int | None:
return None
if is_valid.ndim == 2:
- is_valid = is_valid.any(axis=1) # reduce axis 1
+ # reduce axis 1
+ is_valid = is_valid.any(axis=1) # type: ignore[assignment]
if how == "first":
idxpos = is_valid[::].argmax()
@@ -404,10 +405,7 @@ def func(yvalues: np.ndarray) -> None:
**kwargs,
)
- # error: No overload variant of "apply_along_axis" matches
- # argument types "Callable[[ndarray[Any, Any]], None]",
- # "int", "ndarray[Any, Any]"
- np.apply_along_axis(func, axis, data) # type: ignore[call-overload]
+ np.apply_along_axis(func, axis, data)
def _index_to_interp_indices(index: Index, method: str) -> np.ndarray:
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 753f7fb6cea1a..08e3beef99e60 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -518,6 +518,7 @@ def _wrap_result(self, result):
if self._timegrouper._arrow_dtype is not None:
result.index = result.index.astype(self._timegrouper._arrow_dtype)
+ result.index.name = self.obj.index.name
return result
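The one-line resample fix restores the index name that was dropped when casting the resampled index back to an Arrow-backed timestamp dtype. A usage sketch (assumes pyarrow is installed):

    import pandas as pd

    idx = pd.to_datetime(["2024-01-01", "2024-01-02"]).astype("timestamp[ns][pyarrow]")
    s = pd.Series([1, 2], index=idx.rename("when"))
    # After this fix the name survives the round-trip through resampling.
    assert s.resample("D").sum().index.name == "when"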
diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
index 6a590ee5b227e..54704b274b74f 100644
--- a/pandas/core/reshape/encoding.py
+++ b/pandas/core/reshape/encoding.py
@@ -357,7 +357,7 @@ def get_empty_frame(data) -> DataFrame:
if drop_first:
# remove first GH12042
- dummy_mat = dummy_mat[:, 1:]
+ dummy_mat = dummy_mat[:, 1:] # type: ignore[assignment]
dummy_cols = dummy_cols[1:]
return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 09be82c59a5c6..68d61da0cf7dd 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2921,9 +2921,7 @@ def _convert_arrays_and_get_rizer_klass(
lk = lk.astype(dtype, copy=False)
rk = rk.astype(dtype, copy=False)
if isinstance(lk, BaseMaskedArray):
- # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
- # expected type "Type[object]"
- klass = _factorizers[lk.dtype.type] # type: ignore[index]
+ klass = _factorizers[lk.dtype.type]
elif isinstance(lk.dtype, ArrowDtype):
klass = _factorizers[lk.dtype.numpy_dtype.type]
else:
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index c60fe71a7ff28..d2a838b616426 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -936,7 +936,20 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
[k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels]
)
- result = stack_reshape(frame, level, set_levels, stack_cols)
+ result: Series | DataFrame
+ if not isinstance(frame.columns, MultiIndex):
+ # GH#58817 Fast path when we're stacking the columns of a non-MultiIndex.
+ # When columns are homogeneous EAs, we pass through object
+ # dtype but this is still slightly faster than the normal path.
+ if len(frame.columns) > 0 and frame._is_homogeneous_type:
+ dtype = frame._mgr.blocks[0].dtype
+ else:
+ dtype = None
+ result = frame._constructor_sliced(
+ frame._values.reshape(-1, order="F"), dtype=dtype
+ )
+ else:
+ result = stack_reshape(frame, level, set_levels, stack_cols)
# Construct the correct MultiIndex by combining the frame's index and
# stacked columns.
@@ -1018,6 +1031,8 @@ def stack_reshape(
-------
The data of behind the stacked DataFrame.
"""
+ # non-MultiIndex columns take a fast path in stack_v3.
+ assert isinstance(frame.columns, MultiIndex)
# If we need to drop `level` from columns, it needs to be in descending order
drop_levnums = sorted(level, reverse=True)
@@ -1027,18 +1042,14 @@ def stack_reshape(
if len(frame.columns) == 1:
data = frame.copy(deep=False)
else:
- if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple):
- # GH#57750 - if the frame is an Index with tuples, .loc below will fail
- column_indexer = idx
- else:
- # Take the data from frame corresponding to this idx value
- if len(level) == 1:
- idx = (idx,)
- gen = iter(idx)
- column_indexer = tuple(
- next(gen) if k in set_levels else slice(None)
- for k in range(frame.columns.nlevels)
- )
+ # Take the data from frame corresponding to this idx value
+ if len(level) == 1:
+ idx = (idx,)
+ gen = iter(idx)
+ column_indexer = tuple(
+ next(gen) if k in set_levels else slice(None)
+ for k in range(frame.columns.nlevels)
+ )
data = frame.loc[:, column_indexer]
if len(level) < frame.columns.nlevels:
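To see the fast path added in ``stack_v3`` in action: stacking a frame with flat (non-MultiIndex) columns now reshapes the underlying values in one go instead of slicing column by column. A usage sketch under the new stack implementation:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.arange(6).reshape(2, 3), columns=["a", "b", "c"])
    stacked = df.stack()  # hits the non-MultiIndex fast path
    assert stacked.shape == (6,)  # one value per (row, column) pair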
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 03a2ce85a08c9..d6a982c65e9fd 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -52,6 +52,9 @@
doc,
set_module,
)
+from pandas.util._exceptions import (
+ find_stack_level,
+)
from pandas.util._validators import (
validate_ascending,
validate_bool_kwarg,
@@ -4320,7 +4323,7 @@ def unstack(
def map(
self,
- arg: Callable | Mapping | Series,
+ func: Callable | Mapping | Series | None = None,
na_action: Literal["ignore"] | None = None,
**kwargs,
) -> Series:
@@ -4333,8 +4336,8 @@ def map(
Parameters
----------
- arg : function, collections.abc.Mapping subclass or Series
- Mapping correspondence.
+ func : function, collections.abc.Mapping subclass or Series
+ Function or mapping correspondence.
na_action : {None, 'ignore'}, default None
If 'ignore', propagate NaN values, without passing them to the
mapping correspondence.
@@ -4404,9 +4407,22 @@ def map(
3 I am a rabbit
dtype: object
"""
- if callable(arg):
- arg = functools.partial(arg, **kwargs)
- new_values = self._map_values(arg, na_action=na_action)
+ if func is None:
+ if "arg" in kwargs:
+ # `.map(arg=my_func)`
+ func = kwargs.pop("arg")
+ warnings.warn(
+ "The parameter `arg` has been renamed to `func`, and it "
+ "will stop being supported in a future version of pandas.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
+ else:
+ raise ValueError("The `func` parameter is required")
+
+ if callable(func):
+ func = functools.partial(func, **kwargs)
+ new_values = self._map_values(func, na_action=na_action)
return self._constructor(new_values, index=self.index, copy=False).__finalize__(
self, method="map"
)
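With the ``Series.map`` signature change above, ``arg`` is still accepted through ``**kwargs`` but emits a ``FutureWarning``; ``func`` is the supported name going forward. Usage sketch:

    import pandas as pd

    s = pd.Series([1, 2, 3])
    s.map(lambda x: x + 1)        # positional use is unchanged
    s.map(func=lambda x: x + 1)   # new, preferred keyword
    s.map(arg=lambda x: x + 1)    # still works, but warns about the rename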
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 0d8f42694ccb4..18983af12976c 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -476,7 +476,7 @@ def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0):
zipped = zip(arr_values, mask)
else:
zipped = zip(arr_values.T, mask.T)
- return np.array([_nanargminmax(v, m, func) for v, m in zipped])
+ return np.array([_nanargminmax(v, m, func) for v, m in zipped]) # type: ignore[arg-type]
return func(arr_values, axis=axis)
return _nanargminmax(arr_values, mask, func)
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index ebcafce8f4de2..1dc6c1f08b49a 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -89,7 +89,7 @@
)
_read_excel_doc = (
"""
-Read an Excel file into a ``pandas`` ``DataFrame``.
+Read an Excel file into a ``DataFrame``.
Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
read from a local filesystem or URL. Supports an option to read
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index fb799361fea67..189dfc1dde6aa 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -566,7 +566,7 @@ def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceTyp
result = {}
elif isinstance(col_space, (int, str)):
result = {"": col_space}
- result.update({column: col_space for column in self.frame.columns})
+ result.update(dict.fromkeys(self.frame.columns, col_space))
elif isinstance(col_space, Mapping):
for column in col_space.keys():
if column not in self.frame.columns and column != "":
@@ -1495,7 +1495,7 @@ def _format_strings(self) -> list[str]:
fmt_values = values._format_native_types(
na_rep=self.nat_rep, date_format=self.date_format
)
- return fmt_values.tolist()
+ return fmt_values.tolist() # type: ignore[return-value]
class _ExtensionArrayFormatter(_GenericArrayFormatter):
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index c9a6e94a0c7c1..eb579f7149d44 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -249,7 +249,7 @@
Print a concise summary of a {klass}.
This method prints information about a {klass} including
- the index dtype{type_sub}, non-null values and memory usage.
+ the index dtype{type_sub}, non-NA values and memory usage.
{version_added_sub}\
Parameters
diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py
index 482ed316c7ce4..6752c83d5169b 100644
--- a/pandas/io/formats/style_render.py
+++ b/pandas/io/formats/style_render.py
@@ -1225,7 +1225,7 @@ def format(
data = self.data.loc[subset]
if not isinstance(formatter, dict):
- formatter = {col: formatter for col in data.columns}
+ formatter = dict.fromkeys(data.columns, formatter)
cis = self.columns.get_indexer_for(data.columns)
ris = self.index.get_indexer_for(data.index)
@@ -1411,7 +1411,7 @@ def format_index(
return self # clear the formatter / revert to default and avoid looping
if not isinstance(formatter, dict):
- formatter = {level: formatter for level in levels_}
+ formatter = dict.fromkeys(levels_, formatter)
else:
formatter = {
obj._get_level_number(level): formatter_
@@ -1708,7 +1708,7 @@ def format_index_names(
return self # clear the formatter / revert to default and avoid looping
if not isinstance(formatter, dict):
- formatter = {level: formatter for level in levels_}
+ formatter = dict.fromkeys(levels_, formatter)
else:
formatter = {
obj._get_level_number(level): formatter_
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index e7b5c7f06a79a..547d8c1fe3d19 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -1468,7 +1468,7 @@ def detect_colspecs(
shifted[0] = 0
edges = np.where((mask ^ shifted) == 1)[0]
edge_pairs = list(zip(edges[::2], edges[1::2]))
- return edge_pairs
+ return edge_pairs # type: ignore[return-value]
def __next__(self) -> list[str]:
# Argument 1 to "next" has incompatible type "Union[IO[str],
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index a689cfbcb1418..c58b4a4be6df1 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -39,6 +39,7 @@
)
from pandas._libs.lib import is_string_array
from pandas._libs.tslibs import timezones
+from pandas.compat import HAS_PYARROW
from pandas.compat._optional import import_optional_dependency
from pandas.compat.pickle_compat import patch_pickle
from pandas.errors import (
@@ -381,6 +382,13 @@ def read_hdf(
DataFrame.to_hdf : Write a HDF file from a DataFrame.
HDFStore : Low-level access to HDF files.
+ Notes
+ -----
+ When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
+ and PyArrow is installed, strings containing UTF-16 surrogates (which cannot
+ be encoded to UTF-8 for PyArrow-backed storage) are returned with the dtype
+ ``pd.StringDtype(storage="python", na_value=np.nan)``.
+
Examples
--------
>>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"]) # doctest: +SKIP
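The Notes entry above describes the fallback wired in below: when PyArrow-backed string inference would choke on lone surrogates, readers fall back to NumPy-object-backed strings. A hedged round-trip sketch, assuming PyTables and PyArrow are installed and a pandas build containing this patch:

```python
import numpy as np
import pandas as pd

pd.set_option("future.infer_string", True)

data = ["\ud800foo"]  # lone UTF-16 surrogate; not encodable to UTF-8
ser = pd.Series(data, index=pd.Index(data, dtype="object"), dtype="object")
ser.to_hdf("store.h5", key="table", format="table", errors="surrogatepass")

result = pd.read_hdf("store.h5", "table", errors="surrogatepass")
# PyArrow-backed strings cannot hold surrogates, so the reader falls
# back to the NumPy-object-backed string dtype:
assert result.dtype == pd.StringDtype(storage="python", na_value=np.nan)
```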
@@ -1760,7 +1768,7 @@ def info(self) -> str:
if self.is_open:
lkeys = sorted(self.keys())
- if len(lkeys):
+ if lkeys:
keys = []
values = []
@@ -2257,6 +2265,20 @@ def convert(
# making an Index instance could throw a number of different errors
try:
new_pd_index = factory(values, **kwargs)
+ except UnicodeEncodeError as err:
+ if (
+ errors == "surrogatepass"
+ and get_option("future.infer_string")
+ and str(err).endswith("surrogates not allowed")
+ and HAS_PYARROW
+ ):
+ new_pd_index = factory(
+ values,
+ dtype=StringDtype(storage="python", na_value=np.nan),
+ **kwargs,
+ )
+ else:
+ raise
except ValueError:
# if the output freq is different than what we recorded,
# it should be None (see also 'doc example part 2')
@@ -3170,12 +3192,29 @@ def read_index_node(
**kwargs,
)
else:
- index = factory(
- _unconvert_index(
- data, kind, encoding=self.encoding, errors=self.errors
- ),
- **kwargs,
- )
+ try:
+ index = factory(
+ _unconvert_index(
+ data, kind, encoding=self.encoding, errors=self.errors
+ ),
+ **kwargs,
+ )
+ except UnicodeEncodeError as err:
+ if (
+ self.errors == "surrogatepass"
+ and get_option("future.infer_string")
+ and str(err).endswith("surrogates not allowed")
+ and HAS_PYARROW
+ ):
+ index = factory(
+ _unconvert_index(
+ data, kind, encoding=self.encoding, errors=self.errors
+ ),
+ dtype=StringDtype(storage="python", na_value=np.nan),
+ **kwargs,
+ )
+ else:
+ raise
index.name = name
@@ -3311,13 +3350,24 @@ def read(
self.validate_read(columns, where)
index = self.read_index("index", start=start, stop=stop)
values = self.read_array("values", start=start, stop=stop)
- result = Series(values, index=index, name=self.name, copy=False)
- if (
- using_string_dtype()
- and isinstance(values, np.ndarray)
- and is_string_array(values, skipna=True)
- ):
- result = result.astype(StringDtype(na_value=np.nan))
+ try:
+ result = Series(values, index=index, name=self.name, copy=False)
+ except UnicodeEncodeError as err:
+ if (
+ self.errors == "surrogatepass"
+ and get_option("future.infer_string")
+ and str(err).endswith("surrogates not allowed")
+ and HAS_PYARROW
+ ):
+ result = Series(
+ values,
+ index=index,
+ name=self.name,
+ copy=False,
+ dtype=StringDtype(storage="python", na_value=np.nan),
+ )
+ else:
+ raise
return result
def write(self, obj, **kwargs) -> None:
@@ -4540,7 +4590,7 @@ def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
masks.append(mask.astype("u1", copy=False))
# consolidate masks
- if len(masks):
+ if masks:
mask = masks[0]
for m in masks[1:]:
mask = mask & m
@@ -4660,7 +4710,7 @@ def delete(
groups = list(diff[diff > 1].index)
# 1 group
- if not len(groups):
+ if not groups:
groups = [0]
# final element
@@ -4764,7 +4814,24 @@ def read(
values = values.reshape((1, values.shape[0]))
if isinstance(values, (np.ndarray, DatetimeArray)):
- df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
+ try:
+ df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
+ except UnicodeEncodeError as err:
+ if (
+ self.errors == "surrogatepass"
+ and get_option("future.infer_string")
+ and str(err).endswith("surrogates not allowed")
+ and HAS_PYARROW
+ ):
+ df = DataFrame(
+ values.T,
+ columns=cols_,
+ index=index_,
+ copy=False,
+ dtype=StringDtype(storage="python", na_value=np.nan),
+ )
+ else:
+ raise
elif isinstance(values, Index):
df = DataFrame(values, columns=cols_, index=index_)
else:
@@ -4774,23 +4841,10 @@ def read(
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
# If str / string dtype is stored in meta, use that.
- converted = False
for column in cols_:
dtype = getattr(self.table.attrs, f"{column}_meta", None)
if dtype in ["str", "string"]:
df[column] = df[column].astype(dtype)
- converted = True
- # Otherwise try inference.
- if (
- not converted
- and using_string_dtype()
- and isinstance(values, np.ndarray)
- and is_string_array(
- values,
- skipna=True,
- )
- ):
- df = df.astype(StringDtype(na_value=np.nan))
frames.append(df)
if len(frames) == 1:
@@ -5224,7 +5278,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
# encode if needed
if len(data):
data = (
- Series(data.ravel(), copy=False)
+ Series(data.ravel(), copy=False, dtype="object")
.str.encode(encoding, errors)
._values.reshape(data.shape)
)
@@ -5264,7 +5318,9 @@ def _unconvert_string_array(
dtype = f"U{itemsize}"
if isinstance(data[0], bytes):
- ser = Series(data, copy=False).str.decode(encoding, errors=errors)
+ ser = Series(data, copy=False).str.decode(
+ encoding, errors=errors, dtype="object"
+ )
data = ser.to_numpy()
data.flags.writeable = True
else:
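Pinning `dtype="object"` in `_convert_string_array`/`_unconvert_string_array` keeps the encode/decode round trip out of string-dtype inference, so surrogate-carrying values survive as plain Python objects. A small illustration, assuming a pandas version where `Series.str.decode` accepts the `dtype` keyword used in the hunk above:

```python
import pandas as pd

raw = pd.Series([b"foo", b"bar"])

# Without the pin, the decoded result may be inferred as a string
# dtype under future.infer_string; object keeps the legacy behavior.
decoded = raw.str.decode("utf-8", dtype="object")
assert decoded.dtype == object
```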
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 0e0f07c0f8ff3..7376843f7e8ff 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -1901,7 +1901,7 @@ def prep_table(
# Type[str], Type[float], Type[int], Type[complex], Type[bool],
# Type[object]]]]"; expected type "Union[ExtensionDtype, str,
# dtype[Any], Type[object]]"
- dtype = {col_name: dtype for col_name in frame} # type: ignore[misc]
+ dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type]
else:
dtype = cast(dict, dtype)
@@ -2615,7 +2615,7 @@ def _create_table_setup(self):
]
ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index]
- if len(ix_cols):
+ if ix_cols:
cnames = "_".join(ix_cols)
cnames_br = ",".join([escape(c) for c in ix_cols])
create_stmts.append(
@@ -2859,7 +2859,7 @@ def to_sql(
# Type[str], Type[float], Type[int], Type[complex], Type[bool],
# Type[object]]]]"; expected type "Union[ExtensionDtype, str,
# dtype[Any], Type[object]]"
- dtype = {col_name: dtype for col_name in frame} # type: ignore[misc]
+ dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type]
else:
dtype = cast(dict, dtype)
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index 1035150302d2c..24aa848de1b4c 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -64,10 +64,9 @@
from pandas.plotting._matplotlib.misc import unpack_single_str_list
from pandas.plotting._matplotlib.style import get_standard_colors
from pandas.plotting._matplotlib.timeseries import (
- decorate_axes,
format_dateaxis,
maybe_convert_index,
- maybe_resample,
+ prepare_ts_data,
use_dynamic_x,
)
from pandas.plotting._matplotlib.tools import (
@@ -288,6 +287,21 @@ def __init__(
self.data = self._ensure_frame(self.data)
+ from pandas.plotting import plot_params
+
+ self.x_compat = plot_params["x_compat"]
+ if "x_compat" in self.kwds:
+ self.x_compat = bool(self.kwds.pop("x_compat"))
+
+ @final
+ def _is_ts_plot(self) -> bool:
+ # this is slightly deceptive
+ return not self.x_compat and self.use_index and self._use_dynamic_x()
+
+ @final
+ def _use_dynamic_x(self) -> bool:
+ return use_dynamic_x(self._get_ax(0), self.data.index)
+
@final
@staticmethod
def _validate_sharex(sharex: bool | None, ax, by) -> bool:
@@ -1324,10 +1338,20 @@ def __init__(
c = self.data.columns[c]
self.c = c
+ @register_pandas_matplotlib_converters
def _make_plot(self, fig: Figure) -> None:
x, y, c, data = self.x, self.y, self.c, self.data
ax = self.axes[0]
+ from pandas import Series
+
+ x_data = data[x]
+ s = Series(index=x_data)
+ if use_dynamic_x(ax, s.index):
+ s = maybe_convert_index(ax, s)
+ freq, s = prepare_ts_data(s, ax, self.kwds)
+ x_data = s.index
+
c_is_column = is_hashable(c) and c in self.data.columns
color_by_categorical = c_is_column and isinstance(
@@ -1344,7 +1368,7 @@ def _make_plot(self, fig: Figure) -> None:
else:
label = None
- # if a list of non color strings is passed in as c, color points
+ # if a list of non-color strings is passed in as c, color points
# by uniqueness of the strings, such that same strings get the same color
create_colors = not self._are_valid_colors(c_values)
if create_colors:
@@ -1360,7 +1384,7 @@ def _make_plot(self, fig: Figure) -> None:
)
scatter = ax.scatter(
- data[x].values,
+ x_data.values,
data[y].values,
c=c_values,
label=label,
@@ -1520,23 +1544,9 @@ def _kind(self) -> Literal["line", "area", "hist", "kde", "box"]:
return "line"
def __init__(self, data, **kwargs) -> None:
- from pandas.plotting import plot_params
-
MPLPlot.__init__(self, data, **kwargs)
if self.stacked:
self.data = self.data.fillna(value=0)
- self.x_compat = plot_params["x_compat"]
- if "x_compat" in self.kwds:
- self.x_compat = bool(self.kwds.pop("x_compat"))
-
- @final
- def _is_ts_plot(self) -> bool:
- # this is slightly deceptive
- return not self.x_compat and self.use_index and self._use_dynamic_x()
-
- @final
- def _use_dynamic_x(self) -> bool:
- return use_dynamic_x(self._get_ax(0), self.data)
def _make_plot(self, fig: Figure) -> None:
if self._is_ts_plot():
@@ -1626,15 +1636,8 @@ def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds):
# accept x to be consistent with normal plot func,
# x is not passed to tsplot as it uses data.index as x coordinate
# column_num must be in kwds for stacking purpose
- freq, data = maybe_resample(data, ax, kwds)
+ freq, data = prepare_ts_data(data, ax, kwds)
- # Set ax with freq info
- decorate_axes(ax, freq)
- # digging deeper
- if hasattr(ax, "left_ax"):
- decorate_axes(ax.left_ax, freq)
- if hasattr(ax, "right_ax"):
- decorate_axes(ax.right_ax, freq)
# TODO #54485
ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined]
diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py
index 962f9711d9916..6e343b176b5eb 100644
--- a/pandas/plotting/_matplotlib/style.py
+++ b/pandas/plotting/_matplotlib/style.py
@@ -273,7 +273,7 @@ def _random_color(column: int) -> list[float]:
"""Get a random color represented as a list of length 3"""
# GH17525 use common._random_state to avoid resetting the seed
rs = com.random_state(column)
- return rs.rand(3).tolist()
+ return rs.rand(3).tolist() # type: ignore[return-value]
def _is_single_string_color(color: Color) -> bool:
diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py
index d95ccad2da565..beaf5b6259ef3 100644
--- a/pandas/plotting/_matplotlib/timeseries.py
+++ b/pandas/plotting/_matplotlib/timeseries.py
@@ -48,7 +48,6 @@
from pandas._typing import NDFrameT
from pandas import (
- DataFrame,
DatetimeIndex,
Index,
PeriodIndex,
@@ -231,8 +230,8 @@ def _get_freq(ax: Axes, series: Series):
return freq, ax_freq
-def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool:
- freq = _get_index_freq(data.index)
+def use_dynamic_x(ax: Axes, index: Index) -> bool:
+ freq = _get_index_freq(index)
ax_freq = _get_ax_freq(ax)
if freq is None: # convert irregular if axes has freq info
@@ -250,16 +249,15 @@ def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool:
return False
# FIXME: hack this for 0.10.1, creating more technical debt...sigh
- if isinstance(data.index, ABCDatetimeIndex):
+ if isinstance(index, ABCDatetimeIndex):
# error: "BaseOffset" has no attribute "_period_dtype_code"
freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str)
base = to_offset(freq_str, is_period=True)._period_dtype_code # type: ignore[attr-defined]
- x = data.index
if base <= FreqGroup.FR_DAY.value:
- return x[:1].is_normalized
- period = Period(x[0], freq_str)
+ return index[:1].is_normalized
+ period = Period(index[0], freq_str)
assert isinstance(period, Period)
- return period.to_timestamp().tz_localize(x.tz) == x[0]
+ return period.to_timestamp().tz_localize(index.tz) == index[0]
return True
@@ -366,3 +364,19 @@ def format_dateaxis(
raise TypeError("index type not supported")
plt.draw_if_interactive()
+
+
+def prepare_ts_data(
+ series: Series, ax: Axes, kwargs: dict[str, Any]
+) -> tuple[BaseOffset | str, Series]:
+ freq, data = maybe_resample(series, ax, kwargs)
+
+ # Set ax with freq info
+ decorate_axes(ax, freq)
+ # digging deeper
+ if hasattr(ax, "left_ax"):
+ decorate_axes(ax.left_ax, freq)
+ if hasattr(ax, "right_ax"):
+ decorate_axes(ax.right_ax, freq)
+
+ return freq, data
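`prepare_ts_data` folds the resample-then-decorate sequence previously inlined in `_ts_plot` into one helper, which lets `ScatterPlot` reuse it. The user-visible payoff is tick alignment between scatter and line plots of the same datetime data, mirroring the new test further down; a sketch assuming matplotlib and a build with this patch:

```python
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame(
    [(datetime(2025, 1, 1, hour=n), n) for n in range(3)],
    columns=["datetime", "y"],
)

# Scatter now runs through the same ts-preparation path as line
# plots, so both axes get matching datetime tick endpoints.
fig, axes = plt.subplots(2, sharex=True)
df.plot.scatter(x="datetime", y="y", ax=axes[0])
df.plot(x="datetime", y="y", ax=axes[1])
assert axes[0].get_xticks()[0] == axes[1].get_xticks()[0]
assert axes[0].get_xticks()[-1] == axes[1].get_xticks()[-1]
```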
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 2d47cd851ad10..dde1158dc7951 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -334,7 +334,7 @@ def test_apply_broadcast_scalars(float_frame):
def test_apply_broadcast_scalars_axis1(float_frame):
result = float_frame.apply(np.mean, axis=1, result_type="broadcast")
m = float_frame.mean(axis=1)
- expected = DataFrame({c: m for c in float_frame.columns})
+ expected = DataFrame(dict.fromkeys(float_frame.columns, m))
tm.assert_frame_equal(result, expected)
@@ -361,7 +361,7 @@ def test_apply_broadcast_lists_index(float_frame):
)
m = list(range(len(float_frame.index)))
expected = DataFrame(
- {c: m for c in float_frame.columns},
+ dict.fromkeys(float_frame.columns, m),
dtype="float64",
index=float_frame.index,
)
diff --git a/pandas/tests/dtypes/cast/test_maybe_box_native.py b/pandas/tests/dtypes/cast/test_maybe_box_native.py
index 3f62f31dac219..151586962d517 100644
--- a/pandas/tests/dtypes/cast/test_maybe_box_native.py
+++ b/pandas/tests/dtypes/cast/test_maybe_box_native.py
@@ -17,7 +17,7 @@
"obj,expected_dtype",
[
(b"\x00\x10", bytes),
- (int(4), int),
+ (4, int),
(np.uint(4), int),
(np.int32(-4), int),
(np.uint8(4), int),
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index c61cda83cf6e0..a5b22ac30d820 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -769,8 +769,8 @@ def test_empty_like(self):
np.datetime64("NaT"),
np.timedelta64("NaT"),
]
- + [np.datetime64("NaT", unit) for unit in m8_units]
- + [np.timedelta64("NaT", unit) for unit in m8_units]
+ + [np.datetime64("NaT", unit) for unit in m8_units] # type: ignore[call-overload]
+ + [np.timedelta64("NaT", unit) for unit in m8_units] # type: ignore[call-overload]
)
inf_vals = [
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
index 2915c0585f373..a760cbc3995b3 100644
--- a/pandas/tests/extension/base/reshaping.py
+++ b/pandas/tests/extension/base/reshaping.py
@@ -3,6 +3,8 @@
import numpy as np
import pytest
+from pandas.core.dtypes.dtypes import NumpyEADtype
+
import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import ExtensionArray
@@ -266,7 +268,13 @@ def test_stack(self, data, columns, future_stack):
expected = expected.astype(object)
if isinstance(expected, pd.Series):
- assert result.dtype == df.iloc[:, 0].dtype
+ if future_stack and isinstance(data.dtype, NumpyEADtype):
+ # GH#58817 future_stack=True constructs the result with the input's
+ # dtype; we thus get the underlying NumPy dtype as the result
+ # instead of the NumpyExtensionArray
+ assert result.dtype == df.iloc[:, 0].to_numpy().dtype
+ else:
+ assert result.dtype == df.iloc[:, 0].dtype
else:
assert all(result.dtypes == df.iloc[:, 0].dtype)
diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py
index 67d1d45af1cb3..8915d6f205d65 100644
--- a/pandas/tests/frame/methods/test_fillna.py
+++ b/pandas/tests/frame/methods/test_fillna.py
@@ -1,6 +1,8 @@
import numpy as np
import pytest
+from pandas.errors import OutOfBoundsDatetime
+
from pandas import (
Categorical,
DataFrame,
@@ -781,3 +783,15 @@ def test_fillna_with_none_object(test_frame, dtype):
if test_frame:
expected = expected.to_frame()
tm.assert_equal(result, expected)
+
+
+def test_fillna_out_of_bounds_datetime():
+ # GH#61208
+ df = DataFrame(
+ {"datetime": date_range("1/1/2011", periods=3, freq="h"), "value": [1, 2, 3]}
+ )
+ df.iloc[0, 0] = None
+
+ msg = "Cannot cast 0001-01-01 00:00:00 to unit='ns' without overflow"
+ with pytest.raises(OutOfBoundsDatetime, match=msg):
+ df.fillna(Timestamp("0001-01-01"))
diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py
index c6e5304ae3cb4..08b7128e6ec11 100644
--- a/pandas/tests/frame/methods/test_nlargest.py
+++ b/pandas/tests/frame/methods/test_nlargest.py
@@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request):
index=[0, 0, 1, 1, 1],
)
result = df.nsmallest(n, order)
- expected = df.sort_values(order).head(n)
+ expected = df.sort_values(order, kind="stable").head(n)
tm.assert_frame_equal(result, expected)
result = df.nlargest(n, order)
- expected = df.sort_values(order, ascending=False).head(n)
+ expected = df.sort_values(order, ascending=False, kind="stable").head(n)
if Version(np.__version__) >= Version("1.25") and (
(order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5)
):
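`nlargest`/`nsmallest` resolve ties by keeping earlier rows first (`keep="first"`), which is exactly what a stable sort preserves; pinning `kind="stable"` makes the expected frame deterministic when the sort key has duplicates. For instance:

```python
import pandas as pd

df = pd.DataFrame({"a": [2, 2, 1], "b": [1, 2, 3]}, index=[10, 11, 12])

# A stable descending sort keeps original order among the tied
# a == 2 rows, matching how nlargest breaks ties.
expected = df.sort_values("a", ascending=False, kind="stable").head(2)
result = df.nlargest(2, "a")
pd.testing.assert_frame_equal(result, expected)
```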
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index 2f6998a85c80b..3be69617cad43 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -726,15 +726,16 @@ def test_iloc_setitem_with_scalar_index(self, indexer, value):
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_iloc_mask(self):
- # GH 3631, iloc with a mask (of a series) should raise
+ # GH 60994, iloc with a mask (of a series) should select rows where the
+ # mask aligns and raise otherwise
df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"])
mask = df.a % 2 == 0
msg = "iLocation based boolean indexing cannot use an indexable as a mask"
with pytest.raises(ValueError, match=msg):
df.iloc[mask]
+
mask.index = range(len(mask))
- msg = "iLocation based boolean indexing on an integer type is not available"
- with pytest.raises(NotImplementedError, match=msg):
+ msg = "Unalignable boolean Series provided as indexer"
+ with pytest.raises(IndexingError, match=msg):
df.iloc[mask]
# ndarray ok
@@ -753,18 +754,13 @@ def test_iloc_mask(self):
(None, ".iloc"): "0b1100",
("index", ""): "0b11",
("index", ".loc"): "0b11",
- ("index", ".iloc"): (
- "iLocation based boolean indexing cannot use an indexable as a mask"
- ),
- ("locs", ""): "Unalignable boolean Series provided as indexer "
- "(index of the boolean Series and of the indexed "
- "object do not match).",
- ("locs", ".loc"): "Unalignable boolean Series provided as indexer "
- "(index of the boolean Series and of the "
- "indexed object do not match).",
- ("locs", ".iloc"): (
- "iLocation based boolean indexing on an integer type is not available"
- ),
+ (
+ "index",
+ ".iloc",
+ ): "iLocation based boolean indexing cannot use an indexable as a mask",
+ ("locs", ""): "Unalignable boolean Series provided as indexer",
+ ("locs", ".loc"): "Unalignable boolean Series provided as indexer",
+ ("locs", ".iloc"): "Unalignable boolean Series provided as indexer",
}
# UserWarnings from reindex of a boolean mask
@@ -780,18 +776,52 @@ def test_iloc_mask(self):
else:
accessor = df
answer = str(bin(accessor[mask]["nums"].sum()))
- except (ValueError, IndexingError, NotImplementedError) as err:
+ except (ValueError, IndexingError) as err:
answer = str(err)
key = (
idx,
method,
)
- r = expected.get(key)
- if r != answer:
- raise AssertionError(
- f"[{key}] does not match [{answer}], received [{r}]"
+ expected_result = expected.get(key)
+
+ # Exact match for successful results; substring match for error messages
+ if (
+ idx is None or (idx == "index" and method != ".iloc")
+ ) and "0b" in expected_result:
+ # For successful numeric results, exact match is needed
+ assert expected_result == answer, (
+ f"[{key}] does not match [{answer}]"
)
+ else:
+ # For error messages, substring match is sufficient
+ assert expected_result in answer, f"[{key}] not found in [{answer}]"
+
+ def test_iloc_with_numpy_bool_array(self):
+ df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"])
+ result = df.iloc[np.array([True, False, True, False, True], dtype=bool)]
+ expected = DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"])
+ tm.assert_frame_equal(result, expected)
+
+ def test_iloc_series_mask_with_index_mismatch_raises(self):
+ df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"])
+ msg = "Unalignable boolean Series provided as indexer"
+ with pytest.raises(IndexingError, match=msg):
+ df.iloc[Series([True] * len(df), dtype=bool)]
+
+ def test_iloc_series_mask_all_true(self):
+ df = DataFrame(list(range(5)), columns=["a"])
+ mask = Series([True] * len(df), dtype=bool)
+ result = df.iloc[mask]
+ tm.assert_frame_equal(result, df)
+
+ def test_iloc_series_mask_alternate_true(self):
+ df = DataFrame(list(range(5)), columns=["a"])
+ mask = Series([True, False, True, False, True], dtype=bool)
+ result = df.iloc[mask]
+ expected = DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4])
+ tm.assert_frame_equal(result, expected)
def test_iloc_non_unique_indexing(self):
# GH 4017, non-unique indexing (on the axis)
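With this change, a boolean Series handed to `.iloc` is aligned like any other Series indexer: an unalignable index now raises `IndexingError` rather than the old blanket `NotImplementedError`, and a plain NumPy array remains the unambiguous way to mask positionally. A sketch of the post-patch behavior the tests above pin down:

```python
import pandas as pd
from pandas.errors import IndexingError

df = pd.DataFrame({"a": range(5)}, index=list("ABCDE"))
mask = df["a"] % 2 == 0

# Re-labelling the mask leaves it unalignable with df's index.
mask.index = range(len(mask))
try:
    df.iloc[mask]
except IndexingError:
    pass  # "Unalignable boolean Series provided as indexer"

# Converting to an ndarray sidesteps alignment entirely.
print(df.iloc[mask.to_numpy()])  # rows A, C, E
```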
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 3a68d38cc0bde..213fa2c01cef4 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -546,7 +546,7 @@ def test_na_values_dict_null_column_name(all_parsers):
parser = all_parsers
data = ",x,y\n\nMA,1,2\nNA,2,1\nOA,,3"
names = [None, "x", "y"]
- na_values = {name: STR_NA_VALUES for name in names}
+ na_values = dict.fromkeys(names, STR_NA_VALUES)
dtype = {None: "object", "x": "float64", "y": "float64"}
if parser.engine == "pyarrow":
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index bb2058c050f2a..b3ab6b48508e1 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -7,8 +7,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.compat import PY312
import pandas as pd
@@ -25,7 +23,6 @@
timedelta_range,
)
import pandas._testing as tm
-from pandas.conftest import has_pyarrow
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
@@ -385,20 +382,24 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
-@pytest.mark.xfail(
- using_string_dtype() and has_pyarrow,
- reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
-)
@pytest.mark.parametrize("format", ["fixed", "table"])
-def test_to_hdf_errors(tmp_path, format, setup_path):
+def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string):
data = ["\ud800foo"]
- ser = Series(data, index=Index(data))
+ ser = Series(data, index=Index(data, dtype="object"), dtype="object")
path = tmp_path / setup_path
# GH 20835
ser.to_hdf(path, key="table", format=format, errors="surrogatepass")
result = read_hdf(path, "table", errors="surrogatepass")
- tm.assert_series_equal(result, ser)
+
+ if using_infer_string:
+ # https://github.com/pandas-dev/pandas/pull/60993
+ # Surrogates fall back to python storage.
+ dtype = pd.StringDtype(storage="python", na_value=np.nan)
+ else:
+ dtype = "object"
+ expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype)
+ tm.assert_series_equal(result, expected)
def test_create_table_index(setup_path):
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
index d18f098267599..3f274a336ad44 100644
--- a/pandas/tests/plotting/frame/test_frame.py
+++ b/pandas/tests/plotting/frame/test_frame.py
@@ -840,14 +840,26 @@ def test_plot_scatter_shape(self):
axes = df.plot(x="x", y="y", kind="scatter", subplots=True)
_check_axes_shape(axes, axes_num=1, layout=(1, 1))
- def test_raise_error_on_datetime_time_data(self):
- # GH 8113, datetime.time type is not supported by matplotlib in scatter
+ def test_scatter_on_datetime_time_data(self):
+ # datetime.time type is now supported in scatter, since a converter
+ # is implemented in ScatterPlot
df = DataFrame(np.random.default_rng(2).standard_normal(10), columns=["a"])
df["dtime"] = date_range(start="2014-01-01", freq="h", periods=10).time
- msg = "must be a string or a (real )?number, not 'datetime.time'"
+ df.plot(kind="scatter", x="dtime", y="a")
- with pytest.raises(TypeError, match=msg):
- df.plot(kind="scatter", x="dtime", y="a")
+ def test_scatter_line_xticks(self):
+ # GH#61005
+ df = DataFrame(
+ [(datetime(year=2025, month=1, day=1, hour=n), n) for n in range(3)],
+ columns=["datetime", "y"],
+ )
+ fig, ax = plt.subplots(2, sharex=True)
+ df.plot.scatter(x="datetime", y="y", ax=ax[0])
+ scatter_xticks = ax[0].get_xticks()
+ df.plot(x="datetime", y="y", ax=ax[1])
+ line_xticks = ax[1].get_xticks()
+ assert scatter_xticks[0] == line_xticks[0]
+ assert scatter_xticks[-1] == line_xticks[-1]
@pytest.mark.parametrize("x, y", [("dates", "vals"), (0, 1)])
def test_scatterplot_datetime_data(self, x, y):
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index 3a7fd548ca961..f871c0bf0218c 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -2155,6 +2155,16 @@ def test_arrow_timestamp_resample(tz):
tm.assert_series_equal(result, expected)
+@td.skip_if_no("pyarrow")
+def test_arrow_timestamp_resample_keep_index_name():
+ # https://github.com/pandas-dev/pandas/issues/61222
+ idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]")
+ expected = Series(np.arange(5, dtype=np.float64), index=idx)
+ expected.index.name = "index_name"
+ result = expected.resample("1D").mean()
+ tm.assert_series_equal(result, expected)
+
+
@pytest.mark.parametrize("freq", ["1A", "2A-MAR"])
def test_resample_A_raises(freq):
msg = f"Invalid frequency: {freq[1:]}"
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 46eee13755b2d..614200ae5b7c2 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -15,6 +15,7 @@
import pandas as pd
from pandas import (
+ ArrowDtype,
Categorical,
DataFrame,
Grouper,
@@ -2851,3 +2852,31 @@ def test_pivot_margins_with_none_index(self):
),
)
tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+ def test_pivot_with_pyarrow_categorical(self):
+ # GH#53051
+ pa = pytest.importorskip("pyarrow")
+
+ df = DataFrame(
+ {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}
+ ).astype(
+ {
+ "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())),
+ "number_column": "float[pyarrow]",
+ }
+ )
+
+ df = df.pivot(columns=["string_column"], values=["number_column"])
+
+ multi_index = MultiIndex.from_arrays(
+ [["number_column", "number_column", "number_column"], ["A", "B", "C"]],
+ names=(None, "string_column"),
+ )
+ df_expected = DataFrame(
+ [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]],
+ columns=multi_index,
+ )
+ tm.assert_frame_equal(
+ df, df_expected, check_dtype=False, check_column_type=False
+ )
diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py
index ce8ea27ea1fa2..f017ccd963972 100644
--- a/pandas/tests/series/accessors/test_cat_accessor.py
+++ b/pandas/tests/series/accessors/test_cat_accessor.py
@@ -40,7 +40,7 @@ def test_getname_categorical_accessor(self, method):
def test_cat_accessor(self):
ser = Series(Categorical(["a", "b", np.nan, "a"]))
tm.assert_index_equal(ser.cat.categories, Index(["a", "b"]))
- assert not ser.cat.ordered, False
+ assert not ser.cat.ordered
exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"])
diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py
index 84b60a2afe6eb..384b7ce3dc985 100644
--- a/pandas/tests/series/methods/test_map.py
+++ b/pandas/tests/series/methods/test_map.py
@@ -604,3 +604,27 @@ def test_map_kwargs():
result = Series([2, 4, 5]).map(lambda x, y: x + y, y=2)
expected = Series([4, 6, 7])
tm.assert_series_equal(result, expected)
+
+
+def test_map_arg_as_kwarg():
+ with tm.assert_produces_warning(
+ FutureWarning, match="`arg` has been renamed to `func`"
+ ):
+ Series([1, 2]).map(arg={})
+
+
+def test_map_func_and_arg():
+ # `arg` is considered a normal kwarg that should be passed to the function
+ result = Series([1, 2]).map(lambda _, arg: arg, arg=3)
+ expected = Series([3, 3])
+ tm.assert_series_equal(result, expected)
+
+
+def test_map_no_func_or_arg():
+ with pytest.raises(ValueError, match="The `func` parameter is required"):
+ Series([1, 2]).map()
+
+
+def test_map_func_is_none():
+ with pytest.raises(ValueError, match="The `func` parameter is required"):
+ Series([1, 2]).map(func=None)
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 5f4a100e7ccc7..f82451a2be84d 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -632,7 +632,7 @@ def test_constructor_maskedarray_hardened(self):
def test_series_ctor_plus_datetimeindex(self):
rng = date_range("20090415", "20090519", freq="B")
- data = {k: 1 for k in rng}
+ data = dict.fromkeys(rng, 1)
result = Series(data, index=rng)
assert result.index.is_(rng)
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index 76fad35304fe6..6282aecdfe977 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -4,6 +4,7 @@
import array
from functools import partial
+import importlib
import subprocess
import sys
@@ -186,41 +187,21 @@ def test_yaml_dump(df):
tm.assert_frame_equal(df, loaded2)
-@pytest.mark.single_cpu
-def test_missing_required_dependency():
- # GH 23868
- # To ensure proper isolation, we pass these flags
- # -S : disable site-packages
- # -s : disable user site-packages
- # -E : disable PYTHON* env vars, especially PYTHONPATH
- # https://github.com/MacPython/pandas-wheels/pull/50
-
- pyexe = sys.executable.replace("\\", "/")
-
- # We skip this test if pandas is installed as a site package. We first
- # import the package normally and check the path to the module before
- # executing the test which imports pandas with site packages disabled.
- call = [pyexe, "-c", "import pandas;print(pandas.__file__)"]
- output = subprocess.check_output(call).decode()
- if "site-packages" in output:
- pytest.skip("pandas installed as site package")
-
- # This test will fail if pandas is installed as a site package. The flags
- # prevent pandas being imported and the test will report Failed: DID NOT
- # RAISE
- call = [pyexe, "-sSE", "-c", "import pandas"]
-
- msg = (
- rf"Command '\['{pyexe}', '-sSE', '-c', 'import pandas'\]' "
- "returned non-zero exit status 1."
- )
+@pytest.mark.parametrize("dependency", ["numpy", "dateutil"])
+def test_missing_required_dependency(monkeypatch, dependency):
+ # GH#61030
+ original_import = __import__
+ mock_error = ImportError(f"Mock error for {dependency}")
+
+ def mock_import(name, *args, **kwargs):
+ if name == dependency:
+ raise mock_error
+ return original_import(name, *args, **kwargs)
- with pytest.raises(subprocess.CalledProcessError, match=msg) as exc:
- subprocess.check_output(call, stderr=subprocess.STDOUT)
+ monkeypatch.setattr("builtins.__import__", mock_import)
- output = exc.value.stdout.decode()
- for name in ["numpy", "dateutil"]:
- assert name in output
+ with pytest.raises(ImportError, match=dependency):
+ importlib.reload(importlib.import_module("pandas"))
def test_frame_setitem_dask_array_into_new_col(request):
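The rewritten test trades subprocess isolation for an in-process simulation: monkeypatching `builtins.__import__` makes a dependency appear uninstalled, and reloading pandas must then surface the `ImportError`. The same technique in a standalone sketch (the failure message here is illustrative):

```python
import builtins
import importlib

import pytest


def test_reload_fails_without_numpy(monkeypatch):
    original_import = builtins.__import__

    def mock_import(name, *args, **kwargs):
        # Pretend numpy is not installed.
        if name == "numpy":
            raise ImportError("Mock error for numpy")
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr("builtins.__import__", mock_import)

    # Re-running pandas' import machinery must propagate the error.
    with pytest.raises(ImportError, match="numpy"):
        importlib.reload(importlib.import_module("pandas"))
```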
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index a23e6d9b3973a..ff7ab22c197d8 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -5,6 +5,7 @@
import pandas as pd
from pandas import (
+ ArrowDtype,
DataFrame,
MultiIndex,
Series,
@@ -318,6 +319,34 @@ def test_multiindex_dt_with_nan(self):
expected = Series(["a", "b", "c", "d"], name=("sub", np.nan))
tm.assert_series_equal(result, expected)
+ @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+ def test_multiindex_with_pyarrow_categorical(self):
+ # GH#53051
+ pa = pytest.importorskip("pyarrow")
+
+ df = DataFrame(
+ {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}
+ ).astype(
+ {
+ "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())),
+ "number_column": "float[pyarrow]",
+ }
+ )
+
+ df = df.set_index(["string_column", "number_column"])
+
+ df_expected = DataFrame(
+ index=MultiIndex.from_arrays(
+ [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"]
+ )
+ )
+ tm.assert_frame_equal(
+ df,
+ df_expected,
+ check_index_type=False,
+ check_column_type=False,
+ )
+
class TestSorted:
"""everything you wanted to test about sorting"""
diff --git a/pyproject.toml b/pyproject.toml
index b7d53b0d8934a..7db85f0037d33 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,9 +72,9 @@ hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/i
#'blosc>=1.20.1',
'tables>=3.8.0']
spss = ['pyreadstat>=1.2.0']
-postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0']
-mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.0.2']
-sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0']
+postgresql = ['SQLAlchemy>=1.4.36', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0']
+mysql = ['SQLAlchemy>=1.4.36', 'pymysql>=1.0.2']
+sql-other = ['SQLAlchemy>=1.4.36', 'adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0']
html = ['beautifulsoup4>=4.11.2', 'html5lib>=1.1', 'lxml>=4.9.2']
xml = ['lxml>=4.9.2']
plot = ['matplotlib>=3.6.3']
@@ -113,7 +113,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
'qtpy>=2.3.0',
'scipy>=1.10.0',
's3fs>=2022.11.0',
- 'SQLAlchemy>=2.0.0',
+ 'SQLAlchemy>=1.4.36',
'tables>=3.8.0',
'tabulate>=0.9.0',
'xarray>=2022.12.0',
@@ -148,7 +148,7 @@ setup = ['--vsenv'] # For Windows
[tool.cibuildwheel]
skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x"
-build-verbosity = "3"
+build-verbosity = 3
environment = {LDFLAGS="-Wl,--strip-all"}
test-requires = "hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0"
test-command = """
@@ -160,8 +160,8 @@ free-threaded-support = true
before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh"
[tool.cibuildwheel.windows]
+environment = {}
before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build_windows.sh"
-before-test = "bash {package}/scripts/cibw_before_test_windows.sh"
test-command = """
set PANDAS_CI='1' && \
python -c "import pandas as pd; \
@@ -234,8 +234,8 @@ select = [
"TID",
# implicit string concatenation
"ISC",
- # type-checking imports
- "TCH",
+ # flake8-type-checking
+ "TC",
# comprehensions
"C4",
# pygrep-hooks
@@ -390,6 +390,8 @@ ignore = [
"PLW0108",
# global-statement
"PLW0603",
+ # runtime-cast-value
+ "TC006",
]
exclude = [
@@ -429,7 +431,7 @@ exclude = [
"pandas/tests/*" = ["B028", "FLY"]
"scripts/*" = ["B028"]
# Keep this one enabled
-"pandas/_typing.py" = ["TCH"]
+"pandas/_typing.py" = ["TC"]
[tool.ruff.lint.flake8-pytest-style]
fixture-parentheses = false
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 20fc21be75a06..5607f2fe97fd9 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -43,7 +43,7 @@ pytz>=2023.4
pyxlsb>=1.0.10
s3fs>=2022.11.0
scipy>=1.10.0
-SQLAlchemy>=2.0.0
+SQLAlchemy>=1.4.36
tabulate>=0.9.0
xarray>=2022.12.0, <=2024.9.0
xlrd>=2.0.1
@@ -57,7 +57,7 @@ asv>=0.6.1
flake8==7.1.0
mypy==1.13.0
tokenize-rt
-pre-commit>=4.0.1
+pre-commit>=4.2.0
gitpython
gitdb
google-auth
diff --git a/scripts/cibw_before_build_windows.sh b/scripts/cibw_before_build_windows.sh
index f9e1e68d8efba..dbf1d95d911bf 100644
--- a/scripts/cibw_before_build_windows.sh
+++ b/scripts/cibw_before_build_windows.sh
@@ -5,10 +5,11 @@ for file in $PACKAGE_DIR/LICENSES/*; do
done
# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13
-# and a NumPy Windows wheel for the free-threaded build on PyPI.
FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")"
if [[ $FREE_THREADED_BUILD == "True" ]]; then
python -m pip install -U pip
- python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython
- python -m pip install ninja meson-python versioneer[toml]
+ # python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
+ # TODO: Remove below and uncomment above once https://github.com/cython/cython/pull/6717 no longer breaks tests
+ python -m pip install git+https://github.com/cython/cython.git@3276b588720a053c78488e5de788605950f4b136
+ python -m pip install ninja meson-python versioneer[toml] numpy
fi
diff --git a/scripts/cibw_before_test_windows.sh b/scripts/cibw_before_test_windows.sh
deleted file mode 100644
index 8878e3950452f..0000000000000
--- a/scripts/cibw_before_test_windows.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-# TODO: Delete when there's a NumPy Windows wheel for the free-threaded build on PyPI.
-FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")"
-if [[ $FREE_THREADED_BUILD == "True" ]]; then
- python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
-fi
diff --git a/setup.py b/setup.py
index 737ebd270d1e4..db1852b43cfa9 100755
--- a/setup.py
+++ b/setup.py
@@ -364,7 +364,7 @@ def run(self) -> None:
# enable coverage by building cython files by setting the environment variable
# "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext
# with `--with-cython-coverage`enabled
-linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False)
+linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False) # noqa: PLW1508
if "--with-cython-coverage" in sys.argv:
linetrace = True
sys.argv.remove("--with-cython-coverage")
diff --git a/web/pandas/community/benchmarks.md b/web/pandas/community/benchmarks.md
index 1e63832a5a2ba..5a8198a979d90 100644
--- a/web/pandas/community/benchmarks.md
+++ b/web/pandas/community/benchmarks.md
@@ -36,9 +36,8 @@ available at the [pandas sponsors]({{ base_url }}about/sponsors.html) page.
Results of the benchmarks are available at:
-- Original server: [asv](https://asv-runner.github.io/asv-collection/pandas/)
-- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/) (benchmarks results can
- also be visualized in this [Conbench PoC](http://57.128.112.95:5000/)
+- GitHub Actions results: [asv](https://pandas-dev.github.io/asv-runner/)
+- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/)
### Original server configuration
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index c6dddd5c2ef9f..3555d67c70620 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -469,7 +469,7 @@ read_record.data
df.dtypes
```
-ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/).
+ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/processing/#arcticdb.QueryBuilder).
### [Hugging Face](https://huggingface.co/datasets)
@@ -655,7 +655,7 @@ Pandas provides an interface for defining
The following libraries implement that interface to provide types not found in NumPy or pandas,
which work well with pandas' data containers.
-### [awkward-pandas](https://awkward-pandas.readthedocs.io/)
+### [awkward-pandas](https://github.com/scikit-hep/awkward)
Awkward-pandas provides an extension type for storing [Awkward
Arrays](https://awkward-array.org/) inside pandas' Series and
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index 679778330b68d..cb5447591dab6 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -146,11 +146,6 @@ sponsors:
url: https://numfocus.org/
logo: static/img/partners/numfocus.svg
kind: numfocus
- - name: "Coiled"
- url: https://www.coiled.io
- logo: static/img/partners/coiled.svg
- kind: partner
- description: "Patrick Hoefler"
- name: "Nvidia"
url: https://www.nvidia.com
logo: static/img/partners/nvidia.svg
@@ -192,5 +187,20 @@ sponsors:
- name: "d-fine GmbH"
url: https://www.d-fine.com/en/
kind: partner
+ - name: "Two Sigma"
+ url: https://www.twosigma.com/
+ kind: partner
+ - name: "Voltron Data"
+ url: https://voltrondata.com/
+ kind: partner
+ - name: "Intel"
+ url: https://www.intel.com/
+ kind: partner
+ - name: "Chan Zuckerberg Initiative"
+ url: https://chanzuckerberg.com/
+ kind: regular
+ - name: "Coiled"
+ url: https://www.coiled.io
+ kind: partner
roadmap:
pdeps_path: pdeps
diff --git a/web/pandas/index.html b/web/pandas/index.html
index bbd8632e06840..c520a16b8160f 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -96,6 +96,11 @@ Recommended books
+
+
+
+
+
diff --git a/web/pandas/static/img/books/pandas_cookbook_3.gif b/web/pandas/static/img/books/pandas_cookbook_3.gif
new file mode 100644
index 0000000000000..aa9d351d489e0
Binary files /dev/null and b/web/pandas/static/img/books/pandas_cookbook_3.gif differ
diff --git a/web/pandas/static/img/partners/coiled.svg b/web/pandas/static/img/partners/coiled.svg
deleted file mode 100644
index 2d76ce150084b..0000000000000
--- a/web/pandas/static/img/partners/coiled.svg
+++ /dev/null
@@ -1,234 +0,0 @@
-
-
-
-