diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 09bfda1755e03..5308c98e96937 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,4 @@
-minimum_pre_commit_version: 2.15.0
+minimum_pre_commit_version: 4.0.0
 exclude: ^LICENSES/|\.(html|csv|svg)$
 # reserve "manual" for relatively slow hooks which we still want to run in CI
 default_stages: [
@@ -19,13 +19,13 @@ ci:
     skip: [pyright, mypy]
 repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.9.9
+  rev: v0.11.4
   hooks:
   - id: ruff
     args: [--exit-non-zero-on-fix]
     exclude: ^pandas/tests/frame/test_query_eval.py
   - id: ruff
-    # TODO: remove autofixe-only rules when they are checked by ruff
+    # TODO: remove autofix-only rules when they are checked by ruff
     name: ruff-selected-autofixes
     alias: ruff-selected-autofixes
     files: ^pandas
@@ -34,7 +34,7 @@ repos:
   - id: ruff-format
     exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
 - repo: https://github.com/jendrikseipp/vulture
-  rev: 'v2.14'
+  rev: v2.14
   hooks:
   - id: vulture
     entry: python scripts/run_vulture.py
@@ -95,14 +95,14 @@ repos:
   - id: sphinx-lint
     args: ["--enable", "all", "--disable", "line-too-long"]
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v19.1.7
+  rev: v20.1.0
   hooks:
   - id: clang-format
     files: ^pandas/_libs/src|^pandas/_libs/include
     args: [-i]
     types_or: [c, c++]
 - repo: https://github.com/trim21/pre-commit-mirror-meson
-  rev: v1.7.0
+  rev: v1.7.2
   hooks:
   - id: meson-fmt
     args: ['--inplace']
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 6a2ab24df26fe..cd7851acae3f2 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -517,7 +517,7 @@ def setup(self):
         self.df = DataFrame(np.random.randn(1000, 100))
         self.s = Series(np.arange(1028.0))
-        self.df2 = DataFrame({i: self.s for i in range(1028)})
+        self.df2 = DataFrame(dict.fromkeys(range(1028), self.s))
         self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))

     def time_apply_user_func(self):
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2c32eb4f0c584..a0d23aa0478d2 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -72,9 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
         -i "pandas.Period.freq GL08" \
         -i "pandas.Period.ordinal GL08" \
-        -i "pandas.Timestamp.max PR02" \
-        -i "pandas.Timestamp.min PR02" \
-        -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
         -i "pandas.core.resample.Resampler.quantile PR01,PR07" \
diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
index c7c72828db481..2aadf42a510eb 100644
--- a/ci/deps/actions-310-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -52,7 +52,7 @@ dependencies:
   - pyxlsb=1.0.10
   - s3fs=2022.11.0
   - scipy=1.10.0
-  - sqlalchemy=2.0.0
+  - sqlalchemy=1.4.36
   - tabulate=0.9.0
   - xarray=2022.12.0
   - xlrd=2.0.1
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 74cab4e0970dc..5688d3143e621 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -50,7 +50,7 @@ dependencies:
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0
-  - sqlalchemy>=2.0.0
+  - sqlalchemy>=1.4.36
   - tabulate>=0.9.0
   - xarray>=2022.12.0, <=2024.9.0
   - xlrd>=2.0.1
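This patch repeatedly replaces dict comprehensions that bind every key to the same value with `dict.fromkeys` (the benchmark change above, plus `pandas/core/apply.py`, `pandas/core/generic.py`, `pandas/io/formats/style_render.py`, and `pandas/io/sql.py` below). A minimal sketch of why the two spellings are interchangeable for this case; the names here are illustrative, not taken from the patch:

```python
# dict.fromkeys(keys, value) builds the same mapping as a comprehension
# binding every key to one shared value, but in a single C-level call.
keys = range(1028)
shared = object()

d1 = {k: shared for k in keys}
d2 = dict.fromkeys(keys, shared)
assert d1 == d2

# The value is shared by reference, not copied -- exactly what the
# benchmark wants (1028 columns backed by one Series), but a pitfall if
# each key needed an independent mutable value.
assert all(v is shared for v in d2.values())
```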
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index 092ca18d61259..7713ae0232623 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -51,7 +51,7 @@ dependencies:
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0
-  - sqlalchemy>=2.0.0
+  - sqlalchemy>=1.4.36
   - tabulate>=0.9.0
   - xarray>=2022.12.0, <=2024.9.0
   - xlrd>=2.0.1
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index b6f515dceaea9..c160eae364ba2 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -50,7 +50,7 @@ dependencies:
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0
-  - sqlalchemy>=2.0.0
+  - sqlalchemy>=1.4.36
   - tabulate>=0.9.0
   - xarray>=2022.12.0, <=2024.9.0
   - xlrd>=2.0.1
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
index bc66f8a5382c9..034653d207c0b 100644
--- a/ci/deps/actions-312.yaml
+++ b/ci/deps/actions-312.yaml
@@ -50,7 +50,7 @@ dependencies:
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0
-  - sqlalchemy>=2.0.0
+  - sqlalchemy>=1.4.36
   - tabulate>=0.9.0
   - xarray>=2022.12.0, <=2024.9.0
   - xlrd>=2.0.1
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index bda959f380e8a..5d11e9574091e 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -287,7 +287,7 @@ Traditional drivers are installable with ``pip install "pandas[postgresql, mysql
 ================================================================== ================== =============== ============================================
 Dependency                                                         Minimum Version    pip extra       Notes
 ================================================================== ================== =============== ============================================
-`SQLAlchemy `__ 2.0.0 postgresql, SQL support for databases other than sqlite
+`SQLAlchemy `__ 1.4.36 postgresql, SQL support for databases other than sqlite
 mysql, sql-other
 `psycopg2 `__ 2.9.6 postgresql PostgreSQL engine for sqlalchemy
diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst
index fc180c8161a7e..004651ac0074f 100644
--- a/doc/source/reference/groupby.rst
+++ b/doc/source/reference/groupby.rst
@@ -79,6 +79,8 @@ Function application
    DataFrameGroupBy.cumsum
    DataFrameGroupBy.describe
    DataFrameGroupBy.diff
+   DataFrameGroupBy.ewm
+   DataFrameGroupBy.expanding
    DataFrameGroupBy.ffill
    DataFrameGroupBy.first
    DataFrameGroupBy.head
@@ -130,6 +132,8 @@ Function application
    SeriesGroupBy.cumsum
    SeriesGroupBy.describe
    SeriesGroupBy.diff
+   SeriesGroupBy.ewm
+   SeriesGroupBy.expanding
    SeriesGroupBy.ffill
    SeriesGroupBy.first
    SeriesGroupBy.head
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 230332319e0ac..d830dd8277ea9 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -38,6 +38,7 @@ Other enhancements
 - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
 - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
 - Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`)
+- Reverted the minimum version of the ``sqlalchemy`` optional dependency to ``1.4.36`` (:issue:`57049`)
 - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
 - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns
   (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index e6fafc8b1b14c..2d74be6f503a2 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -61,6 +61,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :meth:`Series.nlargest` now uses a stable sort internally and preserves the original order of tied values.
 - :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
 - :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
 - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
@@ -68,6 +69,7 @@ Other enhancements
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
 - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`)
+- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
 - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
@@ -420,6 +422,7 @@ Other Deprecations
 - Deprecated lowercase strings ``w``, ``w-mon``, ``w-tue``, etc. denoting frequencies in :class:`Week` in favour of ``W``, ``W-MON``, ``W-TUE``, etc. (:issue:`58998`)
 - Deprecated parameter ``method`` in :meth:`DataFrame.reindex_like` / :meth:`Series.reindex_like` (:issue:`58667`)
 - Deprecated strings ``w``, ``d``, ``MIN``, ``MS``, ``US`` and ``NS`` denoting units in :class:`Timedelta` in favour of ``W``, ``D``, ``min``, ``ms``, ``us`` and ``ns`` (:issue:`59051`)
+- Deprecated the ``arg`` parameter of ``Series.map``; pass the new ``func`` parameter instead (:issue:`61260`)
 - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`)

 ..
---------------------------------------------------------------------------
@@ -591,6 +594,7 @@ Performance improvements
 - :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
 - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
 - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
+- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
 - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
 - :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`)
 - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
@@ -621,6 +625,7 @@ Performance improvements
 - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`)
 - Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`)
 - Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)
+- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`)
 - Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`)
 - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`)
 - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
@@ -636,6 +641,7 @@ Bug fixes

 Categorical
 ^^^^^^^^^^^
 - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
+- Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`)
 - Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`)
 -
@@ -648,6 +654,7 @@ Datetimelike
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
 - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
+- Bug in :meth:`DataFrame.fillna` raising an ``AssertionError`` instead of ``OutOfBoundsDatetime`` when filling a ``datetime64[ns]`` column with an out-of-bounds timestamp
+  (:issue:`61208`)
 - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`)
 - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`)
@@ -762,6 +769,7 @@ Plotting
 - Bug in :meth:`DataFrame.plot.bar` with ``stacked=True`` where labels on stacked bars with zero-height segments were incorrectly positioned at the base instead of the label position of the previous segment (:issue:`59429`)
 - Bug in :meth:`DataFrame.plot.line` raising ``ValueError`` when set both color and a ``dict`` style (:issue:`59461`)
 - Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`)
+- Bug in :meth:`Series.plot` preventing a line and scatter plot from being aligned (:issue:`61005`)
 - Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`)

 Groupby/resample/rolling
@@ -773,6 +781,7 @@
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
 - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
 - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` not keeping the index name when the index had :class:`ArrowDtype` timestamp dtype (:issue:`61222`)
 - Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`)
 - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
 - Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
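The two ``Series.nlargest`` entries in the whatsnew above (stable sort, faster duplicate-index handling) describe a user-visible ordering guarantee. A hedged illustration of what "stable" buys, not taken from the patch's test suite:

```python
import pandas as pd

s = pd.Series([3, 1, 3, 2], index=["a", "b", "c", "d"])

# A stable sort keeps equal elements in their original relative order,
# so among the tied 3s, label "a" should come before label "c".
print(s.nlargest(3))
# Expected under the entry above:
# a    3
# c    3
# d    2
# dtype: int64
```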
diff --git a/environment.yml b/environment.yml
index ca8f1996c61cf..704bf5d767b86 100644
--- a/environment.yml
+++ b/environment.yml
@@ -54,7 +54,7 @@ dependencies:
   - pyxlsb>=1.0.10
   - s3fs>=2022.11.0
   - scipy>=1.10.0
-  - sqlalchemy>=2.0.0
+  - sqlalchemy>=1.4.36
   - tabulate>=0.9.0
   - xarray>=2022.12.0, <=2024.9.0
   - xlrd>=2.0.1
@@ -80,7 +80,7 @@ dependencies:
   - flake8=7.1.0  # run in subprocess over docstring examples
   - mypy=1.13.0  # pre-commit uses locally installed mypy
   - tokenize-rt  # scripts/check_for_inconsistent_pandas_namespace.py
-  - pre-commit>=4.0.1
+  - pre-commit>=4.2.0
   # documentation
   - gitpython  # obtain contributors from git for whatsnew
diff --git a/pandas/__init__.py b/pandas/__init__.py
index c570fb8d70204..5dc6a8c3bc50c 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -4,19 +4,17 @@
 # Let users know if they're missing any of our hard dependencies
 _hard_dependencies = ("numpy", "dateutil")
-_missing_dependencies = []

 for _dependency in _hard_dependencies:
     try:
         __import__(_dependency)
     except ImportError as _e:  # pragma: no cover
-        _missing_dependencies.append(f"{_dependency}: {_e}")
+        raise ImportError(
+            f"Unable to import required dependency {_dependency}. "
+            "Please see the traceback for details."
+        ) from _e

-if _missing_dependencies:  # pragma: no cover
-    raise ImportError(
-        "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies)
-    )
-del _hard_dependencies, _dependency, _missing_dependencies
+del _hard_dependencies, _dependency

 try:
     # numpy compat
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index 979a5666661b2..c885543b2fc6d 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -3,7 +3,6 @@ from typing import (
     ClassVar,
     Literal,
     TypeAlias,
-    TypeVar,
     overload,
 )
@@ -60,7 +59,6 @@ UnitChoices: TypeAlias = Literal[
     "nanos",
     "nanosecond",
 ]
-_S = TypeVar("_S", bound=timedelta)

 def get_unit_for_round(freq, creso: int) -> int: ...
 def disallow_ambiguous_unit(unit: str | None) -> None: ...
@@ -95,11 +93,11 @@ class Timedelta(timedelta):
     _value: int  # np.int64
     # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]")
     def __new__(  # type: ignore[misc]
-        cls: type[_S],
+        cls: type[Self],
         value=...,
         unit: str | None = ...,
         **kwargs: float | np.integer | np.floating,
-    ) -> _S | NaTType: ...
+    ) -> Self | NaTType: ...
     @classmethod
     def _from_value_and_reso(cls, value: np.int64, reso: int) -> Timedelta: ...
     @property
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index 23197b9a55afc..390267db8267f 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -200,8 +200,9 @@ class MinMaxReso:
     See also: timedeltas.MinMaxReso
     """
-    def __init__(self, name):
+    def __init__(self, name, docstring):
         self._name = name
+        self.__doc__ = docstring

     def __get__(self, obj, type=None):
         cls = Timestamp
@@ -216,11 +217,15 @@
         if obj is None:
             # i.e.
this is on the class, default to nanos - return cls(val) + result = cls(val) elif self._name == "resolution": - return Timedelta._from_value_and_reso(val, obj._creso) + result = Timedelta._from_value_and_reso(val, obj._creso) else: - return Timestamp._from_value_and_reso(val, obj._creso, tz=None) + result = Timestamp._from_value_and_reso(val, obj._creso, tz=None) + + result.__doc__ = self.__doc__ + + return result def __set__(self, obj, value): raise AttributeError(f"{self._name} is not settable.") @@ -235,9 +240,74 @@ cdef class _Timestamp(ABCTimestamp): dayofweek = _Timestamp.day_of_week dayofyear = _Timestamp.day_of_year - min = MinMaxReso("min") - max = MinMaxReso("max") - resolution = MinMaxReso("resolution") # GH#21336, GH#21365 + _docstring_min = """ + Returns the minimum bound possible for Timestamp. + + This property provides access to the smallest possible value that + can be represented by a Timestamp object. + + Returns + ------- + Timestamp + + See Also + -------- + Timestamp.max: Returns the maximum bound possible for Timestamp. + Timestamp.resolution: Returns the smallest possible difference between + non-equal Timestamp objects. + + Examples + -------- + >>> pd.Timestamp.min + Timestamp('1677-09-21 00:12:43.145224193') + """ + + _docstring_max = """ + Returns the maximum bound possible for Timestamp. + + This property provides access to the largest possible value that + can be represented by a Timestamp object. + + Returns + ------- + Timestamp + + See Also + -------- + Timestamp.min: Returns the minimum bound possible for Timestamp. + Timestamp.resolution: Returns the smallest possible difference between + non-equal Timestamp objects. + + Examples + -------- + >>> pd.Timestamp.max + Timestamp('2262-04-11 23:47:16.854775807') + """ + + _docstring_reso = """ + Returns the smallest possible difference between non-equal Timestamp objects. + + The resolution value is determined by the underlying representation of time + units and is equivalent to Timedelta(nanoseconds=1). + + Returns + ------- + Timedelta + + See Also + -------- + Timestamp.max: Returns the maximum bound possible for Timestamp. + Timestamp.min: Returns the minimum bound possible for Timestamp. 
+ + Examples + -------- + >>> pd.Timestamp.resolution + Timedelta('0 days 00:00:00.000000001') + """ + + min = MinMaxReso("min", _docstring_min) + max = MinMaxReso("max", _docstring_max) + resolution = MinMaxReso("resolution", _docstring_reso) # GH#21336, GH#21365 @property def value(self) -> int: diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6b90389a62056..1d5a68c8f5d8a 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -47,7 +47,7 @@ "pyxlsb": "1.0.10", "s3fs": "2022.11.0", "scipy": "1.10.0", - "sqlalchemy": "2.0.0", + "sqlalchemy": "1.4.36", "tables": "3.8.0", "tabulate": "0.9.0", "xarray": "2022.12.0", diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 76f2fdad591ff..e6847b380a7e8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -215,7 +215,7 @@ def _reconstruct_data( values = cls._from_sequence(values, dtype=dtype) # type: ignore[assignment] else: - values = values.astype(dtype, copy=False) + values = values.astype(dtype, copy=False) # type: ignore[assignment] return values diff --git a/pandas/core/apply.py b/pandas/core/apply.py index da6124307e3f1..2c96f1ef020ac 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -327,7 +327,7 @@ def transform(self) -> DataFrame | Series: if is_series: func = {com.get_callable_name(v) or v: v for v in func} else: - func = {col: func for col in obj} + func = dict.fromkeys(obj, func) if is_dict_like(func): func = cast(AggFuncTypeDict, func) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 8a920d1849bb3..eb5026454552c 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -102,7 +102,7 @@ def quantile_with_mask( interpolation=interpolation, ) - result = np.asarray(result) + result = np.asarray(result) # type: ignore[assignment] result = result.T return result @@ -196,7 +196,7 @@ def _nanquantile( # Caller is responsible for ensuring mask shape match assert mask.shape == values.shape result = [ - _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation) + _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation) # type: ignore[arg-type] for (val, m) in zip(list(values), list(mask)) ] if values.dtype.kind == "f": diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 4e6f20e6ad3dd..26585e7bab8e3 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -142,18 +142,12 @@ def view(self, dtype: Dtype | None = None) -> ArrayLike: dt64_values = arr.view(dtype) return DatetimeArray._simple_new(dt64_values, dtype=dtype) - elif lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype): from pandas.core.arrays import TimedeltaArray td64_values = arr.view(dtype) return TimedeltaArray._simple_new(td64_values, dtype=dtype) - - # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible - # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None, - # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, - # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" - return arr.view(dtype=dtype) # type: ignore[arg-type] + return arr.view(dtype=dtype) def take( self, diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index 285c3fd465ffc..7da83e2257e30 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -44,7 +44,7 @@ def pyarrow_array_to_numpy_and_mask( 
mask = pyarrow.BooleanArray.from_buffers( pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset ) - mask = np.asarray(mask) + mask = np.asarray(mask) # type: ignore[assignment] else: mask = np.ones(len(arr), dtype=bool) return data, mask diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9295cf7873d98..d7187b57a69e4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2540,7 +2540,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): dummies_dtype = np.bool_ dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) dummies[indices] = True - dummies = dummies.reshape((n_rows, n_cols)) + dummies = dummies.reshape((n_rows, n_cols)) # type: ignore[assignment] result = type(self)(pa.array(list(dummies))) return result, uniques_sorted.to_pylist() diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 42be07e03bad8..d0048e122051a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -596,7 +596,7 @@ def to_numpy( if copy or na_value is not lib.no_default: result = result.copy() if na_value is not lib.no_default: - result[self.isna()] = na_value + result[self.isna()] = na_value # type: ignore[index] return result # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 647530151d5f6..df1aa21e9203c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -452,7 +452,7 @@ def __init__( if isinstance(values, Index): arr = values._data._pa_array.combine_chunks() else: - arr = values._pa_array.combine_chunks() + arr = extract_array(values)._pa_array.combine_chunks() categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype) codes = arr.indices.to_numpy() dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) @@ -1853,7 +1853,7 @@ def value_counts(self, dropna: bool = True) -> Series: count = np.bincount(obs, minlength=ncat or 0) else: count = np.bincount(np.where(mask, code, ncat)) - ix = np.append(ix, -1) + ix = np.append(ix, -1) # type: ignore[assignment] ix = coerce_indexer_dtype(ix, self.dtype.categories) ix_categorical = self._from_backing_data(ix) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b27bf19f2f593..994d7b1d0081c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2394,7 +2394,7 @@ def take( ) indices = np.asarray(indices, dtype=np.intp) - maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) + maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) # type: ignore[arg-type] if isinstance(maybe_slice, slice): freq = self._get_getitem_freq(maybe_slice) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index df40c9c11b117..b31c543188282 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -331,7 +331,7 @@ def _simple_new( # type: ignore[override] else: # DatetimeTZDtype. If we have e.g. DatetimeTZDtype[us, UTC], # then values.dtype should be M8[us]. 
- assert dtype._creso == get_unit_from_dtype(values.dtype) + assert dtype._creso == get_unit_from_dtype(values.dtype) # type: ignore[union-attr] result = super()._simple_new(values, dtype) result._freq = freq @@ -542,7 +542,7 @@ def _unbox_scalar(self, value) -> np.datetime64: raise ValueError("'value' should be a Timestamp.") self._check_compatible_with(value) if value is NaT: - return np.datetime64(value._value, self.unit) + return np.datetime64(value._value, self.unit) # type: ignore[call-overload] else: return value.as_unit(self.unit, round_ok=False).asm8 @@ -813,10 +813,7 @@ def _add_offset(self, offset: BaseOffset) -> Self: try: res_values = offset._apply_array(values._ndarray) if res_values.dtype.kind == "i": - # error: Argument 1 to "view" of "ndarray" has incompatible type - # "dtype[datetime64] | DatetimeTZDtype"; expected - # "dtype[Any] | type[Any] | _SupportsDType[dtype[Any]]" - res_values = res_values.view(values.dtype) # type: ignore[arg-type] + res_values = res_values.view(values.dtype) except NotImplementedError: if get_option("performance_warnings"): warnings.warn( diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 07c875337e4f6..62e6119204bd5 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -515,7 +515,7 @@ def tolist(self) -> list: if self.ndim > 1: return [x.tolist() for x in self] dtype = None if self._hasna else self._data.dtype - return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist() + return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist() # type: ignore[return-value] @overload def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... @@ -1497,10 +1497,10 @@ def all( result = values.all(axis=axis) if skipna: - return result + return result # type: ignore[return-value] else: if not result or len(self) == 0 or not self._mask.any(): - return result + return result # type: ignore[return-value] else: return self.dtype.na_value diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index cc9fd2d5fb8b0..d4ef3003583c3 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -79,7 +79,7 @@ def _levels_to_axis( ax_coords = codes[valid_ilocs] ax_labels = ax_labels.tolist() - return ax_coords, ax_labels + return ax_coords, ax_labels # pyright: ignore[reportReturnType] def _to_ijv( diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d35083fd892a8..a39d64429d162 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -281,7 +281,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: ] # short-circuit to return all False array. 
-        if not len(value_set):
+        if not value_set:
             return np.zeros(len(self), dtype=bool)

         result = pc.is_in(
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index c5b3129c506c8..9012b9f36348a 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -325,7 +325,7 @@ def _unbox_scalar(self, value) -> np.timedelta64:
             raise ValueError("'value' should be a Timedelta.")
         self._check_compatible_with(value)
         if value is NaT:
-            return np.timedelta64(value._value, self.unit)
+            return np.timedelta64(value._value, self.unit)  # type: ignore[call-overload]
         else:
             return value.as_unit(self.unit, round_ok=False).asm8
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 6cc28d4e46634..8304af48c39ac 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -875,7 +875,7 @@ def tolist(self) -> list:
         >>> idx.to_list()
         [1, 2, 3]
         """
-        return self._values.tolist()
+        return self._values.tolist()  # type: ignore[return-value]

     to_list = tolist
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6a45ef9325bec..884107d4bc6af 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -9705,7 +9705,7 @@ def _where(
             # CoW: Make sure reference is not kept alive
             if cond.ndim == 1 and self.ndim == 2:
                 cond = cond._constructor_expanddim(
-                    {i: cond for i in range(len(self.columns))},
+                    dict.fromkeys(range(len(self.columns)), cond),
                     copy=False,
                 )
                 cond.columns = self.columns
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 1251403db6ff3..a1c1163435611 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -2142,7 +2142,7 @@ def _wrap_applied_output_series(
         if stacked_values.dtype == object:
             # We'll have the DataFrame constructor do inference
-            stacked_values = stacked_values.tolist()
+            stacked_values = stacked_values.tolist()  # type: ignore[assignment]
         result = self.obj._constructor(stacked_values, index=index, columns=columns)

         if not self.as_index:
@@ -2505,7 +2505,7 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
         )
         results = [func(sgb) for sgb in sgbs]

-        if not len(results):
+        if not results:
             # concat would raise
             res_df = DataFrame([], columns=columns, index=self._grouper.result_index)
         else:
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index f9438b348c140..7d58d8f867c12 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1878,7 +1878,7 @@ def _apply_filter(self, indices, dropna):
             mask.fill(False)
             mask[indices.astype(int)] = True
             # mask fails to broadcast when passed to where; broadcast manually.
-            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
+            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T  # type: ignore[assignment]
             filtered = self._selected_obj.where(mask)  # Fill with NaNs.
         return filtered
@@ -3803,16 +3803,58 @@
         )

     @final
-    @Substitution(name="groupby")
-    @Appender(_common_see_also)
     def expanding(self, *args, **kwargs) -> ExpandingGroupby:
         """
-        Return an expanding grouper, providing expanding
-        functionality per group.
+        Return an expanding grouper, providing expanding functionality per group.
+
+        Arguments are the same as :meth:`DataFrame.rolling` except that ``step`` cannot
+        be specified.
+
+        Parameters
+        ----------
+        *args : tuple
+            Positional arguments passed to the expanding window constructor.
+        **kwargs : dict
+            Keyword arguments passed to the expanding window constructor.
Returns ------- pandas.api.typing.ExpandingGroupby + An object that supports expanding transformations over each group. + + See Also + -------- + Series.expanding : Expanding transformations for Series. + DataFrame.expanding : Expanding transformations for DataFrames. + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "Class": ["A", "A", "A", "B", "B", "B"], + ... "Value": [10, 20, 30, 40, 50, 60], + ... } + ... ) + >>> df + Class Value + 0 A 10 + 1 A 20 + 2 A 30 + 3 B 40 + 4 B 50 + 5 B 60 + + >>> df.groupby("Class").expanding().mean() + Value + Class + A 0 10.0 + 1 15.0 + 2 20.0 + B 3 40.0 + 4 45.0 + 5 50.0 """ from pandas.core.window import ExpandingGroupby @@ -3824,15 +3866,79 @@ def expanding(self, *args, **kwargs) -> ExpandingGroupby: ) @final - @Substitution(name="groupby") - @Appender(_common_see_also) def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby: """ Return an ewm grouper, providing ewm functionality per group. + Parameters + ---------- + *args : tuple + Positional arguments passed to the EWM window constructor. + **kwargs : dict + Keyword arguments passed to the EWM window constructor, such as: + + com : float, optional + Specify decay in terms of center of mass. + ``span``, ``halflife``, and ``alpha`` are alternative ways to specify + decay. + span : float, optional + Specify decay in terms of span. + halflife : float, optional + Specify decay in terms of half-life. + alpha : float, optional + Specify smoothing factor directly. + min_periods : int, default 0 + Minimum number of observations in the window required to have a value; + otherwise, result is ``np.nan``. + adjust : bool, default True + Divide by decaying adjustment factor to account for imbalance in + relative weights. + ignore_na : bool, default False + Ignore missing values when calculating weights. + times : str or array-like of datetime64, optional + Times corresponding to the observations. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis along which the EWM function is applied. + Returns ------- pandas.api.typing.ExponentialMovingWindowGroupby + An object that supports exponentially weighted moving transformations over + each group. + + See Also + -------- + Series.ewm : EWM transformations for Series. + DataFrame.ewm : EWM transformations for DataFrames. + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "Class": ["A", "A", "A", "B", "B", "B"], + ... "Value": [10, 20, 30, 40, 50, 60], + ... } + ... 
) + >>> df + Class Value + 0 A 10 + 1 A 20 + 2 A 30 + 3 B 40 + 4 B 50 + 5 B 60 + + >>> df.groupby("Class").ewm(com=0.5).mean() + Value + Class + A 0 10.000000 + 1 17.500000 + 2 26.153846 + B 3 40.000000 + 4 47.500000 + 5 56.153846 """ from pandas.core.window import ExponentialMovingWindowGroupby @@ -4441,11 +4547,11 @@ def blk_func(values: ArrayLike) -> ArrayLike: ) if vals.ndim == 1: - out = out.ravel("K") + out = out.ravel("K") # type: ignore[assignment] if result_mask is not None: - result_mask = result_mask.ravel("K") + result_mask = result_mask.ravel("K") # type: ignore[assignment] else: - out = out.reshape(ncols, ngroups * nqs) + out = out.reshape(ncols, ngroups * nqs) # type: ignore[assignment] return post_processor(out, inference, result_mask, orig_vals) @@ -5175,8 +5281,8 @@ def diff( shifted = shifted.astype("float32") else: to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] - if len(to_coerce): - shifted = shifted.astype({c: "float32" for c in to_coerce}) + if to_coerce: + shifted = shifted.astype(dict.fromkeys(to_coerce, "float32")) return obj - shifted diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c4c7f73ee166c..75f3495041917 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1131,7 +1131,7 @@ def get_iterator(self, data: NDFrame): """ slicer = lambda start, edge: data.iloc[start:edge] - start = 0 + start: np.int64 | int = 0 for edge, label in zip(self.bins, self.binlabels): if label is not NaT: yield label, slicer(start, edge) @@ -1144,7 +1144,7 @@ def get_iterator(self, data: NDFrame): def indices(self): indices = collections.defaultdict(list) - i = 0 + i: np.int64 | int = 0 for label, bin in zip(self.binlabels, self.bins): if i < bin: if label is not NaT: diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 88379164534f2..6fc638e85bc5e 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -131,8 +131,8 @@ def get_window_bounds( if closed in ["left", "neither"]: end -= 1 - end = np.clip(end, 0, num_values) - start = np.clip(start, 0, num_values) + end = np.clip(end, 0, num_values) # type: ignore[assignment] + start = np.clip(start, 0, num_values) # type: ignore[assignment] return start, end @@ -402,7 +402,7 @@ def get_window_bounds( start = np.arange(0, num_values, step, dtype="int64") end = start + self.window_size if self.window_size: - end = np.clip(end, 0, num_values) + end = np.clip(end, 0, num_values) # type: ignore[assignment] return start, end @@ -488,7 +488,7 @@ def get_window_bounds( ) window_indices_start += len(indices) # Extend as we'll be slicing window like [start, end) - window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype( + window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype( # type: ignore[assignment] np.int64, copy=False ) start_arrays.append(window_indices.take(ensure_platform_int(start))) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 13811c28e6c1e..8c40b630e8cfd 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1279,14 +1279,7 @@ def interval_range( breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com.not_none(start, end, freq)): # np.linspace always produces float output - - # error: Argument 1 to "maybe_downcast_numeric" has incompatible type - # "Union[ndarray[Any, Any], TimedeltaIndex, DatetimeIndex]"; - # expected "ndarray[Any, Any]" [ - breaks = 
maybe_downcast_numeric( - breaks, # type: ignore[arg-type] - dtype, - ) + breaks = maybe_downcast_numeric(breaks, dtype) else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bbbcc4da9fb39..34a437ba40bd8 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1582,11 +1582,7 @@ def _validate_key(self, key, axis: AxisInt) -> None: if com.is_bool_indexer(key): if hasattr(key, "index") and isinstance(key.index, Index): if key.index.inferred_type == "integer": - raise NotImplementedError( - "iLocation based boolean " - "indexing on an integer type " - "is not available" - ) + return raise ValueError( "iLocation based boolean indexing cannot use an indexable as a mask" ) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index dc64da35e9725..6aa5062b8ed86 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -805,7 +805,7 @@ def replace_list( for x, y in zip(src_list, dest_list) if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x))) ] - if not len(pairs): + if not pairs: return [self.copy(deep=False)] src_len = len(pairs) - 1 @@ -1679,6 +1679,8 @@ def where(self, other, cond) -> list[Block]: try: res_values = arr._where(cond, other).T + except OutOfBoundsDatetime: + raise except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, (IntervalDtype, StringDtype)): @@ -1746,6 +1748,8 @@ def putmask(self, mask, new) -> list[Block]: try: # Caller is responsible for ensuring matching lengths values._putmask(mask, new) + except OutOfBoundsDatetime: + raise except (TypeError, ValueError): if self.ndim == 1 or self.shape[0] == 1: if isinstance(self.dtype, IntervalDtype): @@ -2094,7 +2098,7 @@ def _unstack( self.values.take( indices, allow_fill=needs_masking[i], fill_value=fill_value ), - BlockPlacement(place), + BlockPlacement(place), # type: ignore[arg-type] ndim=2, ) for i, (indices, place) in enumerate(zip(new_values, new_placement)) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 69da2be0306f6..35de97d570bd3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -634,7 +634,7 @@ def reorder_arrays( arr = np.empty(length, dtype=object) arr.fill(np.nan) else: - arr = arrays[k] + arr = arrays[k] # type: ignore[assignment] new_arrays.append(arr) arrays = new_arrays @@ -864,7 +864,7 @@ def _finalize_columns_and_data( # GH#26429 do not raise user-facing AssertionError raise ValueError(err) from err - if len(contents) and contents[0].dtype == np.object_: + if contents and contents[0].dtype == np.object_: contents = convert_object_array(contents, dtype=dtype) return contents, columns diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a3738bb25f56c..e238bb78bbdfa 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1298,7 +1298,7 @@ def value_getitem(placement): # Defer setting the new values to enable consolidation self._iset_split_block(blkno_l, blk_locs, refs=refs) - if len(removed_blknos): + if removed_blknos: # Remove blocks & update blknos accordingly is_deleted = np.zeros(self.nblocks, dtype=np.bool_) is_deleted[removed_blknos] = True diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 02e7445f1d275..59516b16905dc 100644 --- a/pandas/core/methods/selectn.py +++ 
b/pandas/core/methods/selectn.py @@ -11,6 +11,7 @@ from typing import ( TYPE_CHECKING, Generic, + Literal, cast, final, ) @@ -54,7 +55,9 @@ class SelectN(Generic[NDFrameT]): - def __init__(self, obj: NDFrameT, n: int, keep: str) -> None: + def __init__( + self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"] + ) -> None: self.obj = obj self.n = n self.keep = keep @@ -111,15 +114,25 @@ def compute(self, method: str) -> Series: if n <= 0: return self.obj[[]] - dropped = self.obj.dropna() - nan_index = self.obj.drop(dropped.index) + # Save index and reset to default index to avoid performance impact + # from when index contains duplicates + original_index: Index = self.obj.index + default_index = self.obj.reset_index(drop=True) - # slow method - if n >= len(self.obj): + # Slower method used when taking the full length of the series + # In this case, it is equivalent to a sort. + if n >= len(default_index): ascending = method == "nsmallest" - return self.obj.sort_values(ascending=ascending).head(n) + result = default_index.sort_values(ascending=ascending, kind="stable").head( + n + ) + result.index = original_index.take(result.index) + return result + + # Fast method used in the general case + dropped = default_index.dropna() + nan_index = default_index.drop(dropped.index) - # fast method new_dtype = dropped.dtype # Similar to algorithms._ensure_data @@ -158,7 +171,7 @@ def compute(self, method: str) -> Series: else: kth_val = np.nan (ns,) = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind="mergesort")] + inds = ns[arr[ns].argsort(kind="stable")] if self.keep != "all": inds = inds[:n] @@ -173,7 +186,9 @@ def compute(self, method: str) -> Series: # reverse indices inds = narr - 1 - inds - return concat([dropped.iloc[inds], nan_index]).iloc[:findex] + result = concat([dropped.iloc[inds], nan_index]).iloc[:findex] + result.index = original_index.take(result.index) + return result class SelectNFrame(SelectN[DataFrame]): @@ -192,7 +207,13 @@ class SelectNFrame(SelectN[DataFrame]): nordered : DataFrame """ - def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None: + def __init__( + self, + obj: DataFrame, + n: int, + keep: Literal["first", "last", "all"], + columns: IndexLabel, + ) -> None: super().__init__(obj, n, keep) if not is_list_like(columns) or isinstance(columns, tuple): columns = [columns] @@ -277,4 +298,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: ascending = method == "nsmallest" - return frame.sort_values(columns, ascending=ascending, kind="mergesort") + return frame.sort_values(columns, ascending=ascending, kind="stable") diff --git a/pandas/core/missing.py b/pandas/core/missing.py index e2fb3b9a6fc0b..66609fa870f14 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -241,7 +241,8 @@ def find_valid_index(how: str, is_valid: npt.NDArray[np.bool_]) -> int | None: return None if is_valid.ndim == 2: - is_valid = is_valid.any(axis=1) # reduce axis 1 + # reduce axis 1 + is_valid = is_valid.any(axis=1) # type: ignore[assignment] if how == "first": idxpos = is_valid[::].argmax() @@ -404,10 +405,7 @@ def func(yvalues: np.ndarray) -> None: **kwargs, ) - # error: No overload variant of "apply_along_axis" matches - # argument types "Callable[[ndarray[Any, Any]], None]", - # "int", "ndarray[Any, Any]" - np.apply_along_axis(func, axis, data) # type: ignore[call-overload] + np.apply_along_axis(func, axis, data) def _index_to_interp_indices(index: Index, method: str) -> np.ndarray: diff --git 
a/pandas/core/resample.py b/pandas/core/resample.py
index 753f7fb6cea1a..08e3beef99e60 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -518,6 +518,7 @@ def _wrap_result(self, result):

         if self._timegrouper._arrow_dtype is not None:
             result.index = result.index.astype(self._timegrouper._arrow_dtype)
+            result.index.name = self.obj.index.name

         return result
diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
index 6a590ee5b227e..54704b274b74f 100644
--- a/pandas/core/reshape/encoding.py
+++ b/pandas/core/reshape/encoding.py
@@ -357,7 +357,7 @@ def get_empty_frame(data) -> DataFrame:

     if drop_first:
         # remove first GH12042
-        dummy_mat = dummy_mat[:, 1:]
+        dummy_mat = dummy_mat[:, 1:]  # type: ignore[assignment]
         dummy_cols = dummy_cols[1:]
     return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 09be82c59a5c6..68d61da0cf7dd 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -2921,9 +2921,7 @@ def _convert_arrays_and_get_rizer_klass(
             lk = lk.astype(dtype, copy=False)
             rk = rk.astype(dtype, copy=False)
         if isinstance(lk, BaseMaskedArray):
-            # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
-            # expected type "Type[object]"
-            klass = _factorizers[lk.dtype.type]  # type: ignore[index]
+            klass = _factorizers[lk.dtype.type]
         elif isinstance(lk.dtype, ArrowDtype):
             klass = _factorizers[lk.dtype.numpy_dtype.type]
         else:
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index c60fe71a7ff28..d2a838b616426 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -936,7 +936,20 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame:
         [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels]
     )

-    result = stack_reshape(frame, level, set_levels, stack_cols)
+    result: Series | DataFrame
+    if not isinstance(frame.columns, MultiIndex):
+        # GH#58817 Fast path when we're stacking the columns of a non-MultiIndex.
+        # When columns are homogeneous EAs, we pass through object
+        # dtype but this is still slightly faster than the normal path.
+        if len(frame.columns) > 0 and frame._is_homogeneous_type:
+            dtype = frame._mgr.blocks[0].dtype
+        else:
+            dtype = None
+        result = frame._constructor_sliced(
+            frame._values.reshape(-1, order="F"), dtype=dtype
+        )
+    else:
+        result = stack_reshape(frame, level, set_levels, stack_cols)

     # Construct the correct MultiIndex by combining the frame's index and
     # stacked columns.
@@ -1018,6 +1031,8 @@ stack_reshape(
     -------
     The data of behind the stacked DataFrame.
     """
+    # non-MultiIndex takes a fast path.
+    assert isinstance(frame.columns, MultiIndex)
     # If we need to drop `level` from columns, it needs to be in descending order
     drop_levnums = sorted(level, reverse=True)
@@ -1027,18 +1042,14 @@ stack_reshape(
         if len(frame.columns) == 1:
             data = frame.copy(deep=False)
         else:
-            if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple):
-                # GH#57750 - if the frame is an Index with tuples, .loc below will fail
-                column_indexer = idx
-            else:
-                # Take the data from frame corresponding to this idx value
-                if len(level) == 1:
-                    idx = (idx,)
-                gen = iter(idx)
-                column_indexer = tuple(
-                    next(gen) if k in set_levels else slice(None)
-                    for k in range(frame.columns.nlevels)
-                )
+            # Take the data from frame corresponding to this idx value
+            if len(level) == 1:
+                idx = (idx,)
+            gen = iter(idx)
+            column_indexer = tuple(
+                next(gen) if k in set_levels else slice(None)
+                for k in range(frame.columns.nlevels)
+            )

             data = frame.loc[:, column_indexer]

         if len(level) < frame.columns.nlevels:
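The ``stack_v3`` fast path above skips the generic ``stack_reshape`` machinery by raveling the values column-major and letting the code further down build the matching ``MultiIndex``. A small sketch of the ``order="F"`` semantics it relies on (plain NumPy, not patch code; whether this alone reproduces stack's final row order depends on the index codes constructed later in ``stack_v3``):

```python
import numpy as np

values = np.array([[1, 3],
                   [2, 4]])

print(values.reshape(-1, order="C"))  # row-major:    [1 3 2 4]
print(values.reshape(-1, order="F"))  # column-major: [1 2 3 4]
# The fast path pairs the column-major layout with tiled index codes,
# so no per-column .loc slicing is needed.
```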
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 03a2ce85a08c9..d6a982c65e9fd 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -52,6 +52,9 @@
     doc,
     set_module,
 )
+from pandas.util._exceptions import (
+    find_stack_level,
+)
 from pandas.util._validators import (
     validate_ascending,
     validate_bool_kwarg,
@@ -4320,7 +4323,7 @@ def unstack(

     def map(
         self,
-        arg: Callable | Mapping | Series,
+        func: Callable | Mapping | Series | None = None,
         na_action: Literal["ignore"] | None = None,
         **kwargs,
     ) -> Series:
@@ -4333,8 +4336,8 @@ def map(

         Parameters
         ----------
-        arg : function, collections.abc.Mapping subclass or Series
-            Mapping correspondence.
+        func : function, collections.abc.Mapping subclass or Series
+            Function or mapping correspondence.
         na_action : {None, 'ignore'}, default None
             If 'ignore', propagate NaN values, without passing them to the
             mapping correspondence.
@@ -4404,9 +4407,22 @@ def map(
         3    I am a rabbit
         dtype: object
         """
-        if callable(arg):
-            arg = functools.partial(arg, **kwargs)
-        new_values = self._map_values(arg, na_action=na_action)
+        if func is None:
+            if "arg" in kwargs:
+                # `.map(arg=my_func)`
+                func = kwargs.pop("arg")
+                warnings.warn(
+                    "The parameter `arg` has been renamed to `func`, and "
+                    "support for it will be removed in a future version of pandas.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+            else:
+                raise ValueError("The `func` parameter is required")
+
+        if callable(func):
+            func = functools.partial(func, **kwargs)
+        new_values = self._map_values(func, na_action=na_action)
         return self._constructor(new_values, index=self.index, copy=False).__finalize__(
             self, method="map"
         )
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 0d8f42694ccb4..18983af12976c 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -476,7 +476,7 @@ def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0):
             zipped = zip(arr_values, mask)
         else:
             zipped = zip(arr_values.T, mask.T)
-        return np.array([_nanargminmax(v, m, func) for v, m in zipped])
+        return np.array([_nanargminmax(v, m, func) for v, m in zipped])  # type: ignore[arg-type]
     return func(arr_values, axis=axis)

     return _nanargminmax(arr_values, mask, func)
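The ``Series.map`` change above is a standard keyword-rename deprecation: accept the old name, warn, and funnel it into the new parameter. A self-contained sketch of the pattern — ``map_like`` and its messages are illustrative, not pandas API:

```python
import warnings

def map_like(func=None, na_action=None, **kwargs):
    if func is None:
        if "arg" in kwargs:  # caller used the old spelling: map_like(arg=f)
            func = kwargs.pop("arg")
            warnings.warn(
                "The parameter `arg` has been renamed to `func`, and "
                "support for it will be removed in a future version.",
                FutureWarning,
                stacklevel=2,
            )
        else:
            raise ValueError("The `func` parameter is required")
    return func

# The old keyword still works but warns; the new keyword is silent.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    map_like(arg=str.upper)
assert caught and issubclass(caught[0].category, FutureWarning)
```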
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index ebcafce8f4de2..1dc6c1f08b49a 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -89,7 +89,7 @@
 )
 _read_excel_doc = (
     """
-Read an Excel file into a ``pandas`` ``DataFrame``.
+Read an Excel file into a ``DataFrame``.

 Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
 read from a local filesystem or URL. Supports an option to read
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index fb799361fea67..189dfc1dde6aa 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -566,7 +566,7 @@ def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceTyp
             result = {}
         elif isinstance(col_space, (int, str)):
             result = {"": col_space}
-            result.update({column: col_space for column in self.frame.columns})
+            result.update(dict.fromkeys(self.frame.columns, col_space))
         elif isinstance(col_space, Mapping):
             for column in col_space.keys():
                 if column not in self.frame.columns and column != "":
@@ -1495,7 +1495,7 @@ def _format_strings(self) -> list[str]:
         fmt_values = values._format_native_types(
             na_rep=self.nat_rep, date_format=self.date_format
         )
-        return fmt_values.tolist()
+        return fmt_values.tolist()  # type: ignore[return-value]


 class _ExtensionArrayFormatter(_GenericArrayFormatter):
diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py
index c9a6e94a0c7c1..eb579f7149d44 100644
--- a/pandas/io/formats/info.py
+++ b/pandas/io/formats/info.py
@@ -249,7 +249,7 @@
 Print a concise summary of a {klass}.

 This method prints information about a {klass} including
-the index dtype{type_sub}, non-null values and memory usage.
+the index dtype{type_sub}, non-NA values and memory usage.
 {version_added_sub}\

 Parameters
diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py
index 482ed316c7ce4..6752c83d5169b 100644
--- a/pandas/io/formats/style_render.py
+++ b/pandas/io/formats/style_render.py
@@ -1225,7 +1225,7 @@ def format(
             data = self.data.loc[subset]

         if not isinstance(formatter, dict):
-            formatter = {col: formatter for col in data.columns}
+            formatter = dict.fromkeys(data.columns, formatter)

         cis = self.columns.get_indexer_for(data.columns)
         ris = self.index.get_indexer_for(data.index)
@@ -1411,7 +1411,7 @@ def format_index(
             return self  # clear the formatter / revert to default and avoid looping

         if not isinstance(formatter, dict):
-            formatter = {level: formatter for level in levels_}
+            formatter = dict.fromkeys(levels_, formatter)
         else:
             formatter = {
                 obj._get_level_number(level): formatter_
@@ -1708,7 +1708,7 @@ def format_index_names(
             return self  # clear the formatter / revert to default and avoid looping

         if not isinstance(formatter, dict):
-            formatter = {level: formatter for level in levels_}
+            formatter = dict.fromkeys(levels_, formatter)
         else:
             formatter = {
                 obj._get_level_number(level): formatter_
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index e7b5c7f06a79a..547d8c1fe3d19 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -1468,7 +1468,7 @@ def detect_colspecs(
         shifted[0] = 0
         edges = np.where((mask ^ shifted) == 1)[0]
         edge_pairs = list(zip(edges[::2], edges[1::2]))
-        return edge_pairs
+        return edge_pairs  # type: ignore[return-value]

     def __next__(self) -> list[str]:
         # Argument 1 to "next" has incompatible type "Union[IO[str],
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index a689cfbcb1418..c58b4a4be6df1 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -39,6 +39,7 @@
 )
 from pandas._libs.lib import is_string_array
 from pandas._libs.tslibs import timezones
+from pandas.compat import HAS_PYARROW
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.pickle_compat
import patch_pickle from pandas.errors import ( @@ -381,6 +382,13 @@ def read_hdf( DataFrame.to_hdf : Write a HDF file from a DataFrame. HDFStore : Low-level access to HDF files. + Notes + ----- + When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true, + and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding + to UTF-8, the resulting dtype will be + ``pd.StringDtype(storage="python", na_value=np.nan)``. + Examples -------- >>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"]) # doctest: +SKIP @@ -1760,7 +1768,7 @@ def info(self) -> str: if self.is_open: lkeys = sorted(self.keys()) - if len(lkeys): + if lkeys: keys = [] values = [] @@ -2257,6 +2265,20 @@ def convert( # making an Index instance could throw a number of different errors try: new_pd_index = factory(values, **kwargs) + except UnicodeEncodeError as err: + if ( + errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + new_pd_index = factory( + values, + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise except ValueError: # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') @@ -3170,12 +3192,29 @@ def read_index_node( **kwargs, ) else: - index = factory( - _unconvert_index( - data, kind, encoding=self.encoding, errors=self.errors - ), - **kwargs, - ) + try: + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + **kwargs, + ) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + dtype=StringDtype(storage="python", na_value=np.nan), + **kwargs, + ) + else: + raise index.name = name @@ -3311,13 +3350,24 @@ def read( self.validate_read(columns, where) index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) - result = Series(values, index=index, name=self.name, copy=False) - if ( - using_string_dtype() - and isinstance(values, np.ndarray) - and is_string_array(values, skipna=True) - ): - result = result.astype(StringDtype(na_value=np.nan)) + try: + result = Series(values, index=index, name=self.name, copy=False) + except UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + result = Series( + values, + index=index, + name=self.name, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise return result def write(self, obj, **kwargs) -> None: @@ -4540,7 +4590,7 @@ def write_data(self, chunksize: int | None, dropna: bool = False) -> None: masks.append(mask.astype("u1", copy=False)) # consolidate masks - if len(masks): + if masks: mask = masks[0] for m in masks[1:]: mask = mask & m @@ -4660,7 +4710,7 @@ def delete( groups = list(diff[diff > 1].index) # 1 group - if not len(groups): + if not groups: groups = [0] # final element @@ -4764,7 +4814,24 @@ def read( values = values.reshape((1, values.shape[0])) if isinstance(values, (np.ndarray, DatetimeArray)): - df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + try: + df = DataFrame(values.T, columns=cols_, index=index_, copy=False) + except 
UnicodeEncodeError as err: + if ( + self.errors == "surrogatepass" + and get_option("future.infer_string") + and str(err).endswith("surrogates not allowed") + and HAS_PYARROW + ): + df = DataFrame( + values.T, + columns=cols_, + index=index_, + copy=False, + dtype=StringDtype(storage="python", na_value=np.nan), + ) + else: + raise elif isinstance(values, Index): df = DataFrame(values, columns=cols_, index=index_) else: @@ -4774,23 +4841,10 @@ def read( assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) # If str / string dtype is stored in meta, use that. - converted = False for column in cols_: dtype = getattr(self.table.attrs, f"{column}_meta", None) if dtype in ["str", "string"]: df[column] = df[column].astype(dtype) - converted = True - # Otherwise try inference. - if ( - not converted - and using_string_dtype() - and isinstance(values, np.ndarray) - and is_string_array( - values, - skipna=True, - ) - ): - df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) if len(frames) == 1: @@ -5224,7 +5278,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd # encode if needed if len(data): data = ( - Series(data.ravel(), copy=False) + Series(data.ravel(), copy=False, dtype="object") .str.encode(encoding, errors) ._values.reshape(data.shape) ) @@ -5264,7 +5318,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - ser = Series(data, copy=False).str.decode(encoding, errors=errors) + ser = Series(data, copy=False).str.decode( + encoding, errors=errors, dtype="object" + ) data = ser.to_numpy() data.flags.writeable = True else: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 0e0f07c0f8ff3..7376843f7e8ff 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1901,7 +1901,7 @@ def prep_table( # Type[str], Type[float], Type[int], Type[complex], Type[bool], # Type[object]]]]"; expected type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]" - dtype = {col_name: dtype for col_name in frame} # type: ignore[misc] + dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type] else: dtype = cast(dict, dtype) @@ -2615,7 +2615,7 @@ def _create_table_setup(self): ] ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index] - if len(ix_cols): + if ix_cols: cnames = "_".join(ix_cols) cnames_br = ",".join([escape(c) for c in ix_cols]) create_stmts.append( @@ -2859,7 +2859,7 @@ def to_sql( # Type[str], Type[float], Type[int], Type[complex], Type[bool], # Type[object]]]]"; expected type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]" - dtype = {col_name: dtype for col_name in frame} # type: ignore[misc] + dtype = dict.fromkeys(frame, dtype) # type: ignore[arg-type] else: dtype = cast(dict, dtype) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 1035150302d2c..24aa848de1b4c 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -64,10 +64,9 @@ from pandas.plotting._matplotlib.misc import unpack_single_str_list from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.timeseries import ( - decorate_axes, format_dateaxis, maybe_convert_index, - maybe_resample, + prepare_ts_data, use_dynamic_x, ) from pandas.plotting._matplotlib.tools import ( @@ -288,6 +287,21 @@ def __init__( self.data = self._ensure_frame(self.data) + from pandas.plotting import plot_params + + self.x_compat = plot_params["x_compat"] + if "x_compat" in self.kwds: + self.x_compat = 
bool(self.kwds.pop("x_compat")) + + @final + def _is_ts_plot(self) -> bool: + # this is slightly deceptive + return not self.x_compat and self.use_index and self._use_dynamic_x() + + @final + def _use_dynamic_x(self) -> bool: + return use_dynamic_x(self._get_ax(0), self.data.index) + @final @staticmethod def _validate_sharex(sharex: bool | None, ax, by) -> bool: @@ -1324,10 +1338,20 @@ def __init__( c = self.data.columns[c] self.c = c + @register_pandas_matplotlib_converters def _make_plot(self, fig: Figure) -> None: x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] + from pandas import Series + + x_data = data[x] + s = Series(index=x_data) + if use_dynamic_x(ax, s.index): + s = maybe_convert_index(ax, s) + freq, s = prepare_ts_data(s, ax, self.kwds) + x_data = s.index + c_is_column = is_hashable(c) and c in self.data.columns color_by_categorical = c_is_column and isinstance( @@ -1344,7 +1368,7 @@ def _make_plot(self, fig: Figure) -> None: else: label = None - # if a list of non color strings is passed in as c, color points + # if a list of non-color strings is passed in as c, color points # by uniqueness of the strings, such same strings get same color create_colors = not self._are_valid_colors(c_values) if create_colors: @@ -1360,7 +1384,7 @@ def _make_plot(self, fig: Figure) -> None: ) scatter = ax.scatter( - data[x].values, + x_data.values, data[y].values, c=c_values, label=label, @@ -1520,23 +1544,9 @@ def _kind(self) -> Literal["line", "area", "hist", "kde", "box"]: return "line" def __init__(self, data, **kwargs) -> None: - from pandas.plotting import plot_params - MPLPlot.__init__(self, data, **kwargs) if self.stacked: self.data = self.data.fillna(value=0) - self.x_compat = plot_params["x_compat"] - if "x_compat" in self.kwds: - self.x_compat = bool(self.kwds.pop("x_compat")) - - @final - def _is_ts_plot(self) -> bool: - # this is slightly deceptive - return not self.x_compat and self.use_index and self._use_dynamic_x() - - @final - def _use_dynamic_x(self) -> bool: - return use_dynamic_x(self._get_ax(0), self.data) def _make_plot(self, fig: Figure) -> None: if self._is_ts_plot(): @@ -1626,15 +1636,8 @@ def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds): # accept x to be consistent with normal plot func, # x is not passed to tsplot as it uses data.index as x coordinate # column_num must be in kwds for stacking purpose - freq, data = maybe_resample(data, ax, kwds) + freq, data = prepare_ts_data(data, ax, kwds) - # Set ax with freq info - decorate_axes(ax, freq) - # digging deeper - if hasattr(ax, "left_ax"): - decorate_axes(ax.left_ax, freq) - if hasattr(ax, "right_ax"): - decorate_axes(ax.right_ax, freq) # TODO #54485 ax._plot_data.append((data, self._kind, kwds)) # type: ignore[attr-defined] diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 962f9711d9916..6e343b176b5eb 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -273,7 +273,7 @@ def _random_color(column: int) -> list[float]: """Get a random color represented as a list of length 3""" # GH17525 use common._random_state to avoid resetting the seed rs = com.random_state(column) - return rs.rand(3).tolist() + return rs.rand(3).tolist() # type: ignore[return-value] def _is_single_string_color(color: Color) -> bool: diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index d95ccad2da565..beaf5b6259ef3 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ 
b/pandas/plotting/_matplotlib/timeseries.py @@ -48,7 +48,6 @@ from pandas._typing import NDFrameT from pandas import ( - DataFrame, DatetimeIndex, Index, PeriodIndex, @@ -231,8 +230,8 @@ def _get_freq(ax: Axes, series: Series): return freq, ax_freq -def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: - freq = _get_index_freq(data.index) +def use_dynamic_x(ax: Axes, index: Index) -> bool: + freq = _get_index_freq(index) ax_freq = _get_ax_freq(ax) if freq is None: # convert irregular if axes has freq info @@ -250,16 +249,15 @@ def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool: return False # FIXME: hack this for 0.10.1, creating more technical debt...sigh - if isinstance(data.index, ABCDatetimeIndex): + if isinstance(index, ABCDatetimeIndex): # error: "BaseOffset" has no attribute "_period_dtype_code" freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str) base = to_offset(freq_str, is_period=True)._period_dtype_code # type: ignore[attr-defined] - x = data.index if base <= FreqGroup.FR_DAY.value: - return x[:1].is_normalized - period = Period(x[0], freq_str) + return index[:1].is_normalized + period = Period(index[0], freq_str) assert isinstance(period, Period) - return period.to_timestamp().tz_localize(x.tz) == x[0] + return period.to_timestamp().tz_localize(index.tz) == index[0] return True @@ -366,3 +364,19 @@ def format_dateaxis( raise TypeError("index type not supported") plt.draw_if_interactive() + + +def prepare_ts_data( + series: Series, ax: Axes, kwargs: dict[str, Any] +) -> tuple[BaseOffset | str, Series]: + freq, data = maybe_resample(series, ax, kwargs) + + # Set ax with freq info + decorate_axes(ax, freq) + # digging deeper + if hasattr(ax, "left_ax"): + decorate_axes(ax.left_ax, freq) + if hasattr(ax, "right_ax"): + decorate_axes(ax.right_ax, freq) + + return freq, data diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 2d47cd851ad10..dde1158dc7951 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -334,7 +334,7 @@ def test_apply_broadcast_scalars(float_frame): def test_apply_broadcast_scalars_axis1(float_frame): result = float_frame.apply(np.mean, axis=1, result_type="broadcast") m = float_frame.mean(axis=1) - expected = DataFrame({c: m for c in float_frame.columns}) + expected = DataFrame(dict.fromkeys(float_frame.columns, m)) tm.assert_frame_equal(result, expected) @@ -361,7 +361,7 @@ def test_apply_broadcast_lists_index(float_frame): ) m = list(range(len(float_frame.index))) expected = DataFrame( - {c: m for c in float_frame.columns}, + dict.fromkeys(float_frame.columns, m), dtype="float64", index=float_frame.index, ) diff --git a/pandas/tests/dtypes/cast/test_maybe_box_native.py b/pandas/tests/dtypes/cast/test_maybe_box_native.py index 3f62f31dac219..151586962d517 100644 --- a/pandas/tests/dtypes/cast/test_maybe_box_native.py +++ b/pandas/tests/dtypes/cast/test_maybe_box_native.py @@ -17,7 +17,7 @@ "obj,expected_dtype", [ (b"\x00\x10", bytes), - (int(4), int), + ((4), int), (np.uint(4), int), (np.int32(-4), int), (np.uint8(4), int), diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index c61cda83cf6e0..a5b22ac30d820 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -769,8 +769,8 @@ def test_empty_like(self): np.datetime64("NaT"), np.timedelta64("NaT"), ] - + [np.datetime64("NaT", unit) for unit in m8_units] - + [np.timedelta64("NaT", unit) for unit in m8_units] + + 
[np.datetime64("NaT", unit) for unit in m8_units] # type: ignore[call-overload] + + [np.timedelta64("NaT", unit) for unit in m8_units] # type: ignore[call-overload] ) inf_vals = [ diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 2915c0585f373..a760cbc3995b3 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.core.dtypes.dtypes import NumpyEADtype + import pandas as pd import pandas._testing as tm from pandas.api.extensions import ExtensionArray @@ -266,7 +268,13 @@ def test_stack(self, data, columns, future_stack): expected = expected.astype(object) if isinstance(expected, pd.Series): - assert result.dtype == df.iloc[:, 0].dtype + if future_stack and isinstance(data.dtype, NumpyEADtype): + # GH#58817 future_stack=True constructs the result specifying the dtype + # using the dtype of the input; we thus get the underlying + # NumPy dtype as the result instead of the NumpyExtensionArray + assert result.dtype == df.iloc[:, 0].to_numpy().dtype + else: + assert result.dtype == df.iloc[:, 0].dtype else: assert all(result.dtypes == df.iloc[:, 0].dtype) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 67d1d45af1cb3..8915d6f205d65 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.errors import OutOfBoundsDatetime + from pandas import ( Categorical, DataFrame, @@ -781,3 +783,15 @@ def test_fillna_with_none_object(test_frame, dtype): if test_frame: expected = expected.to_frame() tm.assert_equal(result, expected) + + +def test_fillna_out_of_bounds_datetime(): + # GH#61208 + df = DataFrame( + {"datetime": date_range("1/1/2011", periods=3, freq="h"), "value": [1, 2, 3]} + ) + df.iloc[0, 0] = None + + msg = "Cannot cast 0001-01-01 00:00:00 to unit='ns' without overflow" + with pytest.raises(OutOfBoundsDatetime, match=msg): + df.fillna(Timestamp("0001-01-01")) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index c6e5304ae3cb4..08b7128e6ec11 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request): index=[0, 0, 1, 1, 1], ) result = df.nsmallest(n, order) - expected = df.sort_values(order).head(n) + expected = df.sort_values(order, kind="stable").head(n) tm.assert_frame_equal(result, expected) result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False).head(n) + expected = df.sort_values(order, ascending=False, kind="stable").head(n) if Version(np.__version__) >= Version("1.25") and ( (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) ): diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 2f6998a85c80b..3be69617cad43 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -726,15 +726,16 @@ def test_iloc_setitem_with_scalar_index(self, indexer, value): @pytest.mark.filterwarnings("ignore::UserWarning") def test_iloc_mask(self): - # GH 3631, iloc with a mask (of a series) should raise + # GH 60994, iloc with a mask (of a series) should return accordingly df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) mask = df.a % 2 == 0 msg = "iLocation based boolean 
indexing cannot use an indexable as a mask" with pytest.raises(ValueError, match=msg): df.iloc[mask] + mask.index = range(len(mask)) - msg = "iLocation based boolean indexing on an integer type is not available" - with pytest.raises(NotImplementedError, match=msg): + msg = "Unalignable boolean Series provided as indexer" + with pytest.raises(IndexingError, match=msg): df.iloc[mask] # ndarray ok @@ -753,18 +754,13 @@ def test_iloc_mask(self): (None, ".iloc"): "0b1100", ("index", ""): "0b11", ("index", ".loc"): "0b11", - ("index", ".iloc"): ( - "iLocation based boolean indexing cannot use an indexable as a mask" - ), - ("locs", ""): "Unalignable boolean Series provided as indexer " - "(index of the boolean Series and of the indexed " - "object do not match).", - ("locs", ".loc"): "Unalignable boolean Series provided as indexer " - "(index of the boolean Series and of the " - "indexed object do not match).", - ("locs", ".iloc"): ( - "iLocation based boolean indexing on an integer type is not available" - ), + ( + "index", + ".iloc", + ): "iLocation based boolean indexing cannot use an indexable as a mask", + ("locs", ""): "Unalignable boolean Series provided as indexer", + ("locs", ".loc"): "Unalignable boolean Series provided as indexer", + ("locs", ".iloc"): "Unalignable boolean Series provided as indexer", } # UserWarnings from reindex of a boolean mask @@ -780,18 +776,52 @@ def test_iloc_mask(self): else: accessor = df answer = str(bin(accessor[mask]["nums"].sum())) - except (ValueError, IndexingError, NotImplementedError) as err: + except (ValueError, IndexingError) as err: answer = str(err) key = ( idx, method, ) - r = expected.get(key) - if r != answer: - raise AssertionError( - f"[{key}] does not match [{answer}], received [{r}]" + expected_result = expected.get(key) + + # Fix the assertion to check for substring match + if ( + idx is None or (idx == "index" and method != ".iloc") + ) and "0b" in expected_result: + # For successful numeric results, exact match is needed + assert expected_result == answer, ( + f"[{key}] does not match [{answer}]" ) + else: + # For error messages, substring match is sufficient + assert expected_result in answer, f"[{key}] not found in [{answer}]" + + def test_iloc_with_numpy_bool_array(self): + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + result = df.iloc[np.array([True, False, True, False, True], dtype=bool)] + expected = DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"]) + tm.assert_frame_equal(result, expected) + + def test_iloc_series_mask_with_index_mismatch_raises(self): + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 + msg = "Unalignable boolean Series provided as indexer" + with pytest.raises(IndexingError, match=msg): + df.iloc[Series([True] * len(mask), dtype=bool)] + + def test_iloc_series_mask_all_true(self): + df = DataFrame(list(range(5)), columns=["a"]) + mask = Series([True] * len(df), dtype=bool) + result = df.iloc[mask] + tm.assert_frame_equal(result, df) + + def test_iloc_series_mask_alternate_true(self): + df = DataFrame(list(range(5)), columns=["a"]) + mask = Series([True, False, True, False, True], dtype=bool) + result = df.iloc[mask] + expected = DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4]) + tm.assert_frame_equal(result, expected) def test_iloc_non_unique_indexing(self): # GH 4017, non-unique indexing (on the axis) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 3a68d38cc0bde..213fa2c01cef4 100644 --- 
a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -546,7 +546,7 @@ def test_na_values_dict_null_column_name(all_parsers): parser = all_parsers data = ",x,y\n\nMA,1,2\nNA,2,1\nOA,,3" names = [None, "x", "y"] - na_values = {name: STR_NA_VALUES for name in names} + na_values = dict.fromkeys(names, STR_NA_VALUES) dtype = {None: "object", "x": "float64", "y": "float64"} if parser.engine == "pyarrow": diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index bb2058c050f2a..b3ab6b48508e1 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import PY312 import pandas as pd @@ -25,7 +23,6 @@ timedelta_range, ) import pandas._testing as tm -from pandas.conftest import has_pyarrow from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_store, @@ -385,20 +382,24 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path): tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]])) -@pytest.mark.xfail( - using_string_dtype() and has_pyarrow, - reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed", -) @pytest.mark.parametrize("format", ["fixed", "table"]) -def test_to_hdf_errors(tmp_path, format, setup_path): +def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string): data = ["\ud800foo"] - ser = Series(data, index=Index(data)) + ser = Series(data, index=Index(data, dtype="object"), dtype="object") path = tmp_path / setup_path # GH 20835 ser.to_hdf(path, key="table", format=format, errors="surrogatepass") result = read_hdf(path, "table", errors="surrogatepass") - tm.assert_series_equal(result, ser) + + if using_infer_string: + # https://github.com/pandas-dev/pandas/pull/60993 + # Surrogates fallback to python storage. 
+ dtype = pd.StringDtype(storage="python", na_value=np.nan) + else: + dtype = "object" + expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype) + tm.assert_series_equal(result, expected) def test_create_table_index(setup_path): diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index d18f098267599..3f274a336ad44 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -840,14 +840,26 @@ def test_plot_scatter_shape(self): axes = df.plot(x="x", y="y", kind="scatter", subplots=True) _check_axes_shape(axes, axes_num=1, layout=(1, 1)) - def test_raise_error_on_datetime_time_data(self): - # GH 8113, datetime.time type is not supported by matplotlib in scatter + def test_scatter_on_datetime_time_data(self): + # datetime.time type is now supported in scatter, since a converter + # is implemented in ScatterPlot df = DataFrame(np.random.default_rng(2).standard_normal(10), columns=["a"]) df["dtime"] = date_range(start="2014-01-01", freq="h", periods=10).time - msg = "must be a string or a (real )?number, not 'datetime.time'" + df.plot(kind="scatter", x="dtime", y="a") - with pytest.raises(TypeError, match=msg): - df.plot(kind="scatter", x="dtime", y="a") + def test_scatter_line_xticks(self): + # GH#61005 + df = DataFrame( + [(datetime(year=2025, month=1, day=1, hour=n), n) for n in range(3)], + columns=["datetime", "y"], + ) + fig, ax = plt.subplots(2, sharex=True) + df.plot.scatter(x="datetime", y="y", ax=ax[0]) + scatter_xticks = ax[0].get_xticks() + df.plot(x="datetime", y="y", ax=ax[1]) + line_xticks = ax[1].get_xticks() + assert scatter_xticks[0] == line_xticks[0] + assert scatter_xticks[-1] == line_xticks[-1] @pytest.mark.parametrize("x, y", [("dates", "vals"), (0, 1)]) def test_scatterplot_datetime_data(self, x, y): diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 3a7fd548ca961..f871c0bf0218c 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2155,6 +2155,16 @@ def test_arrow_timestamp_resample(tz): tm.assert_series_equal(result, expected) +@td.skip_if_no("pyarrow") +def test_arrow_timestamp_resample_keep_index_name(): + # https://github.com/pandas-dev/pandas/issues/61222 + idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]") + expected = Series(np.arange(5, dtype=np.float64), index=idx) + expected.index.name = "index_name" + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("freq", ["1A", "2A-MAR"]) def test_resample_A_raises(freq): msg = f"Invalid frequency: {freq[1:]}" diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 46eee13755b2d..614200ae5b7c2 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -15,6 +15,7 @@ import pandas as pd from pandas import ( + ArrowDtype, Categorical, DataFrame, Grouper, @@ -2851,3 +2852,31 @@ def test_pivot_margins_with_none_index(self): ), ) tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_pivot_with_pyarrow_categorical(self): + # GH#53051 + pa = pytest.importorskip("pyarrow") + + df = DataFrame( + {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} + ).astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": 
"float[pyarrow]", + } + ) + + df = df.pivot(columns=["string_column"], values=["number_column"]) + + multi_index = MultiIndex.from_arrays( + [["number_column", "number_column", "number_column"], ["A", "B", "C"]], + names=(None, "string_column"), + ) + df_expected = DataFrame( + [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]], + columns=multi_index, + ) + tm.assert_frame_equal( + df, df_expected, check_dtype=False, check_column_type=False + ) diff --git a/pandas/tests/series/accessors/test_cat_accessor.py b/pandas/tests/series/accessors/test_cat_accessor.py index ce8ea27ea1fa2..f017ccd963972 100644 --- a/pandas/tests/series/accessors/test_cat_accessor.py +++ b/pandas/tests/series/accessors/test_cat_accessor.py @@ -40,7 +40,7 @@ def test_getname_categorical_accessor(self, method): def test_cat_accessor(self): ser = Series(Categorical(["a", "b", np.nan, "a"])) tm.assert_index_equal(ser.cat.categories, Index(["a", "b"])) - assert not ser.cat.ordered, False + assert not ser.cat.ordered exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"]) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 84b60a2afe6eb..384b7ce3dc985 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -604,3 +604,27 @@ def test_map_kwargs(): result = Series([2, 4, 5]).map(lambda x, y: x + y, y=2) expected = Series([4, 6, 7]) tm.assert_series_equal(result, expected) + + +def test_map_arg_as_kwarg(): + with tm.assert_produces_warning( + FutureWarning, match="`arg` has been renamed to `func`" + ): + Series([1, 2]).map(arg={}) + + +def test_map_func_and_arg(): + # `arg`is considered a normal kwarg that should be passed to the function + result = Series([1, 2]).map(lambda _, arg: arg, arg=3) + expected = Series([3, 3]) + tm.assert_series_equal(result, expected) + + +def test_map_no_func_or_arg(): + with pytest.raises(ValueError, match="The `func` parameter is required"): + Series([1, 2]).map() + + +def test_map_func_is_none(): + with pytest.raises(ValueError, match="The `func` parameter is required"): + Series([1, 2]).map(func=None) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 5f4a100e7ccc7..f82451a2be84d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -632,7 +632,7 @@ def test_constructor_maskedarray_hardened(self): def test_series_ctor_plus_datetimeindex(self): rng = date_range("20090415", "20090519", freq="B") - data = {k: 1 for k in rng} + data = dict.fromkeys(rng, 1) result = Series(data, index=rng) assert result.index.is_(rng) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 76fad35304fe6..6282aecdfe977 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -4,6 +4,7 @@ import array from functools import partial +import importlib import subprocess import sys @@ -186,41 +187,21 @@ def test_yaml_dump(df): tm.assert_frame_equal(df, loaded2) -@pytest.mark.single_cpu -def test_missing_required_dependency(): - # GH 23868 - # To ensure proper isolation, we pass these flags - # -S : disable site-packages - # -s : disable user site-packages - # -E : disable PYTHON* env vars, especially PYTHONPATH - # https://github.com/MacPython/pandas-wheels/pull/50 - - pyexe = sys.executable.replace("\\", "/") - - # We skip this test if pandas is installed as a site package. 
We first - # import the package normally and check the path to the module before - # executing the test which imports pandas with site packages disabled. - call = [pyexe, "-c", "import pandas;print(pandas.__file__)"] - output = subprocess.check_output(call).decode() - if "site-packages" in output: - pytest.skip("pandas installed as site package") - - # This test will fail if pandas is installed as a site package. The flags - # prevent pandas being imported and the test will report Failed: DID NOT - # RAISE - call = [pyexe, "-sSE", "-c", "import pandas"] - - msg = ( - rf"Command '\['{pyexe}', '-sSE', '-c', 'import pandas'\]' " - "returned non-zero exit status 1." - ) +@pytest.mark.parametrize("dependency", ["numpy", "dateutil"]) +def test_missing_required_dependency(monkeypatch, dependency): + # GH#61030 + original_import = __import__ + mock_error = ImportError(f"Mock error for {dependency}") + + def mock_import(name, *args, **kwargs): + if name == dependency: + raise mock_error + return original_import(name, *args, **kwargs) - with pytest.raises(subprocess.CalledProcessError, match=msg) as exc: - subprocess.check_output(call, stderr=subprocess.STDOUT) + monkeypatch.setattr("builtins.__import__", mock_import) - output = exc.value.stdout.decode() - for name in ["numpy", "dateutil"]: - assert name in output + with pytest.raises(ImportError, match=dependency): + importlib.reload(importlib.import_module("pandas")) def test_frame_setitem_dask_array_into_new_col(request): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index a23e6d9b3973a..ff7ab22c197d8 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -5,6 +5,7 @@ import pandas as pd from pandas import ( + ArrowDtype, DataFrame, MultiIndex, Series, @@ -318,6 +319,34 @@ def test_multiindex_dt_with_nan(self): expected = Series(["a", "b", "c", "d"], name=("sub", np.nan)) tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") + def test_multiindex_with_pyarrow_categorical(self): + # GH#53051 + pa = pytest.importorskip("pyarrow") + + df = DataFrame( + {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]} + ).astype( + { + "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())), + "number_column": "float[pyarrow]", + } + ) + + df = df.set_index(["string_column", "number_column"]) + + df_expected = DataFrame( + index=MultiIndex.from_arrays( + [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"] + ) + ) + tm.assert_frame_equal( + df, + df_expected, + check_index_type=False, + check_column_type=False, + ) + class TestSorted: """everything you wanted to test about sorting""" diff --git a/pyproject.toml b/pyproject.toml index b7d53b0d8934a..7db85f0037d33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,9 +72,9 @@ hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/i #'blosc>=1.20.1', 'tables>=3.8.0'] spss = ['pyreadstat>=1.2.0'] -postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0'] -mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.0.2'] -sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0'] +postgresql = ['SQLAlchemy>=1.4.36', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0'] +mysql = ['SQLAlchemy>=1.4.36', 'pymysql>=1.0.2'] +sql-other = ['SQLAlchemy>=1.4.36', 'adbc-driver-postgresql>=0.10.0', 'adbc-driver-sqlite>=0.8.0'] html = ['beautifulsoup4>=4.11.2', 'html5lib>=1.1', 
'lxml>=4.9.2'] xml = ['lxml>=4.9.2'] plot = ['matplotlib>=3.6.3'] @@ -113,7 +113,7 @@ all = ['adbc-driver-postgresql>=0.10.0', 'qtpy>=2.3.0', 'scipy>=1.10.0', 's3fs>=2022.11.0', - 'SQLAlchemy>=2.0.0', + 'SQLAlchemy>=1.4.36', 'tables>=3.8.0', 'tabulate>=0.9.0', 'xarray>=2022.12.0', @@ -148,7 +148,7 @@ setup = ['--vsenv'] # For Windows [tool.cibuildwheel] skip = "cp36-* cp37-* cp38-* cp39-* pp* *_i686 *_ppc64le *_s390x" -build-verbosity = "3" +build-verbosity = 3 environment = {LDFLAGS="-Wl,--strip-all"} test-requires = "hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0" test-command = """ @@ -160,8 +160,8 @@ free-threaded-support = true before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh" [tool.cibuildwheel.windows] +environment = {} before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build_windows.sh" -before-test = "bash {package}/scripts/cibw_before_test_windows.sh" test-command = """ set PANDAS_CI='1' && \ python -c "import pandas as pd; \ @@ -234,8 +234,8 @@ select = [ "TID", # implicit string concatenation "ISC", - # type-checking imports - "TCH", + # flake8-type-checking + "TC", # comprehensions "C4", # pygrep-hooks @@ -390,6 +390,8 @@ ignore = [ "PLW0108", # global-statement "PLW0603", + # runtime-cast-value + "TC006", ] exclude = [ @@ -429,7 +431,7 @@ exclude = [ "pandas/tests/*" = ["B028", "FLY"] "scripts/*" = ["B028"] # Keep this one enabled -"pandas/_typing.py" = ["TCH"] +"pandas/_typing.py" = ["TC"] [tool.ruff.lint.flake8-pytest-style] fixture-parentheses = false diff --git a/requirements-dev.txt b/requirements-dev.txt index 20fc21be75a06..5607f2fe97fd9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -43,7 +43,7 @@ pytz>=2023.4 pyxlsb>=1.0.10 s3fs>=2022.11.0 scipy>=1.10.0 -SQLAlchemy>=2.0.0 +SQLAlchemy>=1.4.36 tabulate>=0.9.0 xarray>=2022.12.0, <=2024.9.0 xlrd>=2.0.1 @@ -57,7 +57,7 @@ asv>=0.6.1 flake8==7.1.0 mypy==1.13.0 tokenize-rt -pre-commit>=4.0.1 +pre-commit>=4.2.0 gitpython gitdb google-auth diff --git a/scripts/cibw_before_build_windows.sh b/scripts/cibw_before_build_windows.sh index f9e1e68d8efba..dbf1d95d911bf 100644 --- a/scripts/cibw_before_build_windows.sh +++ b/scripts/cibw_before_build_windows.sh @@ -5,10 +5,11 @@ for file in $PACKAGE_DIR/LICENSES/*; do done # TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13 -# and a NumPy Windows wheel for the free-threaded build on PyPI. FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" if [[ $FREE_THREADED_BUILD == "True" ]]; then python -m pip install -U pip - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython - python -m pip install ninja meson-python versioneer[toml] + # python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + # TODO: Remove below and uncomment above once https://github.com/cython/cython/pull/6717 no longer breaks tests + python -m pip install git+https://github.com/cython/cython.git@3276b588720a053c78488e5de788605950f4b136 + python -m pip install ninja meson-python versioneer[toml] numpy fi diff --git a/scripts/cibw_before_test_windows.sh b/scripts/cibw_before_test_windows.sh deleted file mode 100644 index 8878e3950452f..0000000000000 --- a/scripts/cibw_before_test_windows.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# TODO: Delete when there's a NumPy Windows wheel for the free-threaded build on PyPI. 
-FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" -if [[ $FREE_THREADED_BUILD == "True" ]]; then - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy -fi diff --git a/setup.py b/setup.py index 737ebd270d1e4..db1852b43cfa9 100755 --- a/setup.py +++ b/setup.py @@ -364,7 +364,7 @@ def run(self) -> None: # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled -linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False) +linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False) # noqa: PLW1508 if "--with-cython-coverage" in sys.argv: linetrace = True sys.argv.remove("--with-cython-coverage") diff --git a/web/pandas/community/benchmarks.md b/web/pandas/community/benchmarks.md index 1e63832a5a2ba..5a8198a979d90 100644 --- a/web/pandas/community/benchmarks.md +++ b/web/pandas/community/benchmarks.md @@ -36,9 +36,8 @@ available at the [pandas sponsors]({{ base_url }}about/sponsors.html) page. Results of the benchmarks are available at: -- Original server: [asv](https://asv-runner.github.io/asv-collection/pandas/) -- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/) (benchmarks results can - also be visualized in this [Conbench PoC](http://57.128.112.95:5000/) +- GitHub Actions results: [asv](https://pandas-dev.github.io/asv-runner/) +- OVH server: [asv](https://pandas.pydata.org/benchmarks/asv/) ### Original server configuration diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index c6dddd5c2ef9f..3555d67c70620 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -469,7 +469,7 @@ read_record.data df.dtypes ``` -ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/). +ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/processing/#arcticdb.QueryBuilder). ### [Hugging Face](https://huggingface.co/datasets) @@ -655,7 +655,7 @@ Pandas provides an interface for defining The following libraries implement that interface to provide types not found in NumPy or pandas, which work well with pandas' data containers. 
-### [awkward-pandas](https://awkward-pandas.readthedocs.io/) +### [awkward-pandas](https://github.com/scikit-hep/awkward) Awkward-pandas provides an extension type for storing [Awkward Arrays](https://awkward-array.org/) inside pandas' Series and diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 679778330b68d..cb5447591dab6 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -146,11 +146,6 @@ sponsors: url: https://numfocus.org/ logo: static/img/partners/numfocus.svg kind: numfocus - - name: "Coiled" - url: https://www.coiled.io - logo: static/img/partners/coiled.svg - kind: partner - description: "Patrick Hoefler" - name: "Nvidia" url: https://www.nvidia.com logo: static/img/partners/nvidia.svg @@ -192,5 +187,20 @@ sponsors: - name: "d-fine GmbH" url: https://www.d-fine.com/en/ kind: partner + - name: "Two Sigma" + url: https://www.twosigma.com/ + kind: partner + - name: "Voltron Data" + url: https://voltrondata.com/ + kind: partner + - name: "Intel" + url: https://www.intel.com/ + kind: partner + - name: "Chan Zuckerberg Initiative" + url: https://chanzuckerberg.com/ + kind: regular + - name: "Coiled" + url: https://www.coiled.io + kind: partner roadmap: pdeps_path: pdeps diff --git a/web/pandas/index.html b/web/pandas/index.html index bbd8632e06840..c520a16b8160f 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -96,6 +96,11 @@

Recommended books
Python for Data Analysis
+Pandas Cookbook, Third Edition  (new list entry; the surrounding HTML markup of this hunk was lost in extraction)
Effective pandas 2
diff --git a/web/pandas/static/img/books/pandas_cookbook_3.gif b/web/pandas/static/img/books/pandas_cookbook_3.gif
new file mode 100644
index 0000000000000..aa9d351d489e0
Binary files /dev/null and b/web/pandas/static/img/books/pandas_cookbook_3.gif differ
diff --git a/web/pandas/static/img/partners/coiled.svg b/web/pandas/static/img/partners/coiled.svg
deleted file mode 100644
index 2d76ce150084b..0000000000000
--- a/web/pandas/static/img/partners/coiled.svg
+++ /dev/null
@@ -1,234 +0,0 @@
[234 lines of SVG markup deleted]
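
Taken together, the pytables changes above are easiest to check end to end. A minimal sketch of the new surrogatepass round-trip, assuming PyArrow is installed and ``future.infer_string`` is enabled (the file path is illustrative):

import numpy as np
import pandas as pd

pd.set_option("future.infer_string", True)

# a string containing a lone UTF-16 surrogate, which cannot be encoded to UTF-8
data = ["\ud800foo"]
ser = pd.Series(data, index=pd.Index(data, dtype="object"), dtype="object")

# errors="surrogatepass" lets the surrogate through on write (GH 20835) ...
ser.to_hdf("store.h5", key="table", format="fixed", errors="surrogatepass")

# ... and on read the result falls back to python-backed strings, because
# pyarrow-backed strings cannot hold surrogates (GH 60993)
result = pd.read_hdf("store.h5", "table", errors="surrogatepass")
assert result.dtype == pd.StringDtype(storage="python", na_value=np.nan)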
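
The revised iloc tests encode the new boolean-mask contract (GH 60994): a mask backed by the frame's own index still raises, but a positionally indexed Series mask is now checked for alignment rather than rejected outright. Roughly:

import numpy as np
import pandas as pd

df = pd.DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"])

# a boolean Series sharing the frame's index still cannot be an iloc mask
mask = df.a % 2 == 0
# df.iloc[mask]  # ValueError: iLocation based boolean indexing cannot use
#                # an indexable as a mask

# a misaligned positional mask now raises IndexingError ("Unalignable
# boolean Series provided as indexer") instead of NotImplementedError,
# while an aligned mask selects rows:
aligned = pd.Series([True, False, True, False, True])
pd.DataFrame(list(range(5)), columns=["a"]).iloc[aligned]  # rows 0, 2, 4

# plain NumPy boolean arrays keep working as before
df.iloc[np.array([True, False, True, False, True])]  # rows A, C, E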
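
Several hunks replace dict comprehensions that repeat one value with ``dict.fromkeys``; the two spellings are equivalent for the immutable values used here, with the usual shared-object caveat for mutable values:

cols = ["a", "b", "c"]
assert {c: 0 for c in cols} == dict.fromkeys(cols, 0)

# the value object is shared across keys, which only matters when it is mutable
d = dict.fromkeys(cols, [])
d["a"].append(1)
assert d["b"] == [1]  # every key references the same list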
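
The plotting refactor (hoisting the resample-and-decorate logic into ``prepare_ts_data`` and reusing it in ``ScatterPlot``) is user-visible mainly as tick alignment: scatter and line plots of the same datetime column now share x-axis ticks (GH 61005). A sketch mirroring the new test:

from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame(
    [(datetime(2025, 1, 1, hour=n), n) for n in range(3)],
    columns=["datetime", "y"],
)
fig, ax = plt.subplots(2, sharex=True)
df.plot.scatter(x="datetime", y="y", ax=ax[0])
df.plot(x="datetime", y="y", ax=ax[1])

# both plots now resolve the datetime axis the same way
assert ax[0].get_xticks()[0] == ax[1].get_xticks()[0]
assert ax[0].get_xticks()[-1] == ax[1].get_xticks()[-1]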
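
Finally, the new ``Series.map`` tests document the ``arg`` to ``func`` rename: ``func`` is now required, and ``arg`` is treated as an ordinary keyword forwarded to the callable. For example:

import pandas as pd

s = pd.Series([1, 2])

# "arg" is forwarded to the callable like any other keyword
s.map(lambda x, arg: x + arg, arg=3)  # 0: 4, 1: 5

# passing arg= without a function warns that `arg` has been renamed to
# `func`, and omitting the function entirely raises
# s.map()  # ValueError: The `func` parameter is required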