From 4edc8217cb08fde2e9d4b4413a85e52dc9a633ea Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 29 Jul 2020 07:35:13 -0700 Subject: [PATCH 01/83] Backport PR #35452: DOC: Start 1.1.1 (#35458) Co-authored-by: Tom Augspurger --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.1.1.rst | 54 ++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 doc/source/whatsnew/v1.1.1.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index ad5bb5a5b2d72..8ce10136dd2bb 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.1 v1.1.0 Version 1.0 diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst new file mode 100644 index 0000000000000..443589308ad4c --- /dev/null +++ b/doc/source/whatsnew/v1.1.1.rst @@ -0,0 +1,54 @@ +.. _whatsnew_111: + +What's new in 1.1.1 (?) +----------------------- + +These are the changes in pandas 1.1.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ + +- +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.bug_fixes: + +Bug fixes +~~~~~~~~~ + +**Datetimelike** + +- +- + +**Numeric** + +- +- + +**Plotting** + +- + +**Indexing** + +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v1.1.0..v1.1.1|HEAD From 2dcb4026f762e3a1fa0520c7ff29c5015ea935a0 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 30 Jul 2020 04:56:37 -0700 Subject: [PATCH 02/83] Backport PR #35272: CI: Unpin pytest (#35469) Co-authored-by: Simon Hawkins --- ci/deps/azure-36-32bit.yaml | 2 +- ci/deps/azure-36-locale.yaml | 2 +- ci/deps/azure-36-locale_slow.yaml | 2 +- ci/deps/azure-36-slow.yaml | 2 +- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-37-numpydev.yaml | 2 +- ci/deps/azure-macos-36.yaml | 2 +- ci/deps/azure-windows-36.yaml | 2 +- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/travis-36-cov.yaml | 2 +- ci/deps/travis-36-locale.yaml | 2 +- ci/deps/travis-37-arm64.yaml | 2 +- ci/deps/travis-37.yaml | 2 +- ci/deps/travis-38.yaml | 2 +- environment.yml | 2 +- pandas/_testing.py | 6 ++---- pandas/tests/groupby/test_categorical.py | 10 ++++------ pandas/util/_test_decorators.py | 18 ++++++++++-------- requirements-dev.txt | 2 +- setup.cfg | 2 +- 20 files changed, 33 insertions(+), 35 deletions(-) diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml index 2dc53f8181ac4..15704cf0d5427 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-36-32bit.yaml @@ -23,4 +23,4 @@ dependencies: - pip - pip: - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index d31015fde4741..a9b9a5a47ccf5 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 23121b985492e..c086b3651afc3 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - 
pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-36-slow.yaml b/ci/deps/azure-36-slow.yaml index 0a6d1d13c8549..87bad59fa4873 100644 --- a/ci/deps/azure-36-slow.yaml +++ b/ci/deps/azure-36-slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 4dbb6a5344976..6f64c81f299d1 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 451fb5884a4af..5cb58756a6ac1 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.7.* # tools - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 81a27465f9e61..eeea249a19ca1 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.6.* # tools - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 4d7e1d821037b..548660cabaa67 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 34fca631df6c1..5bbd0e2795d7e 100644 --- a/ci/deps/azure-windows-37.yaml +++ 
b/ci/deps/azure-windows-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 5f5ea8034cddf..177e0d3f4c0af 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-cov # this is only needed in the coverage build diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 6bc4aba733ee5..03a1e751b6a86 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index f434a03609b26..5cb53489be225 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.13 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index aaf706d61fe5c..e896233aac63c 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml index ac39a223cd086..b879c0f81dab2 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/travis-38.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/environment.yml b/environment.yml index 53222624619de..3b088ca511be9 100644 --- a/environment.yml +++ b/environment.yml @@ -52,7 +52,7 
@@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-cov - pytest-xdist>=1.21 - pytest-asyncio diff --git a/pandas/_testing.py b/pandas/_testing.py index fc6df7a95e348..1cf9304ed2715 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -9,7 +9,7 @@ from shutil import rmtree import string import tempfile -from typing import Any, Callable, List, Optional, Type, Union, cast +from typing import Any, Callable, ContextManager, List, Optional, Type, Union, cast import warnings import zipfile @@ -2880,9 +2880,7 @@ def convert_rows_list_to_csv_str(rows_list: List[str]): return expected -def external_error_raised( - expected_exception: Type[Exception], -) -> Callable[[Type[Exception], None], None]: +def external_error_raised(expected_exception: Type[Exception],) -> ContextManager: """ Helper function to mark pytest.raises that have an external error message. diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7e4513da37dc9..0d447a70b540d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1294,9 +1294,7 @@ def test_get_nonexistent_category(): ) -def test_series_groupby_on_2_categoricals_unobserved( - reduction_func: str, observed: bool, request -): +def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed, request): # GH 17605 if reduction_func == "ngroup": pytest.skip("ngroup is not truly a reduction") @@ -1326,7 +1324,7 @@ def test_series_groupby_on_2_categoricals_unobserved( def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( - reduction_func: str, request + reduction_func, request ): # GH 17605 # Tests whether the unobserved categories in the result contain 0 or NaN @@ -1374,7 +1372,7 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( assert np.issubdtype(result.dtype, np.integer) -def 
test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str): +def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func): # GH 23865 # GH 27075 # Ensure that df.groupby, when 'by' is two pd.Categorical variables, @@ -1402,7 +1400,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun @pytest.mark.parametrize("observed", [False, None]) def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( - reduction_func: str, observed: bool, request + reduction_func, observed, request ): # GH 23865 # GH 27075 diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index a4a1d83177c50..bdf633839b2cd 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -120,7 +120,9 @@ def _skip_if_no_scipy() -> bool: ) -def skip_if_installed(package: str) -> Callable: +# TODO: return type, _pytest.mark.structures.MarkDecorator is not public +# https://github.com/pytest-dev/pytest/issues/7469 +def skip_if_installed(package: str): """ Skip a test if a package is installed. @@ -134,7 +136,9 @@ def skip_if_installed(package: str) -> Callable: ) -def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: +# TODO: return type, _pytest.mark.structures.MarkDecorator is not public +# https://github.com/pytest-dev/pytest/issues/7469 +def skip_if_no(package: str, min_version: Optional[str] = None): """ Generic function to help skip tests when required packages are not present on the testing system. 
@@ -196,14 +200,12 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: ) -def skip_if_np_lt( - ver_str: str, reason: Optional[str] = None, *args, **kwds -) -> Callable: +# TODO: return type, _pytest.mark.structures.MarkDecorator is not public +# https://github.com/pytest-dev/pytest/issues/7469 +def skip_if_np_lt(ver_str: str, *args, reason: Optional[str] = None): if reason is None: reason = f"NumPy {ver_str} or greater required" - return pytest.mark.skipif( - _np_version < LooseVersion(ver_str), reason=reason, *args, **kwds - ) + return pytest.mark.skipif(_np_version < LooseVersion(ver_str), *args, reason=reason) def parametrize_fixture_doc(*args): diff --git a/requirements-dev.txt b/requirements-dev.txt index 0c024d1b54637..7bf3df176b378 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -32,7 +32,7 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto -pytest>=5.0.1,<6.0.0rc0 +pytest>=5.0.1 pytest-cov pytest-xdist>=1.21 pytest-asyncio diff --git a/setup.cfg b/setup.cfg index 00af7f6f1b79a..ee5725e36d193 100644 --- a/setup.cfg +++ b/setup.cfg @@ -105,7 +105,7 @@ known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER known_first_party = pandas -known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,validate_unwanted_patterns,yaml,odf +known_third_party = announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,validate_unwanted_patterns,yaml,odf multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 From 000e674da05562be321f84a96ae5add0cdf725e4 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine 
<39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 1 Aug 2020 02:23:35 -0700 Subject: [PATCH 03/83] Backport PR #35477: MAINT: Use float arange when required or intended (#35500) Co-authored-by: Kevin Sheppard --- pandas/tests/window/test_base_indexer.py | 4 ++-- pandas/tests/window/test_ewm.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 4a0212e890d3a..2300d8dd5529b 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -140,7 +140,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed): ) def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs): # GH 32865 - values = np.arange(10) + values = np.arange(10.0) values[5] = 100.0 indexer = FixedForwardWindowIndexer(window_size=3) @@ -177,7 +177,7 @@ def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs) @pytest.mark.parametrize("constructor", [Series, DataFrame]) def test_rolling_forward_skewness(constructor): - values = np.arange(10) + values = np.arange(10.0) values[5] = 100.0 indexer = FixedForwardWindowIndexer(window_size=5) diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 12c314d5e9ec9..69cd1d1ba069c 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -108,7 +108,7 @@ def test_ewma_halflife_without_times(halflife_with_times): @pytest.mark.parametrize("min_periods", [0, 2]) def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): halflife = halflife_with_times - data = np.arange(10) + data = np.arange(10.0) data[::2] = np.nan df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() From e29d9bba0b6e220a7a5cf9648eb8de6ea493590f Mon Sep 17 00:00:00 2001 From: MeeseeksMachine 
<39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 1 Aug 2020 02:39:27 -0700 Subject: [PATCH 04/83] Backport PR #35470: CI: unpin isort 5 (#35134) (#35501) Co-authored-by: Fangchen Li --- asv_bench/benchmarks/frame_ctor.py | 2 +- asv_bench/benchmarks/gil.py | 8 +- asv_bench/benchmarks/io/parsers.py | 2 +- asv_bench/benchmarks/tslibs/normalize.py | 2 +- ci/code_checks.sh | 2 +- doc/source/development/contributing.rst | 4 +- environment.yml | 2 +- pandas/_config/config.py | 4 +- pandas/_libs/algos.pyx | 10 +-- pandas/_libs/groupby.pyx | 48 +++++++++--- pandas/_libs/hashing.pyx | 7 +- pandas/_libs/hashtable.pyx | 75 +++++++++---------- pandas/_libs/index.pyx | 8 +- pandas/_libs/internals.pyx | 3 + pandas/_libs/interval.pyx | 16 ++-- pandas/_libs/join.pyx | 9 ++- pandas/_libs/lib.pyx | 48 ++++++------ pandas/_libs/missing.pyx | 16 ++-- pandas/_libs/ops.pyx | 9 +-- pandas/_libs/parsers.pyx | 65 ++++++++++------ pandas/_libs/reduction.pyx | 9 ++- pandas/_libs/reshape.pyx | 2 + pandas/_libs/sparse.pyx | 15 +++- pandas/_libs/testing.pyx | 7 +- pandas/_libs/tslib.pyx | 19 ++--- pandas/_libs/tslibs/ccalendar.pyx | 2 +- pandas/_libs/tslibs/conversion.pyx | 58 +++++++++----- pandas/_libs/tslibs/fields.pyx | 29 ++++--- pandas/_libs/tslibs/nattype.pyx | 22 +++--- pandas/_libs/tslibs/np_datetime.pyx | 6 +- pandas/_libs/tslibs/offsets.pyx | 43 +++++++---- pandas/_libs/tslibs/parsing.pyx | 41 +++++----- pandas/_libs/tslibs/period.pyx | 62 +++++++-------- pandas/_libs/tslibs/strptime.pyx | 17 +++-- pandas/_libs/tslibs/timedeltas.pyx | 38 ++++++---- pandas/_libs/tslibs/timestamps.pyx | 62 ++++++++------- pandas/_libs/tslibs/timezones.pyx | 12 ++- pandas/_libs/tslibs/tzconversion.pyx | 18 +++-- pandas/_libs/tslibs/vectorized.pyx | 9 ++- pandas/_libs/window/aggregations.pyx | 9 ++- pandas/_libs/window/indexers.pyx | 3 +- pandas/_libs/writers.pyx | 2 +- pandas/_testing.py | 2 +- pandas/_typing.py | 10 ++- pandas/compat/pickle_compat.py | 2 +- pandas/core/apply.py | 2 +- 
pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/integer.py | 1 + pandas/core/arrays/interval.py | 1 + pandas/core/arrays/period.py | 1 + pandas/core/arrays/sparse/accessor.py | 7 +- pandas/core/config_init.py | 6 +- pandas/core/construction.py | 10 +-- pandas/core/dtypes/cast.py | 3 +- pandas/core/dtypes/dtypes.py | 12 ++- pandas/core/frame.py | 6 +- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/grouper.py | 1 - pandas/core/indexes/base.py | 2 +- pandas/core/internals/ops.py | 2 +- pandas/core/strings.py | 6 +- pandas/core/tools/datetimes.py | 5 +- pandas/core/util/hashing.py | 2 +- pandas/io/clipboard/__init__.py | 16 ++-- pandas/io/excel/_base.py | 2 +- pandas/io/excel/_odfreader.py | 4 +- pandas/io/excel/_openpyxl.py | 2 +- pandas/io/excel/_xlrd.py | 4 +- pandas/io/formats/format.py | 2 +- pandas/io/formats/style.py | 2 +- pandas/io/html.py | 2 +- pandas/io/pytables.py | 2 +- pandas/io/sas/sas.pyx | 2 +- pandas/io/sql.py | 16 ++-- pandas/plotting/_matplotlib/core.py | 2 +- pandas/plotting/_matplotlib/timeseries.py | 2 +- pandas/tests/api/test_api.py | 3 +- pandas/tests/arrays/interval/test_interval.py | 4 + pandas/tests/arrays/test_period.py | 3 + pandas/tests/frame/test_analytics.py | 4 +- .../tests/indexes/datetimes/test_datetime.py | 1 + .../indexing/multiindex/test_indexing_slow.py | 2 +- pandas/tests/io/test_fsspec.py | 2 +- pandas/tests/io/test_gcs.py | 9 +-- pandas/tests/io/test_sql.py | 4 +- pandas/tests/plotting/common.py | 4 +- pandas/tests/plotting/test_frame.py | 8 +- pandas/tests/plotting/test_hist_method.py | 3 +- pandas/tests/plotting/test_misc.py | 10 ++- pandas/tests/plotting/test_series.py | 3 +- pandas/tests/series/indexing/test_datetime.py | 2 +- pandas/tests/series/methods/test_asof.py | 2 +- pandas/tests/series/test_arithmetic.py | 2 +- pandas/tests/test_downstream.py | 2 +- pandas/util/_doctools.py | 2 +- requirements-dev.txt | 2 +- 97 files changed, 612 insertions(+), 432 
deletions(-) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index dc6f45f810f3d..e0a2257b0ca1f 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -6,7 +6,7 @@ from .pandas_vb_common import tm try: - from pandas.tseries.offsets import Nano, Hour + from pandas.tseries.offsets import Hour, Nano except ImportError: # For compatibility with older versions from pandas.core.datetools import * # noqa diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index e266d871f5bc6..5d9070de92ec7 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -7,14 +7,14 @@ try: from pandas import ( - rolling_median, + rolling_kurt, + rolling_max, rolling_mean, + rolling_median, rolling_min, - rolling_max, - rolling_var, rolling_skew, - rolling_kurt, rolling_std, + rolling_var, ) have_rolling_methods = True diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index ec3eddfff7184..5390056ba36f2 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -2,8 +2,8 @@ try: from pandas._libs.tslibs.parsing import ( - concat_date_cols, _does_string_look_like_datetime, + concat_date_cols, ) except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) diff --git a/asv_bench/benchmarks/tslibs/normalize.py b/asv_bench/benchmarks/tslibs/normalize.py index 7d4e0556f4d96..9a206410d8775 100644 --- a/asv_bench/benchmarks/tslibs/normalize.py +++ b/asv_bench/benchmarks/tslibs/normalize.py @@ -1,5 +1,5 @@ try: - from pandas._libs.tslibs import normalize_i8_timestamps, is_date_array_normalized + from pandas._libs.tslibs import is_date_array_normalized, normalize_i8_timestamps except ImportError: from pandas._libs.tslibs.conversion import ( normalize_i8_timestamps, diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7b12de387d648..69ce0f1adce22 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh 
@@ -121,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # Imports - Check formatting using isort see setup.cfg for settings MSG='Check import format using isort' ; echo $MSG - ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench scripts" + ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts" if [[ "$GITHUB_ACTIONS" == "true" ]]; then eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) else diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b85e9403038ab..1b0e36e7b6933 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -751,7 +751,7 @@ Imports are alphabetically sorted within these sections. As part of :ref:`Continuous Integration ` checks we run:: - isort --recursive --check-only pandas + isort --check-only pandas to check that imports are correctly formatted as per the `setup.cfg`. @@ -770,8 +770,6 @@ You should run:: to automatically format imports correctly. This will modify your local copy of the files. -The `--recursive` flag can be passed to sort all files in a directory. 
- Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `:: git diff upstream/master --name-only -- "*.py" | xargs -r isort diff --git a/environment.yml b/environment.yml index 3b088ca511be9..9efb995e29497 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: - flake8<3.8.0 # temporary pin, GH#34150 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - - isort=4.3.21 # check that imports are in the right order + - isort>=5.2.1 # check that imports are in the right order - mypy=0.730 - pycodestyle # used by flake8 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index f5e16cddeb04c..d7b73a0a685d3 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -442,8 +442,8 @@ def register_option( ValueError if `validator` is specified and `defval` is not a valid value. """ - import tokenize import keyword + import tokenize key = key.lower() @@ -660,8 +660,8 @@ def _build_option_description(k: str) -> str: def pp_options_list(keys: Iterable[str], width=80, _print: bool = False): """ Builds a concise listing of available options, grouped by prefix """ - from textwrap import wrap from itertools import groupby + from textwrap import wrap def pp(name: str, ks: Iterable[str]) -> List[str]: pfx = "- " + name + ".[" if name else "" diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6b6ead795584f..7e90a8cc681ef 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1,11 +1,12 @@ import cython from cython import Py_ssize_t -from libc.stdlib cimport malloc, free -from libc.string cimport memmove from libc.math cimport fabs, sqrt +from libc.stdlib cimport free, malloc +from libc.string cimport memmove import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_FLOAT32, @@ -31,12 +32,11 @@ from numpy cimport ( uint32_t, uint64_t, ) 
+ cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.util cimport numeric, get_nat - from pandas._libs.khash cimport ( kh_destroy_int64, kh_get_int64, @@ -46,7 +46,7 @@ from pandas._libs.khash cimport ( kh_resize_int64, khiter_t, ) - +from pandas._libs.util cimport get_nat, numeric import pandas._libs.missing as missing diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7c57e6ee9dbfd..38cb973d6dde9 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1,27 +1,51 @@ import cython from cython import Py_ssize_t -from cython cimport floating -from libc.stdlib cimport malloc, free +from cython cimport floating +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport (ndarray, - int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t) +from numpy cimport ( + complex64_t, + complex128_t, + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + ndarray, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) from numpy.math cimport NAN -cnp.import_array() -from pandas._libs.util cimport numeric, get_nat +cnp.import_array() -from pandas._libs.algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, - TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, - TIEBREAK_DENSE) -from pandas._libs.algos import (take_2d_axis1_float64_float64, - groupsort_indexer, tiebreakers) +from pandas._libs.algos cimport ( + TIEBREAK_AVERAGE, + TIEBREAK_DENSE, + TIEBREAK_FIRST, + TIEBREAK_MAX, + TIEBREAK_MIN, + TiebreakEnumType, + swap, +) +from pandas._libs.util cimport get_nat, numeric + +from pandas._libs.algos import ( + groupsort_indexer, + take_2d_axis1_float64_float64, + tiebreakers, +) from pandas._libs.missing cimport checknull + cdef int64_t NPY_NAT = get_nat() _int64_max = np.iinfo(np.int64).max diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index a98820ca57895..f2af04d91a3e3 100644 --- 
a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -2,10 +2,13 @@ # at https://github.com/veorq/SipHash import cython -from libc.stdlib cimport malloc, free + +from libc.stdlib cimport free, malloc import numpy as np -from numpy cimport ndarray, uint8_t, uint32_t, uint64_t, import_array + +from numpy cimport import_array, ndarray, uint8_t, uint32_t, uint64_t + import_array() from pandas._libs.util cimport is_nan diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c3dcbb942d7fe..ffaf6d6505955 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,60 +1,57 @@ cimport cython - -from cpython.ref cimport PyObject, Py_INCREF -from cpython.mem cimport PyMem_Malloc, PyMem_Free - -from libc.stdlib cimport malloc, free +from cpython.mem cimport PyMem_Free, PyMem_Malloc +from cpython.ref cimport Py_INCREF, PyObject +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint32_t, float64_t +from numpy cimport float64_t, ndarray, uint8_t, uint32_t from numpy.math cimport NAN + cnp.import_array() +from pandas._libs cimport util from pandas._libs.khash cimport ( - khiter_t, - kh_str_t, - kh_init_str, - kh_put_str, - kh_exist_str, - kh_get_str, - kh_destroy_str, - kh_resize_str, - kh_put_strbox, - kh_get_strbox, - kh_init_strbox, - kh_int64_t, - kh_init_int64, - kh_resize_int64, + kh_destroy_float64, kh_destroy_int64, - kh_get_int64, + kh_destroy_pymap, + kh_destroy_str, + kh_destroy_uint64, + kh_exist_float64, kh_exist_int64, - kh_put_int64, + kh_exist_pymap, + kh_exist_str, + kh_exist_uint64, kh_float64_t, - kh_exist_float64, - kh_put_float64, - kh_init_float64, kh_get_float64, - kh_destroy_float64, - kh_resize_float64, - kh_resize_uint64, - kh_exist_uint64, - kh_destroy_uint64, - kh_put_uint64, + kh_get_int64, + kh_get_pymap, + kh_get_str, + kh_get_strbox, kh_get_uint64, - kh_init_uint64, - kh_destroy_pymap, - kh_exist_pymap, + kh_init_float64, + 
kh_init_int64, kh_init_pymap, - kh_get_pymap, + kh_init_str, + kh_init_strbox, + kh_init_uint64, + kh_int64_t, + kh_put_float64, + kh_put_int64, kh_put_pymap, + kh_put_str, + kh_put_strbox, + kh_put_uint64, + kh_resize_float64, + kh_resize_int64, kh_resize_pymap, + kh_resize_str, + kh_resize_uint64, + kh_str_t, + khiter_t, ) - - -from pandas._libs cimport util - from pandas._libs.missing cimport checknull diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 35c4b73b47695..d6659cc1895b1 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,6 +1,7 @@ import warnings import numpy as np + cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,17 +17,16 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - +from pandas._libs.hashtable cimport HashTable from pandas._libs.tslibs.nattype cimport c_NaT as NaT from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timedeltas cimport _Timedelta - -from pandas._libs.hashtable cimport HashTable +from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs import algos, hashtable as _hash from pandas._libs.missing import checknull diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8b4b490f49b12..4f27fde52414a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -5,12 +5,15 @@ from cython import Py_ssize_t from cpython.slice cimport PySlice_GetIndicesEx + cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX import numpy as np + cimport numpy as cnp from numpy cimport NPY_INT64, int64_t + cnp.import_array() from pandas._libs.algos import ensure_int64 diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 95881ebf1385c..6867e8aba7411 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -1,7 +1,8 @@ import numbers from operator import le, lt -from cpython.datetime cimport 
PyDelta_Check, PyDateTime_IMPORT +from cpython.datetime cimport PyDateTime_IMPORT, PyDelta_Check + PyDateTime_IMPORT from cpython.object cimport ( @@ -16,8 +17,8 @@ from cpython.object cimport ( import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_QUICKSORT, @@ -30,22 +31,21 @@ from numpy cimport ( ndarray, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - from pandas._libs.hashtable cimport Int64Vector +from pandas._libs.tslibs.timedeltas cimport _Timedelta +from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs.util cimport ( - is_integer_object, is_float_object, + is_integer_object, is_timedelta64_object, ) -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timedeltas cimport _Timedelta - _VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 54892a7e4bc77..13c7187923473 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -1,7 +1,7 @@ import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,6 +16,7 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs.algos import ( @@ -640,7 +641,11 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): # ---------------------------------------------------------------------- from pandas._libs.hashtable cimport ( - HashTable, PyObjectHashTable, UInt64HashTable, Int64HashTable) + HashTable, + Int64HashTable, + PyObjectHashTable, + UInt64HashTable, +) ctypedef fused asof_t: uint8_t diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5ecbb2c3ffd35..5fa91ffee8ea8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -5,23 +5,24 @@ import warnings import cython from cython 
import Py_ssize_t -from cpython.object cimport PyObject_RichCompareBool, Py_EQ -from cpython.ref cimport Py_INCREF -from cpython.tuple cimport PyTuple_SET_ITEM, PyTuple_New -from cpython.iterator cimport PyIter_Check -from cpython.sequence cimport PySequence_Check -from cpython.number cimport PyNumber_Check - from cpython.datetime cimport ( - PyDateTime_Check, PyDate_Check, - PyTime_Check, - PyDelta_Check, + PyDateTime_Check, PyDateTime_IMPORT, + PyDelta_Check, + PyTime_Check, ) +from cpython.iterator cimport PyIter_Check +from cpython.number cimport PyNumber_Check +from cpython.object cimport Py_EQ, PyObject_RichCompareBool +from cpython.ref cimport Py_INCREF +from cpython.sequence cimport PySequence_Check +from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM + PyDateTime_IMPORT import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_OBJECT, @@ -39,6 +40,7 @@ from numpy cimport ( uint8_t, uint64_t, ) + cnp.import_array() cdef extern from "numpy/arrayobject.h": @@ -63,28 +65,23 @@ cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 from pandas._libs cimport util -from pandas._libs.util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN +from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX, is_nan from pandas._libs.tslib import array_to_datetime -from pandas._libs.tslibs.nattype cimport ( - NPY_NAT, - c_NaT as NaT, - checknull_with_nat, -) -from pandas._libs.tslibs.conversion cimport convert_to_tsobject -from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.offsets cimport is_offset_object from pandas._libs.missing cimport ( + C_NA, checknull, - isnaobj, is_null_datetime64, is_null_timedelta64, - C_NA, + isnaobj, ) - +from pandas._libs.tslibs.conversion cimport convert_to_tsobject +from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as 
NaT, checknull_with_nat +from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.period cimport is_period_object +from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 +from pandas._libs.tslibs.timezones cimport tz_compare # constants that will be compared to potentially arbitrarily large # python int @@ -1317,8 +1314,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if not isinstance(value, list): value = list(value) - from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike) + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike values = construct_1d_object_array_from_listlike(value) # make contiguous diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index fdd06fe631b97..760fab3781fd4 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,27 +1,25 @@ -import cython -from cython import Py_ssize_t - import numbers +import cython +from cython import Py_ssize_t import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t + cnp.import_array() from pandas._libs cimport util - - -from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value from pandas._libs.tslibs.nattype cimport ( c_NaT as NaT, checknull_with_nat, is_null_datetimelike, ) -from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas.compat import is_platform_32bit - cdef: float64_t INF = np.inf float64_t NEGINF = -INF diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 658600cdfbe6c..d1f897d237c1b 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -10,18 +10,17 @@ from cpython.object cimport ( PyObject_RichCompareBool, ) - import cython from 
cython import Py_ssize_t - import numpy as np -from numpy cimport ndarray, uint8_t, import_array -import_array() +from numpy cimport import_array, ndarray, uint8_t + +import_array() -from pandas._libs.util cimport UINT8_MAX, is_nan from pandas._libs.missing cimport checknull +from pandas._libs.util cimport UINT8_MAX, is_nan @cython.wraparound(False) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6ffb036e01595..fa77af6bd5a25 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,6 +1,8 @@ # Copyright (c) 2012, Lambda Foundry, Inc. # See LICENSE for the license import bz2 +from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC +from errno import ENOENT import gzip import io import os @@ -9,17 +11,14 @@ import time import warnings import zipfile -from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE -from errno import ENOENT - from libc.stdlib cimport free -from libc.string cimport strncpy, strlen, strcasecmp +from libc.string cimport strcasecmp, strlen, strncpy import cython from cython import Py_ssize_t from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString -from cpython.exc cimport PyErr_Occurred, PyErr_Fetch +from cpython.exc cimport PyErr_Fetch, PyErr_Occurred from cpython.object cimport PyObject from cpython.ref cimport Py_XDECREF from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_Decode @@ -30,37 +29,59 @@ cdef extern from "Python.h": import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint64_t, int64_t, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t, uint64_t + cnp.import_array() from pandas._libs cimport util -from pandas._libs.util cimport UINT64_MAX, INT64_MAX, INT64_MIN +from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX + import pandas._libs.lib as lib from pandas._libs.khash cimport ( - khiter_t, - kh_str_t, kh_init_str, kh_put_str, kh_exist_str, - kh_get_str, kh_destroy_str, - kh_float64_t, kh_get_float64, 
kh_destroy_float64, - kh_put_float64, kh_init_float64, kh_resize_float64, - kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox, + kh_destroy_float64, + kh_destroy_str, + kh_destroy_str_starts, kh_destroy_strbox, - kh_str_starts_t, kh_put_str_starts_item, kh_init_str_starts, - kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts) + kh_exist_str, + kh_float64_t, + kh_get_float64, + kh_get_str, + kh_get_str_starts_item, + kh_get_strbox, + kh_init_float64, + kh_init_str, + kh_init_str_starts, + kh_init_strbox, + kh_put_float64, + kh_put_str, + kh_put_str_starts_item, + kh_put_strbox, + kh_resize_float64, + kh_resize_str_starts, + kh_str_starts_t, + kh_str_t, + kh_strbox_t, + khiter_t, +) + +from pandas.compat import _get_lzma_file, _import_lzma +from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning from pandas.core.dtypes.common import ( + is_bool_dtype, is_categorical_dtype, - is_integer_dtype, is_float_dtype, - is_bool_dtype, is_object_dtype, is_datetime64_dtype, - pandas_dtype, is_extension_array_dtype) + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + pandas_dtype, +) from pandas.core.dtypes.concat import union_categoricals -from pandas.compat import _import_lzma, _get_lzma_file -from pandas.errors import (ParserError, DtypeWarning, - EmptyDataError, ParserWarning) - lzma = _import_lzma() cdef: diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index a01e0c5705dcf..7b36bc8baf891 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -2,15 +2,18 @@ from copy import copy from cython import Py_ssize_t -from libc.stdlib cimport malloc, free +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t +from numpy cimport int64_t, ndarray + cnp.import_array() from pandas._libs cimport util -from pandas._libs.lib import maybe_convert_objects, is_scalar + +from pandas._libs.lib import 
is_scalar, maybe_convert_objects cdef _check_result_array(object obj, Py_ssize_t cnt): diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index da4dd00027395..5c6c15fb50fed 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -16,7 +16,9 @@ from numpy cimport ( ) import numpy as np + cimport numpy as cnp + cnp.import_array() from pandas._libs.lib cimport c_is_list_like diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 7c9575d921dc9..321d7c374d8ec 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -1,9 +1,18 @@ import cython - import numpy as np + cimport numpy as cnp -from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t, - float64_t, float32_t) +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + ndarray, + uint8_t, +) + cnp.import_array() diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 785a4d1f8b923..64fc8d615ea9c 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -1,13 +1,16 @@ import math import numpy as np + from numpy cimport import_array + import_array() from pandas._libs.util cimport is_array -from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.dtypes.missing import array_equivalent, isna + cdef NUMERIC_TYPES = ( bool, @@ -129,6 +132,7 @@ cpdef assert_almost_equal(a, b, if not isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) @@ -181,6 +185,7 @@ cpdef assert_almost_equal(a, b, elif isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 35d5cd8f1e275..e4128af62d06d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -7,23 +7,20 @@ from cpython.datetime 
cimport ( datetime, tzinfo, ) + # import datetime C API PyDateTime_IMPORT cimport numpy as cnp from numpy cimport float64_t, int64_t, ndarray + import numpy as np + cnp.import_array() import pytz -from pandas._libs.util cimport ( - is_datetime64_object, - is_float_object, - is_integer_object, -) - from pandas._libs.tslibs.np_datetime cimport ( _string_to_dts, check_dts_bounds, @@ -34,9 +31,9 @@ from pandas._libs.tslibs.np_datetime cimport ( pydate_to_dt64, pydatetime_to_dt64, ) +from pandas._libs.util cimport is_datetime64_object, is_float_object, is_integer_object from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime - from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.conversion cimport ( @@ -45,22 +42,18 @@ from pandas._libs.tslibs.conversion cimport ( convert_datetime_to_tsobject, get_datetime64_nanos, ) - from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, ) - from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timestamps import Timestamp -from pandas._libs.tslibs.tzconversion cimport ( - tz_localize_to_utc_single, -) +from pandas._libs.tslibs.timestamps import Timestamp # Note: this is the only non-tslibs intra-pandas dependency here from pandas._libs.missing cimport checknull_with_nat_and_na +from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single def _test_parse_iso8601(ts: str): diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 00cecd25e5225..6cce2f5e1fd95 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -5,7 +5,7 @@ Cython implementations of functions resembling the stdlib calendar module import cython -from numpy cimport int64_t, int32_t +from numpy cimport int32_t, int64_t # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx 
index 8cc3d25e86340..adf1dfbc1ac72 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,44 +1,68 @@ import cython - import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int32_t, intp_t, ndarray +from numpy cimport int32_t, int64_t, intp_t, ndarray + cnp.import_array() import pytz # stdlib datetime imports -from cpython.datetime cimport (datetime, time, tzinfo, - PyDateTime_Check, PyDate_Check, - PyDateTime_IMPORT) + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + datetime, + time, + tzinfo, +) + PyDateTime_IMPORT from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct, - _string_to_dts, npy_datetime, dt64_to_dtstruct, dtstruct_to_dt64, - get_datetime64_unit, get_datetime64_value, pydatetime_to_dt64, - NPY_DATETIMEUNIT, NPY_FR_ns) -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + NPY_DATETIMEUNIT, + NPY_FR_ns, + _string_to_dts, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + get_datetime64_unit, + get_datetime64_value, + npy_datetime, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, + pydatetime_to_dt64, +) -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_integer_object, is_float_object) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.timezones cimport ( - is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info, - maybe_get_tz, tz_compare, + get_dst_info, + get_utcoffset, + is_fixed_offset, + is_tzlocal, + is_utc, + maybe_get_tz, + tz_compare, utc_pytz as UTC, ) +from pandas._libs.tslibs.util cimport ( + is_datetime64_object, + is_float_object, + is_integer_object, +) + from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.nattype cimport ( NPY_NAT, - checknull_with_nat, c_NaT as NaT, c_nat_strings as 
nat_strings, + checknull_with_nat, ) - from pandas._libs.tslibs.tzconversion cimport ( tz_convert_utc_to_tzlocal, tz_localize_to_utc_single, diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 1d1f900bc18b3..16fa05c3801c6 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -6,26 +6,37 @@ from locale import LC_TIME import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, int32_t, int8_t, uint32_t +from numpy cimport int8_t, int32_t, int64_t, ndarray, uint32_t + cnp.import_array() from pandas._config.localization import set_locale -from pandas._libs.tslibs.ccalendar import MONTHS_FULL, DAYS_FULL +from pandas._libs.tslibs.ccalendar import DAYS_FULL, MONTHS_FULL + from pandas._libs.tslibs.ccalendar cimport ( - get_days_in_month, is_leapyear, dayofweek, get_week_of_year, - get_day_of_year, get_iso_calendar, iso_calendar_t, - month_offset, + dayofweek, + get_day_of_year, + get_days_in_month, get_firstbday, + get_iso_calendar, get_lastbday, + get_week_of_year, + is_leapyear, + iso_calendar_t, + month_offset, ) -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, - td64_to_tdstruct) from pandas._libs.tslibs.nattype cimport NPY_NAT +from pandas._libs.tslibs.np_datetime cimport ( + dt64_to_dtstruct, + npy_datetimestruct, + pandas_timedeltastruct, + td64_to_tdstruct, +) + from pandas._libs.tslibs.strptime import LocaleTime diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 264013f928d22..73df51832d700 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1,3 +1,10 @@ +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, +) from cpython.object cimport ( Py_EQ, Py_GE, @@ -8,28 +15,19 @@ from cpython.object cimport ( PyObject_RichCompare, ) -from cpython.datetime 
cimport ( - PyDateTime_Check, - PyDateTime_IMPORT, - PyDelta_Check, - datetime, - timedelta, -) PyDateTime_IMPORT from cpython.version cimport PY_MINOR_VERSION import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() -from pandas._libs.tslibs.np_datetime cimport ( - get_datetime64_value, - get_timedelta64_value, -) cimport pandas._libs.tslibs.util as util - +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 31cc55ad981bb..12aaaf4ce3977 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,5 +1,3 @@ -from cpython.object cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE - from cpython.datetime cimport ( PyDateTime_DATE_GET_HOUR, PyDateTime_DATE_GET_MICROSECOND, @@ -10,11 +8,15 @@ from cpython.datetime cimport ( PyDateTime_GET_YEAR, PyDateTime_IMPORT, ) +from cpython.object cimport Py_EQ, Py_GE, Py_GT, Py_LE, Py_LT, Py_NE + PyDateTime_IMPORT from numpy cimport int64_t + from pandas._libs.tslibs.util cimport get_c_string_buf_and_size + cdef extern from "src/datetime/np_datetime.h": int cmp_npy_datetimestruct(npy_datetimestruct *a, npy_datetimestruct *b) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 9a7ca15a2a1c2..ac2725fc58aee 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1,39 +1,51 @@ -import cython - import operator import re import time from typing import Any import warnings -from cpython.datetime cimport (PyDateTime_IMPORT, - PyDateTime_Check, - PyDate_Check, - PyDelta_Check, - datetime, timedelta, date, - time as dt_time) + +import cython + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + date, + datetime, + time as dt_time, + timedelta, +) 
+ PyDateTime_IMPORT -from dateutil.relativedelta import relativedelta from dateutil.easter import easter - +from dateutil.relativedelta import relativedelta import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() # TODO: formalize having _libs.properties "above" tslibs in the dependency structure + from pandas._libs.properties import cache_readonly from pandas._libs.tslibs cimport util from pandas._libs.tslibs.util cimport ( - is_integer_object, is_datetime64_object, is_float_object, + is_integer_object, ) from pandas._libs.tslibs.ccalendar import ( - MONTH_ALIASES, MONTH_TO_CAL_NUM, weekday_to_int, int_to_weekday, + MONTH_ALIASES, + MONTH_TO_CAL_NUM, + int_to_weekday, + weekday_to_int, ) + from pandas._libs.tslibs.ccalendar cimport ( DAY_NANOS, dayofweek, @@ -47,17 +59,20 @@ from pandas._libs.tslibs.conversion cimport ( ) from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, pydate_to_dtstruct, ) from pandas._libs.tslibs.tzconversion cimport tz_convert_from_utc_single from .dtypes cimport PeriodDtypeCode from .timedeltas cimport delta_to_nanoseconds + from .timedeltas import Timedelta + from .timestamps cimport _Timestamp + from .timestamps import Timestamp # --------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index c4f369d0d3b3f..8429aebbd85b8 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -9,39 +9,44 @@ from libc.string cimport strchr import cython from cython import Py_ssize_t -from cpython.object cimport PyObject_Str - from cpython.datetime cimport datetime, datetime_new, import_datetime, tzinfo +from cpython.object cimport PyObject_Str from cpython.version cimport PY_VERSION_HEX + import_datetime() import numpy as np + cimport 
numpy as cnp -from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, - PyArray_IterNew, flatiter, float64_t) +from numpy cimport ( + PyArray_GETITEM, + PyArray_ITER_DATA, + PyArray_ITER_NEXT, + PyArray_IterNew, + flatiter, + float64_t, +) + cnp.import_array() # dateutil compat -from dateutil.tz import (tzoffset, - tzlocal as _dateutil_tzlocal, - tzutc as _dateutil_tzutc, - tzstr as _dateutil_tzstr) + +from dateutil.parser import DEFAULTPARSER, parse as du_parse from dateutil.relativedelta import relativedelta -from dateutil.parser import DEFAULTPARSER -from dateutil.parser import parse as du_parse +from dateutil.tz import ( + tzlocal as _dateutil_tzlocal, + tzoffset, + tzstr as _dateutil_tzstr, + tzutc as _dateutil_tzutc, +) from pandas._config import get_option from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS -from pandas._libs.tslibs.nattype cimport ( - c_nat_strings as nat_strings, - c_NaT as NaT, -) -from pandas._libs.tslibs.util cimport ( - is_array, - get_c_string_buf_and_size, -) +from pandas._libs.tslibs.nattype cimport c_NaT as NaT, c_nat_strings as nat_strings from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.util cimport get_c_string_buf_and_size, is_array + cdef extern from "../src/headers/portable.h": int getdigit_ascii(char c, int default) nogil diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 20961c6da56bd..86b6533f5caf5 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1,96 +1,98 @@ import warnings -from cpython.object cimport PyObject_RichCompareBool, Py_EQ, Py_NE +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompareBool +from numpy cimport import_array, int64_t, ndarray -from numpy cimport int64_t, import_array, ndarray import numpy as np + import_array() from libc.stdlib cimport free, malloc +from libc.string cimport memset, strlen from libc.time cimport strftime, tm -from libc.string cimport strlen, 
memset import cython from cpython.datetime cimport ( - datetime, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, PyDelta_Check, + datetime, ) + # import datetime C API PyDateTime_IMPORT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, - dt64_to_dtstruct, - pandas_datetime_to_datetimestruct, - check_dts_bounds, NPY_DATETIMEUNIT, NPY_FR_D, NPY_FR_us, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, ) + cdef extern from "src/datetime/np_datetime.h": int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, npy_datetimestruct *d) nogil cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.timedeltas import Timedelta -from pandas._libs.tslibs.timedeltas cimport ( - delta_to_nanoseconds, - is_any_td_scalar, -) +from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.ccalendar cimport ( + c_MONTH_NUMBERS, dayofweek, get_day_of_year, - is_leapyear, - get_week_of_year, get_days_in_month, + get_week_of_year, + is_leapyear, ) -from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.conversion import ensure_datetime64ns from pandas._libs.tslibs.dtypes cimport ( - PeriodDtypeBase, - FR_UND, FR_ANN, - FR_QTR, - FR_MTH, - FR_WK, FR_BUS, FR_DAY, FR_HR, FR_MIN, - FR_SEC, FR_MS, - FR_US, + FR_MTH, FR_NS, + FR_QTR, + FR_SEC, + FR_UND, + FR_US, + FR_WK, + PeriodDtypeBase, attrname_to_abbrevs, ) - from pandas._libs.tslibs.parsing cimport get_rule_month + from pandas._libs.tslibs.parsing import parse_time_string + from pandas._libs.tslibs.nattype cimport ( - _nat_scalar_rules, NPY_NAT, - is_null_datetimelike, + _nat_scalar_rules, c_NaT as NaT, c_nat_strings as nat_strings, + is_null_datetimelike, ) from pandas._libs.tslibs.offsets cimport ( BaseOffset, - to_offset, - 
is_tick_object, is_offset_object, + is_tick_object, + to_offset, ) -from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG cdef: enum: diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 660b582f73e6e..d2690be905a68 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -1,27 +1,30 @@ """Strptime-related classes and functions. """ -import time -import locale import calendar +import locale import re +import time from cpython.datetime cimport date, tzinfo from _thread import allocate_lock as _thread_allocate_lock +import numpy as np import pytz -import numpy as np from numpy cimport int64_t -from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, dtstruct_to_dt64, npy_datetimestruct) - from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_nat_strings as nat_strings, + checknull_with_nat, ) +from pandas._libs.tslibs.np_datetime cimport ( + check_dts_bounds, + dtstruct_to_dt64, + npy_datetimestruct, +) + cdef dict _parse_code_table = {'y': 0, 'Y': 1, diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8f3a599bf107c..ee32ed53a908b 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -2,39 +2,47 @@ import collections import cython -from cpython.object cimport Py_NE, Py_EQ, PyObject_RichCompare +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() -from cpython.datetime cimport (timedelta, - PyDateTime_Check, PyDelta_Check, - PyDateTime_IMPORT) +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + timedelta, +) + PyDateTime_IMPORT cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.util cimport ( - is_timedelta64_object, is_datetime64_object, is_integer_object, - 
is_float_object, is_array -) - from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.conversion cimport cast_from_unit - -from pandas._libs.tslibs.np_datetime cimport ( - cmp_scalar, td64_to_tdstruct, pandas_timedeltastruct) - from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, + checknull_with_nat, +) +from pandas._libs.tslibs.np_datetime cimport ( + cmp_scalar, + pandas_timedeltastruct, + td64_to_tdstruct, ) from pandas._libs.tslibs.offsets cimport is_tick_object +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, + is_integer_object, + is_timedelta64_object, +) # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 8cef685933863..bddfc30d86a53 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -9,54 +9,66 @@ shadows the python class, where we do any heavy lifting. 
import warnings import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int8_t, uint8_t, ndarray -cnp.import_array() +from numpy cimport int8_t, int64_t, ndarray, uint8_t -from cpython.object cimport (PyObject_RichCompareBool, PyObject_RichCompare, - Py_EQ, Py_NE) +cnp.import_array() -from cpython.datetime cimport ( - datetime, - time, - tzinfo, - tzinfo as tzinfo_type, # alias bc `tzinfo` is a kwarg below +from cpython.datetime cimport ( # alias bc `tzinfo` is a kwarg below PyDateTime_Check, + PyDateTime_IMPORT, PyDelta_Check, PyTZInfo_Check, - PyDateTime_IMPORT, -) -PyDateTime_IMPORT - -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_float_object, is_integer_object, - is_timedelta64_object, is_array, + datetime, + time, + tzinfo as tzinfo_type, ) +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare, PyObject_RichCompareBool -from pandas._libs.tslibs.base cimport ABCTimestamp +PyDateTime_IMPORT from pandas._libs.tslibs cimport ccalendar - +from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.conversion cimport ( _TSObject, - convert_to_tsobject, convert_datetime_to_tsobject, + convert_to_tsobject, normalize_i8_stamp, ) -from pandas._libs.tslibs.fields import get_start_end_field, get_date_name_field +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, + is_integer_object, + is_timedelta64_object, +) + +from pandas._libs.tslibs.fields import get_date_name_field, get_start_end_field + from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, dt64_to_dtstruct, + check_dts_bounds, cmp_scalar, + dt64_to_dtstruct, + npy_datetimestruct, pydatetime_to_dt64, ) + from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime -from pandas._libs.tslibs.offsets cimport to_offset, is_offset_object -from pandas._libs.tslibs.timedeltas cimport is_any_td_scalar, 
delta_to_nanoseconds + +from pandas._libs.tslibs.offsets cimport is_offset_object, to_offset +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.timedeltas import Timedelta + from pandas._libs.tslibs.timezones cimport ( - is_utc, maybe_get_tz, treat_tz_as_pytz, utc_pytz as UTC, - get_timezone, tz_compare, + get_timezone, + is_utc, + maybe_get_tz, + treat_tz_as_pytz, + tz_compare, + utc_pytz as UTC, ) from pandas._libs.tslibs.tzconversion cimport ( tz_convert_from_utc_single, diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index a8c785704d8e8..b82291a71057e 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,27 +1,31 @@ from datetime import timezone + from cpython.datetime cimport datetime, timedelta, tzinfo # dateutil compat + from dateutil.tz import ( gettz as dateutil_gettz, tzfile as _dateutil_tzfile, tzlocal as _dateutil_tzlocal, tzutc as _dateutil_tzutc, ) - - -from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo import pytz +from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo + UTC = pytz.utc import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport is_integer_object, get_nat +from pandas._libs.tslibs.util cimport get_nat, is_integer_object + cdef int64_t NPY_NAT = get_nat() cdef tzinfo utc_stdlib = timezone.utc diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 606639af16a18..2b148cd8849f1 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -5,21 +5,27 @@ import cython from cython import Py_ssize_t from cpython.datetime cimport ( - PyDateTime_IMPORT, PyDelta_Check, datetime, timedelta, tzinfo) + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, + tzinfo, +) + PyDateTime_IMPORT 
-import pytz from dateutil.tz import tzutc - import numpy as np +import pytz + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, intp_t +from numpy cimport int64_t, intp_t, ndarray, uint8_t + cnp.import_array() from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, HOUR_NANOS from pandas._libs.tslibs.nattype cimport NPY_NAT -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, dt64_to_dtstruct) +from pandas._libs.tslibs.np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from pandas._libs.tslibs.timezones cimport ( get_dst_info, get_utcoffset, diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index c8f8daf6724c2..bdc00f6c6e21a 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -1,18 +1,21 @@ import cython -from cpython.datetime cimport datetime, date, time, tzinfo +from cpython.datetime cimport date, datetime, time, tzinfo import numpy as np + from numpy cimport int64_t, intp_t, ndarray from .conversion cimport normalize_i8_stamp + from .dtypes import Resolution + from .nattype cimport NPY_NAT, c_NaT as NaT -from .np_datetime cimport npy_datetimestruct, dt64_to_dtstruct +from .np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from .offsets cimport to_offset from .period cimport get_period_ordinal from .timestamps cimport create_timestamp_from_ts -from .timezones cimport is_utc, is_tzlocal, get_dst_info +from .timezones cimport get_dst_info, is_tzlocal, is_utc from .tzconversion cimport tz_convert_utc_to_tzlocal # ------------------------------------------------------------------------- diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 362d0e6263697..3ec4547d223ce 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -2,13 +2,15 @@ import cython from cython import Py_ssize_t -from libcpp.deque cimport deque -from libc.stdlib cimport malloc, free +from 
libc.stdlib cimport free, malloc +from libcpp.deque cimport deque import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, float64_t, float32_t, uint8_t +from numpy cimport float32_t, float64_t, int64_t, ndarray, uint8_t + cnp.import_array() @@ -22,6 +24,7 @@ from pandas._libs.algos import is_monotonic from pandas._libs.util cimport numeric + cdef extern from "../src/skiplist.h": ctypedef struct node_t: node_t **next diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 8a1e7feb57ace..9af1159a805ec 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -1,7 +1,8 @@ # cython: boundscheck=False, wraparound=False, cdivision=True import numpy as np -from numpy cimport ndarray, int64_t + +from numpy cimport int64_t, ndarray # Cython routines for window indexers diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 2d5b31d7ccbcf..40c39aabb7a7a 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -5,8 +5,8 @@ from cpython.bytes cimport PyBytes_GET_SIZE from cpython.unicode cimport PyUnicode_GET_SIZE import numpy as np -from numpy cimport ndarray, uint8_t +from numpy cimport ndarray, uint8_t ctypedef fused pandas_string: str diff --git a/pandas/_testing.py b/pandas/_testing.py index 1cf9304ed2715..a020fbff3553a 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -535,7 +535,7 @@ def rands(nchars): def close(fignum=None): - from matplotlib.pyplot import get_fignums, close as _close + from matplotlib.pyplot import close as _close, get_fignums if fignum is None: for fignum in get_fignums(): diff --git a/pandas/_typing.py b/pandas/_typing.py index 8e98833ad37f7..76ec527e6e258 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -24,13 +24,15 @@ # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: from pandas._libs import Period, Timedelta, Timestamp # noqa: F401 - from pandas.core.arrays.base import 
ExtensionArray # noqa: F401 + from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 - from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.generic import NDFrame # noqa: F401 + from pandas import Interval # noqa: F401 - from pandas.core.series import Series # noqa: F401 + from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.frame import DataFrame # noqa: F401 + from pandas.core.generic import NDFrame # noqa: F401 + from pandas.core.indexes.base import Index # noqa: F401 + from pandas.core.series import Series # noqa: F401 # array-like diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 0484de3fa165d..015b203a60256 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -14,7 +14,7 @@ from pandas import Index if TYPE_CHECKING: - from pandas import Series, DataFrame + from pandas import DataFrame, Series def load_reduce(self): diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 733dbeed34b72..6b8d7dc35fe95 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -15,7 +15,7 @@ from pandas.core.construction import create_series_with_explicit_dtype if TYPE_CHECKING: - from pandas import DataFrame, Series, Index + from pandas import DataFrame, Index, Series ResType = Dict[int, Any] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index db9cfd9d7fc59..6e5c7bc699962 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -520,7 +520,7 @@ def _from_inferred_categories( ------- Categorical """ - from pandas import Index, to_numeric, to_datetime, to_timedelta + from pandas import Index, to_datetime, to_numeric, to_timedelta cats = Index(inferred_categories) known_categories = ( @@ -1403,7 +1403,7 @@ def value_counts(self, dropna=True): -------- Series.value_counts """ - from pandas import Series, CategoricalIndex + from pandas import CategoricalIndex, Series code, cat = 
self._codes, self.categories ncat, mask = len(cat), 0 <= code diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ee4d43fdb3bc2..c6945e2f78b5a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -959,7 +959,7 @@ def value_counts(self, dropna=False): ------- Series """ - from pandas import Series, Index + from pandas import Index, Series if dropna: values = self[~self.isna()]._data diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b0958af41158c..57df067c7b16e 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -116,6 +116,7 @@ def __from_arrow__( Construct IntegerArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask pyarrow_type = pyarrow.from_numpy_dtype(self.type) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index c861d25afd13f..ed2437cc061bd 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1105,6 +1105,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. """ import pyarrow + from pandas.core.arrays._arrow_utils import ArrowIntervalType try: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8d5cb12d60e4d..fe78481d99d30 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -300,6 +300,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. 
""" import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType if type is not None: diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8a30d2b954b55..da8d695c59b9e 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -87,8 +87,8 @@ def from_coo(cls, A, dense_index=False): 1 0 3.0 dtype: Sparse[float64, nan] """ - from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series from pandas import Series + from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series result = _coo_to_sparse_series(A, dense_index=dense_index) result = Series(result.array, index=result.index, copy=False) @@ -253,9 +253,10 @@ def from_spmatrix(cls, data, index=None, columns=None): 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - from pandas import DataFrame from pandas._libs.sparse import IntIndex + from pandas import DataFrame + data = data.tocsc() index, columns = cls._prep_index(data, index, columns) n_rows, n_columns = data.shape @@ -354,8 +355,8 @@ def density(self) -> float: @staticmethod def _prep_index(data, index, columns): - import pandas.core.indexes.base as ibase from pandas.core.indexes.api import ensure_index + import pandas.core.indexes.base as ibase N, K = data.shape if index is None: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 86f6be77bc505..2b2431149e230 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -662,8 +662,10 @@ def register_plotting_backend_cb(key): def register_converter_cb(key): - from pandas.plotting import register_matplotlib_converters - from pandas.plotting import deregister_matplotlib_converters + from pandas.plotting import ( + deregister_matplotlib_converters, + register_matplotlib_converters, + ) if cf.get_option(key): register_matplotlib_converters() diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 6c58698989e96..47f10f1f65f4a 100644 --- 
a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -48,9 +48,9 @@ import pandas.core.common as com if TYPE_CHECKING: - from pandas.core.series import Series # noqa: F401 - from pandas.core.indexes.api import Index # noqa: F401 from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas.core.indexes.api import Index # noqa: F401 + from pandas.core.series import Series # noqa: F401 def array( @@ -255,14 +255,14 @@ def array( ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( - period_array, BooleanArray, + DatetimeArray, IntegerArray, IntervalArray, PandasArray, - DatetimeArray, - TimedeltaArray, StringArray, + TimedeltaArray, + period_array, ) if lib.is_scalar(data): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6b84f0e81f48b..228329898b6a4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1244,6 +1244,7 @@ def try_datetime(v): # if so coerce to a DatetimeIndex; if they are not the same, # then these stay as object dtype, xref GH19671 from pandas._libs.tslibs import conversion + from pandas import DatetimeIndex try: @@ -1303,8 +1304,8 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ - from pandas.core.tools.timedeltas import to_timedelta from pandas.core.tools.datetimes import to_datetime + from pandas.core.tools.timedeltas import to_timedelta if dtype is not None: if isinstance(dtype, str): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 22480fbc47508..8350e136417b1 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -30,12 +30,13 @@ if TYPE_CHECKING: import pyarrow # noqa: F401 + + from pandas import Categorical # noqa: F401 from pandas.core.arrays import ( # noqa: F401 + DatetimeArray, IntervalArray, PeriodArray, - DatetimeArray, ) - from pandas import Categorical # noqa: F401 
str_type = str @@ -391,12 +392,13 @@ def __repr__(self) -> str_type: @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: + from pandas.core.dtypes.common import DT64NS_DTYPE, is_datetime64tz_dtype + from pandas.core.util.hashing import ( - hash_array, _combine_hash_arrays, + hash_array, hash_tuples, ) - from pandas.core.dtypes.common import is_datetime64tz_dtype, DT64NS_DTYPE if len(categories) and isinstance(categories[0], tuple): # assumes if any individual category is a tuple, then all our. ATM @@ -939,6 +941,7 @@ def __from_arrow__( Construct PeriodArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 + from pandas.core.arrays import PeriodArray from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -1136,6 +1139,7 @@ def __from_arrow__( Construct IntervalArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 + from pandas.core.arrays import IntervalArray if isinstance(array, pyarrow.Array): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f52341ed782d8..3e4c9393f74de 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -150,6 +150,7 @@ if TYPE_CHECKING: from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.io.formats.style import Styler # --------------------------------------------------------------------- @@ -5204,8 +5205,9 @@ def duplicated( 4 True dtype: bool """ + from pandas._libs.hashtable import _SIZE_HINT_LIMIT, duplicated_int64 + from pandas.core.sorting import get_group_index - from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT if self.empty: return self._constructor_sliced(dtype=bool) @@ -7867,8 +7869,8 @@ def join( def _join_compat( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False ): - from pandas.core.reshape.merge import merge from pandas.core.reshape.concat import concat + from pandas.core.reshape.merge import merge if isinstance(other, Series): if other.name is None: diff --git 
a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec7b14f27c5a1..c50b753cf3293 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -681,8 +681,8 @@ def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers + from pandas.core.reshape.tile import cut if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 67003dffb90bb..8239a792c65dd 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -237,7 +237,6 @@ def __new__(cls, *args, **kwargs): # core/groupby/grouper.py::Grouper # raising these warnings from TimeGrouper directly would fail the test: # tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base - # hacky way to set the stacklevel: if cls is TimeGrouper it means # that the call comes from a pandas internal call of resample, # otherwise it comes from pd.Grouper diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 986d6323e704e..1be381e38b157 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5731,9 +5731,9 @@ def _maybe_cast_data_without_dtype(subarr): """ # Runtime import needed bc IntervalArray imports Index from pandas.core.arrays import ( + DatetimeArray, IntervalArray, PeriodArray, - DatetimeArray, TimedeltaArray, ) diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index fd9a9a5ef6c93..6eedf72726acb 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -5,8 +5,8 @@ from pandas._typing import ArrayLike if TYPE_CHECKING: - from pandas.core.internals.managers import BlockManager # noqa:F401 from pandas.core.internals.blocks import Block # noqa:F401 + from pandas.core.internals.managers import BlockManager # noqa:F401 
def operate_blockwise( diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a1db7742916de..6702bf519c52e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -155,7 +155,7 @@ def _map_stringarray( an ndarray. """ - from pandas.arrays import IntegerArray, StringArray, BooleanArray + from pandas.arrays import BooleanArray, IntegerArray, StringArray mask = isna(arr) @@ -2186,7 +2186,7 @@ def _wrap_result( returns_string=True, ): - from pandas import Index, Series, MultiIndex + from pandas import Index, MultiIndex, Series # for category, we do the stuff on the categories, so blow it up # to the full series again @@ -2292,7 +2292,7 @@ def _get_series_list(self, others): list of Series Others transformed into list of Series. """ - from pandas import Series, DataFrame + from pandas import DataFrame, Series # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0adab143f6052..7aac2f793f61a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -53,9 +53,10 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from pandas import Series # noqa:F401 from pandas._libs.tslibs.nattype import NaTType # noqa:F401 + from pandas import Series # noqa:F401 + # --------------------------------------------------------------------- # types used in annotations @@ -876,7 +877,7 @@ def _assemble_from_unit_mappings(arg, errors, tz): ------- Series """ - from pandas import to_timedelta, to_numeric, DataFrame + from pandas import DataFrame, to_numeric, to_timedelta arg = DataFrame(arg) if not arg.columns.is_unique: diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 1b56b6d5a46fa..d79b9f4092325 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -275,7 +275,7 @@ def hash_array( # then hash and rename categories. 
We allow skipping the categorization # when the values are known/likely to be unique. if categorize: - from pandas import factorize, Categorical, Index + from pandas import Categorical, Index, factorize codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 40bff5a75709b..d16955a98b62f 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -311,17 +311,17 @@ def init_windows_clipboard(): global HGLOBAL, LPVOID, DWORD, LPCSTR, INT global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE from ctypes.wintypes import ( - HGLOBAL, - LPVOID, + BOOL, DWORD, - LPCSTR, - INT, - HWND, + HANDLE, + HGLOBAL, HINSTANCE, HMENU, - BOOL, + HWND, + INT, + LPCSTR, + LPVOID, UINT, - HANDLE, ) windll = ctypes.windll @@ -528,8 +528,8 @@ def determine_clipboard(): # Setup for the MAC OS X platform: if os.name == "mac" or platform.system() == "Darwin": try: - import Foundation # check if pyobjc is installed import AppKit + import Foundation # check if pyobjc is installed except ImportError: return init_osx_pbcopy_clipboard() else: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2a12f779230b2..b1bbda4a4b7e0 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -834,8 +834,8 @@ class ExcelFile: from pandas.io.excel._odfreader import _ODFReader from pandas.io.excel._openpyxl import _OpenpyxlReader - from pandas.io.excel._xlrd import _XlrdReader from pandas.io.excel._pyxlsb import _PyxlsbReader + from pandas.io.excel._xlrd import _XlrdReader _engines = { "xlrd": _XlrdReader, diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 85ec9afaaec25..44abaf5d3b3c9 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -191,9 +191,9 @@ def _get_cell_string_value(self, cell) -> str: Find and decode OpenDocument text:s tags that 
represent a run length encoded sequence of space characters. """ - from odf.element import Text, Element - from odf.text import S, P + from odf.element import Element, Text from odf.namespaces import TEXTNS + from odf.text import P, S text_p = P().qname text_s = S().qname diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 0696d82e51f34..03a30cbd62f9a 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -225,7 +225,7 @@ def _convert_to_fill(cls, fill_dict): ------- fill : openpyxl.styles.Fill """ - from openpyxl.styles import PatternFill, GradientFill + from openpyxl.styles import GradientFill, PatternFill _pattern_fill_key_map = { "patternType": "fill_type", diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 8f7d3b1368fc7..af82c15fd6b66 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -48,11 +48,11 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float): from xlrd import ( - xldate, + XL_CELL_BOOLEAN, XL_CELL_DATE, XL_CELL_ERROR, - XL_CELL_BOOLEAN, XL_CELL_NUMBER, + xldate, ) epoch1904 = self.book.datemode diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index fe85eab4bfbf5..c05f79f935548 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -72,7 +72,7 @@ from pandas.io.formats.printing import adjoin, justify, pprint_thing if TYPE_CHECKING: - from pandas import Series, DataFrame, Categorical + from pandas import Categorical, DataFrame, Series FormattersType = Union[ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index d11144938eb26..fd1efa2d1b668 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -42,8 +42,8 @@ try: - import matplotlib.pyplot as plt from matplotlib import colors + import matplotlib.pyplot as plt has_mpl = True except ImportError: diff --git 
a/pandas/io/html.py b/pandas/io/html.py index 3193f52d239f1..8354cf413814e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -707,8 +707,8 @@ def _build_doc(self): -------- pandas.io.html._HtmlFrameParser._build_doc """ - from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError + from lxml.html import HTMLParser, fromstring, parse parser = HTMLParser(recover=True, encoding=self.encoding) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b67a1c5781d91..e0df4c29e543e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -57,7 +57,7 @@ from pandas.io.formats.printing import adjoin, pprint_thing if TYPE_CHECKING: - from tables import File, Node, Col # noqa:F401 + from tables import Col, File, Node # noqa:F401 # versioning attribute diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 0038e39e2ffcc..17b41fd2b4379 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -1,8 +1,8 @@ # cython: profile=False # cython: boundscheck=False, initializedcheck=False from cython import Py_ssize_t - import numpy as np + import pandas.io.sas.sas_constants as const ctypedef signed long long int64_t diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9177696ca13d6..c87391eaa62b1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -937,7 +937,7 @@ def _get_column_names_and_types(self, dtype_mapper): return column_names_and_types def _create_table_setup(self): - from sqlalchemy import Table, Column, PrimaryKeyConstraint + from sqlalchemy import Column, PrimaryKeyConstraint, Table column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type) @@ -1026,15 +1026,15 @@ def _sqlalchemy_type(self, col): col_type = lib.infer_dtype(col, skipna=True) from sqlalchemy.types import ( + TIMESTAMP, BigInteger, - Integer, - Float, - Text, Boolean, - DateTime, Date, + DateTime, + Float, + Integer, + Text, Time, - TIMESTAMP, ) if col_type == "datetime64" or col_type == "datetime": @@ -1079,7 
+1079,7 @@ def _sqlalchemy_type(self, col): return Text def _get_dtype(self, sqltype): - from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP + from sqlalchemy.types import TIMESTAMP, Boolean, Date, DateTime, Float, Integer if isinstance(sqltype, Float): return float @@ -1374,7 +1374,7 @@ def to_sql( dtype = {col_name: dtype for col_name in frame} if dtype is not None: - from sqlalchemy.types import to_instance, TypeEngine + from sqlalchemy.types import TypeEngine, to_instance for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 353bc8a8936a5..b490e07e43753 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1149,8 +1149,8 @@ def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): @classmethod def _ts_plot(cls, ax, x, data, style=None, **kwds): from pandas.plotting._matplotlib.timeseries import ( - _maybe_resample, _decorate_axes, + _maybe_resample, format_dateaxis, ) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 8f3571cf13cbc..95f9fbf3995ed 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -24,7 +24,7 @@ from pandas.tseries.frequencies import get_period_alias, is_subperiod, is_superperiod if TYPE_CHECKING: - from pandas import Series, Index # noqa:F401 + from pandas import Index, Series # noqa:F401 # --------------------------------------------------------------------- diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ecd20796b6f21..caa348d3a1fb9 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -267,9 +267,10 @@ def test_sparsearray(): def test_np(): - import numpy as np import warnings + import numpy as np + with warnings.catch_warnings(): warnings.simplefilter("ignore", 
FutureWarning) assert (pd.np.arange(0, 10) == np.arange(0, 10)).all() diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index d517eaaec68d2..0176755b54dd1 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -142,6 +142,7 @@ def test_repr(): @pyarrow_skip def test_arrow_extension_type(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType p1 = ArrowIntervalType(pa.int64(), "left") @@ -158,6 +159,7 @@ def test_arrow_extension_type(): @pyarrow_skip def test_arrow_array(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType intervals = pd.interval_range(1, 5, freq=1).array @@ -187,6 +189,7 @@ def test_arrow_array(): @pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) @@ -221,6 +224,7 @@ def test_arrow_array_missing(): ) def test_arrow_table_roundtrip(breaks): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks(breaks) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 8887dd0278afe..0d81e8e733842 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -359,6 +359,7 @@ def test_arrow_extension_type(): ) def test_arrow_array(data, freq): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType periods = period_array(data, freq=freq) @@ -384,6 +385,7 @@ def test_arrow_array(data, freq): @pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 2, 3], freq="D") @@ -399,6 +401,7 @@ def test_arrow_array_missing(): @pyarrow_skip def test_arrow_table_roundtrip(): import pyarrow as pa + from 
pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 2, 3], freq="D") diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9d6b9f39a0578..52a1e3aae9058 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -287,7 +287,7 @@ def test_stat_op_api(self, float_frame, float_string_frame): assert_stat_op_api("median", float_frame, float_string_frame) try: - from scipy.stats import skew, kurtosis # noqa:F401 + from scipy.stats import kurtosis, skew # noqa:F401 assert_stat_op_api("skew", float_frame, float_string_frame) assert_stat_op_api("kurt", float_frame, float_string_frame) @@ -370,7 +370,7 @@ def kurt(x): ) try: - from scipy import skew, kurtosis # noqa:F401 + from scipy import kurtosis, skew # noqa:F401 assert_stat_op_calc("skew", skewness, float_frame_with_na) assert_stat_op_calc("kurt", kurt, float_frame_with_na) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index ec4162f87010f..7bb1d98086a91 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -59,6 +59,7 @@ def test_reindex_with_same_tz(self): def test_time_loc(self): # GH8667 from datetime import time + from pandas._libs.index import _SIZE_CUTOFF ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index be193e0854d8d..d8e56661b7d61 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -15,7 +15,7 @@ def test_multiindex_get_loc(): # GH7724, GH2646 with warnings.catch_warnings(record=True): # test indexing into a multi-index before & past the lexsort depth - from numpy.random import randint, choice, randn + from numpy.random import choice, randint, randn cols = ["jim", "joe", 
"jolie", "joline", "jolia"] diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index c397a61616c1c..d64e2d1933ace 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -37,8 +37,8 @@ def test_read_csv(cleared_fs): def test_reasonable_error(monkeypatch, cleared_fs): - from fsspec.registry import known_implementations from fsspec import registry + from fsspec.registry import known_implementations registry.target.clear() with pytest.raises(ValueError) as e: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4d93119ffa3f5..eacf4fa08545d 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -11,8 +11,7 @@ @td.skip_if_no("gcsfs") def test_read_csv_gcs(monkeypatch): - from fsspec import AbstractFileSystem - from fsspec import registry + from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state @@ -37,8 +36,7 @@ def open(*args, **kwargs): @td.skip_if_no("gcsfs") def test_to_csv_gcs(monkeypatch): - from fsspec import AbstractFileSystem - from fsspec import registry + from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state df1 = DataFrame( @@ -76,8 +74,7 @@ def mock_get_filepath_or_buffer(*args, **kwargs): @td.skip_if_no("gcsfs") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" - from fsspec import AbstractFileSystem - from fsspec import registry + from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state df1 = DataFrame( diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 0991fae39138e..29b787d39c09d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -48,10 +48,10 @@ try: import sqlalchemy - import sqlalchemy.schema - import sqlalchemy.sql.sqltypes as sqltypes from sqlalchemy.ext import declarative from sqlalchemy.orm import session 
as sa_session + import sqlalchemy.schema + import sqlalchemy.sql.sqltypes as sqltypes SQLALCHEMY_INSTALLED = True except ImportError: diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 896d3278cdde1..3b1ff233c5ec1 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -13,7 +13,6 @@ from pandas import DataFrame, Series import pandas._testing as tm - """ This is a common base class used for various plotting tests """ @@ -24,6 +23,7 @@ class TestPlotBase: def setup_method(self, method): import matplotlib as mpl + from pandas.plotting._matplotlib import compat mpl.rcdefaults() @@ -187,8 +187,8 @@ def _check_colors( Series used for color grouping key used for andrew_curves, parallel_coordinates, radviz test """ + from matplotlib.collections import Collection, LineCollection, PolyCollection from matplotlib.lines import Line2D - from matplotlib.collections import Collection, PolyCollection, LineCollection conv = self.colorconverter if linecolors is not None: diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 317a994bd9a32..ee43e5d7072fe 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2408,8 +2408,8 @@ def test_specified_props_kwd_plot_box(self, props, expected): assert result[expected][0].get_color() == "C1" def test_default_color_cycle(self): - import matplotlib.pyplot as plt import cycler + import matplotlib.pyplot as plt colors = list("rgbk") plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors) @@ -2953,8 +2953,8 @@ def _check(axes): @td.skip_if_no_scipy def test_memory_leak(self): """ Check that every plot type gets properly collected. 
""" - import weakref import gc + import weakref results = {} for kind in plotting.PlotAccessor._all_kinds: @@ -3032,8 +3032,8 @@ def test_df_subplots_patterns_minorticks(self): @pytest.mark.slow def test_df_gridspec_patterns(self): # GH 10819 - import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec + import matplotlib.pyplot as plt ts = Series(np.random.randn(10), index=date_range("1/1/2000", periods=10)) @@ -3422,9 +3422,9 @@ def test_xlabel_ylabel_dataframe_subplots( def _generate_4_axes_via_gridspec(): - import matplotlib.pyplot as plt import matplotlib as mpl import matplotlib.gridspec # noqa + import matplotlib.pyplot as plt gs = mpl.gridspec.GridSpec(2, 2) ax_tl = plt.subplot(gs[0, 0]) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index b6a6c326c3df3..34c881855d16a 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -101,7 +101,7 @@ def test_hist_layout_with_by(self): @pytest.mark.slow def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf + from matplotlib.pyplot import gcf, subplot x = Series(randn(2)) y = Series(randn(2)) @@ -352,6 +352,7 @@ class TestDataFrameGroupByPlots(TestPlotBase): @pytest.mark.slow def test_grouped_hist_legacy(self): from matplotlib.patches import Rectangle + from pandas.plotting._matplotlib.hist import _grouped_hist df = DataFrame(randn(500, 2), columns=["A", "B"]) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 75eeede472fe9..f5c1c58f3f7ed 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -131,9 +131,10 @@ def test_scatter_matrix_axis(self): @pytest.mark.slow def test_andrews_curves(self, iris): - from pandas.plotting import andrews_curves from matplotlib import cm + from pandas.plotting import andrews_curves + df = iris _check_plot_works(andrews_curves, frame=df, class_column="Name") @@ -206,9 +207,10 @@ 
def test_andrews_curves(self, iris): @pytest.mark.slow def test_parallel_coordinates(self, iris): - from pandas.plotting import parallel_coordinates from matplotlib import cm + from pandas.plotting import parallel_coordinates + df = iris ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") @@ -279,9 +281,10 @@ def test_parallel_coordinates_with_sorted_labels(self): @pytest.mark.slow def test_radviz(self, iris): - from pandas.plotting import radviz from matplotlib import cm + from pandas.plotting import radviz + df = iris _check_plot_works(radviz, frame=df, class_column="Name") @@ -397,6 +400,7 @@ def test_get_standard_colors_no_appending(self): # Make sure not to add more colors so that matplotlib can cycle # correctly. from matplotlib import cm + from pandas.plotting._matplotlib.style import _get_standard_colors color_before = cm.gnuplot(range(5)) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 151bb3bed7207..cc00626e992f3 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -452,7 +452,7 @@ def test_hist_layout_with_by(self): @pytest.mark.slow def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf + from matplotlib.pyplot import gcf, subplot x = Series(randn(2)) y = Series(randn(2)) @@ -827,6 +827,7 @@ def test_standard_colors(self): @pytest.mark.slow def test_standard_colors_all(self): import matplotlib.colors as colors + from pandas.plotting._matplotlib.style import _get_standard_colors # multiple colors like mediumaquamarine diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 0b34fab7b80b1..088f8681feb99 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -11,7 +11,6 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas._testing as tm - """ Also test support for 
datetime64[ns] in Series / DataFrame """ @@ -166,6 +165,7 @@ def test_getitem_setitem_datetime_tz_pytz(): def test_getitem_setitem_datetime_tz_dateutil(): from dateutil.tz import tzutc + from pandas._libs.tslibs.timezones import dateutil_gettz as gettz tz = ( diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index 19caf4eccf748..4b4ef5ea046be 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -90,7 +90,7 @@ def test_with_nan(self): tm.assert_series_equal(result, expected) def test_periodindex(self): - from pandas import period_range, PeriodIndex + from pandas import PeriodIndex, period_range # array or list or dates N = 50 diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 5c8a0d224c4f9..ef2bafd4ea2ad 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -195,8 +195,8 @@ def test_add_with_duplicate_index(self): tm.assert_series_equal(result, expected) def test_add_na_handling(self): - from decimal import Decimal from datetime import date + from decimal import Decimal s = Series( [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index e718a6b759963..b32c5e91af295 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -90,7 +90,7 @@ def test_statsmodels(): def test_scikit_learn(df): sklearn = import_module("sklearn") # noqa - from sklearn import svm, datasets + from sklearn import datasets, svm digits = datasets.load_digits() clf = svm.SVC(gamma=0.001, C=100.0) diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index f413490764124..3a8a1a3144269 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -53,8 +53,8 @@ def plot(self, left, right, labels=None, vertical: bool = True): vertical : bool, default True If 
True, use vertical layout. If False, use horizontal layout. """ - import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec + import matplotlib.pyplot as plt if not isinstance(left, list): left = [left] diff --git a/requirements-dev.txt b/requirements-dev.txt index 7bf3df176b378..c0dd77cd73ddc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,7 +11,7 @@ cpplint flake8<3.8.0 flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 -isort==4.3.21 +isort>=5.2.1 mypy==0.730 pycodestyle gitpython From b996b86d227afb8543bee4d8e3c7261608dbcad5 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 4 Aug 2020 02:00:49 -0700 Subject: [PATCH 05/83] Backport PR #35467: CI: activate github actions on 1.1.x (PR only) (#35535) Co-authored-by: Simon Hawkins --- .github/workflows/ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db1fc30111a2d..149acef72db26 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,9 @@ on: push: branches: master pull_request: - branches: master + branches: + - master + - 1.1.x env: ENV_FILE: environment.yml From 640db0b3cb471dc80e147dc172f15e713fdf2403 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 4 Aug 2020 02:01:46 -0700 Subject: [PATCH 06/83] Backport PR #35468: CI: activate azure pipelines on 1.1.x (#35536) Co-authored-by: Simon Hawkins --- azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e45cafc02cb61..113ad3e338952 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,9 +1,11 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml trigger: - master +- 1.1.x pr: - master +- 1.1.x variables: PYTEST_WORKERS: auto From 69165d12644768c7a22b7ae798e34ca6ce4f28e1 Mon Sep 17 00:00:00 2001 From: 
MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 4 Aug 2020 02:53:41 -0700 Subject: [PATCH 07/83] Backport PR #35502: CI: xfail numpy-dev (#35537) Co-authored-by: Simon Hawkins --- pandas/tests/indexes/common.py | 12 +++++++++++- pandas/tests/indexing/test_loc.py | 3 +++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c8b780455f862..f5b9f4a401e60 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -5,6 +5,7 @@ import pytest from pandas._libs import iNaT +from pandas.compat.numpy import _is_numpy_dev from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_datetime64tz_dtype @@ -417,7 +418,7 @@ def test_set_ops_error_cases(self, case, method, index): with pytest.raises(TypeError, match=msg): getattr(index, method)(case) - def test_intersection_base(self, index): + def test_intersection_base(self, index, request): if isinstance(index, CategoricalIndex): return @@ -434,6 +435,15 @@ def test_intersection_base(self, index): # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: + # https://github.com/pandas-dev/pandas/issues/35481 + if ( + _is_numpy_dev + and isinstance(case, Series) + and isinstance(index, UInt64Index) + ): + mark = pytest.mark.xfail(reason="gh-35481") + request.node.add_marker(mark) + result = first.intersection(case) assert tm.equalContents(result, second) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 30b13b6ea9fce..193800fae751f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat.numpy import _is_numpy_dev + import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range import pandas._testing as tm @@ -945,6 +947,7 @@ def test_loc_setitem_empty_append(self): df.loc[0, "x"] = 
expected.loc[0, "x"] tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(_is_numpy_dev, reason="gh-35481") def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe From 1ab59a83adf9c50a9fabdb609c7a2ff369a30e6b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 4 Aug 2020 03:37:59 -0700 Subject: [PATCH 08/83] Backport PR #35440: BUG: CategoricalIndex.format (#35539) Co-authored-by: Terji Petersen --- doc/source/whatsnew/v1.1.1.rst | 7 +++++++ pandas/core/indexes/category.py | 12 ++++++------ pandas/core/indexes/range.py | 7 +------ pandas/tests/indexes/categorical/test_category.py | 6 ++++++ pandas/tests/indexes/common.py | 6 ++++++ pandas/tests/indexes/datetimes/test_datetimelike.py | 6 ++++++ pandas/tests/indexes/test_base.py | 7 +++++-- pandas/tests/indexes/test_numeric.py | 7 +++++++ pandas/tests/io/formats/test_format.py | 9 +++++++++ 9 files changed, 53 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 443589308ad4c..815ce2c4c2905 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -26,6 +26,13 @@ Fixed regressions Bug fixes ~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- Bug in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) + + **Datetimelike** - diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b0b008de69a94..74b235655e345 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -20,7 +20,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import accessor from 
pandas.core.algorithms import take_1d @@ -348,12 +348,12 @@ def _format_attrs(self): return attrs def _format_with_header(self, header, na_rep="NaN") -> List[str]: - from pandas.io.formats.format import format_array + from pandas.io.formats.printing import pprint_thing - formatted_values = format_array( - self._values, formatter=None, na_rep=na_rep, justify="left" - ) - result = ibase.trim_front(formatted_values) + result = [ + pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep + for x in self._values + ] return header + result # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index e5e98039ff77b..eee610681087d 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, List, Optional +from typing import Any, Optional import warnings import numpy as np @@ -33,8 +33,6 @@ from pandas.core.indexes.numeric import Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.io.formats.printing import pprint_thing - _empty_range = range(0) @@ -197,9 +195,6 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None - def _format_with_header(self, header, na_rep="NaN") -> List[str]: - return header + [pprint_thing(x) for x in self._range] - # -------------------------------------------------------------------- _deprecation_message = ( "RangeIndex.{} is deprecated and will be " diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 7f30a77872bc1..8af26eef504fc 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -478,3 +478,9 @@ def test_reindex_base(self): def test_map_str(self): # See test_map.py pass + + def 
test_format_different_scalar_lengths(self): + # GH35439 + idx = CategoricalIndex(["aaaaaaaaa", "b"]) + expected = ["aaaaaaaaa", "b"] + assert idx.format() == expected diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index f5b9f4a401e60..3b41c4bfacf73 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -642,6 +642,12 @@ def test_equals_op(self): tm.assert_numpy_array_equal(index_a == item, expected3) tm.assert_series_equal(series_a == item, Series(expected3)) + def test_format(self): + # GH35439 + idx = self.create_index() + expected = [str(x) for x in idx] + assert idx.format() == expected + def test_hasnans_isnans(self, index): # GH 11343, added tests for hasnans / isnans if isinstance(index, MultiIndex): diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 7345ae3032463..a5abf2946feda 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -20,6 +20,12 @@ def index(self, request): def create_index(self) -> DatetimeIndex: return date_range("20130101", periods=5) + def test_format(self): + # GH35439 + idx = self.create_index() + expected = [f"{x:%Y-%m-%d}" for x in idx] + assert idx.format() == expected + def test_shift(self): pass # handled in test_ops diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index eaf48421dc071..59ee88117a984 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1171,8 +1171,11 @@ def test_summary_bug(self): assert "~:{range}:0" in result assert "{other}%s" in result - def test_format(self, index): - self._check_method_works(Index.format, index) + def test_format_different_scalar_lengths(self): + # GH35439 + idx = Index(["aaaaaaaaa", "b"]) + expected = ["aaaaaaaaa", "b"] + assert idx.format() == expected def test_format_bug(self): # GH 14626 diff --git 
a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index a7c5734ef9b02..bfcac5d433d2c 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -21,6 +21,13 @@ def test_can_hold_identifiers(self): key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is False + def test_format(self): + # GH35439 + idx = self.create_index() + max_width = max(len(str(x)) for x in idx) + expected = [str(x).ljust(max_width) for x in idx] + assert idx.format() == expected + def test_numeric_compat(self): pass # override Base method diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index e236b3da73c69..84805d06df4a8 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2141,6 +2141,15 @@ def test_dict_entries(self): assert "'a': 1" in val assert "'b': 2" in val + def test_categorical_columns(self): + # GH35439 + data = [[4, 2], [3, 2], [4, 3]] + cols = ["aaaaaaaaa", "b"] + df = pd.DataFrame(data, columns=cols) + df_cat_cols = pd.DataFrame(data, columns=pd.CategoricalIndex(cols)) + + assert df.to_string() == df_cat_cols.to_string() + def test_period(self): # GH 12615 df = pd.DataFrame( From bea9af893aee1d585a6387b5724d92ecb3283d1a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 4 Aug 2020 03:56:32 -0700 Subject: [PATCH 09/83] Backport PR #35510: REGR: Check for float in isnaobj_old (#35540) Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- doc/source/whatsnew/v1.1.1.rst | 2 +- pandas/_libs/missing.pyx | 5 ++++- pandas/tests/io/parser/test_common.py | 12 +++++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 815ce2c4c2905..6a327a4fc732f 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -15,7 +15,7 @@ including other 
versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). - - diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 760fab3781fd4..771e8053ac9be 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -155,7 +155,10 @@ def isnaobj_old(arr: ndarray) -> ndarray: result = np.zeros(n, dtype=np.uint8) for i in range(n): val = arr[i] - result[i] = checknull(val) or val == INF or val == NEGINF + result[i] = ( + checknull(val) + or util.is_float_object(val) and (val == INF or val == NEGINF) + ) return result.view(np.bool_) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 12e73bae40eac..5154a9ba6fdf0 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -18,7 +18,7 @@ from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context import pandas._testing as tm from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser @@ -2179,3 +2179,13 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser = all_parsers with pytest.raises(ValueError, match="Names should be an ordered collection."): parser.read_csv(StringIO(data), names=set("QAZ")) + + +def test_read_csv_with_use_inf_as_na(all_parsers): + # https://github.com/pandas-dev/pandas/issues/35493 + parser = all_parsers + data = "1.0\nNaN\n3.0" + with option_context("use_inf_as_na", True): + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([1.0, np.nan, 3.0]) + tm.assert_frame_equal(result, expected) From a48cc4d668c565caf6d2bb916fc627e5c08b9aa1 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine 
<39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 6 Aug 2020 14:50:02 -0700 Subject: [PATCH 10/83] Backport PR #35513: BUG: RollingGroupby respects __getitem__ (#35591) Co-authored-by: Matthew Roeschke --- doc/source/whatsnew/v1.1.1.rst | 3 +-- pandas/core/window/rolling.py | 4 ++++ pandas/tests/window/test_grouper.py | 25 +++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 6a327a4fc732f..5e36bfe6b6307 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -16,8 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). -- -- +- Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 445f179248226..87bcaa7d9512f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2220,6 +2220,10 @@ def _apply( def _constructor(self): return Rolling + @cache_readonly + def _selected_obj(self): + return self._groupby._selected_obj + def _create_blocks(self, obj: FrameOrSeries): """ Split data into blocks & return conformed data. 
diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 744ca264e91d9..ca5a9eccea4f5 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -214,3 +214,28 @@ def foo(x): name="value", ) tm.assert_series_equal(result, expected) + + def test_groupby_subselect_rolling(self): + # GH 35486 + df = DataFrame( + {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0], "c": [10, 20, 30, 20]} + ) + result = df.groupby("a")[["b"]].rolling(2).max() + expected = DataFrame( + [np.nan, np.nan, 2.0, np.nan], + columns=["b"], + index=pd.MultiIndex.from_tuples( + ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None] + ), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].rolling(2).max() + expected = Series( + [np.nan, np.nan, 2.0, np.nan], + index=pd.MultiIndex.from_tuples( + ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None] + ), + name="b", + ) + tm.assert_series_equal(result, expected) From d8cdcf0d55be811adacdd3fe7f450a5f82dfccb9 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 6 Aug 2020 16:19:30 -0700 Subject: [PATCH 11/83] Backport PR #35554: DOC: Document that read_hdf can use pickle (#35593) Co-authored-by: Tom Augspurger --- doc/source/user_guide/io.rst | 9 ++++--- pandas/io/pytables.py | 51 +++++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d4be9d802d697..cc42f952b1733 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3441,10 +3441,11 @@ for some advanced strategies .. warning:: - pandas requires ``PyTables`` >= 3.0.0. - There is a indexing bug in ``PyTables`` < 3.2 which may appear when querying stores using an index. - If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. 
- Stores created previously will need to be rewritten using the updated version. + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle. Loading pickled data received from + untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. .. ipython:: python :suppress: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e0df4c29e543e..6497067e3930c 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -289,7 +289,15 @@ def read_hdf( Read from the store, close it if we opened it. Retrieve pandas object stored in file, optionally based on where - criteria + criteria. + + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. Parameters ---------- @@ -445,6 +453,14 @@ class HDFStore: Either Fixed or Table format. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- path : str @@ -789,6 +805,14 @@ def select( """ Retrieve pandas object stored in file, optionally based on where criteria. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- key : str @@ -852,6 +876,15 @@ def select_as_coordinates( """ return the selection as an Index + .. 
warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + + Parameters ---------- key : str @@ -876,6 +909,14 @@ def select_column( return a single column from the table. This is generally only useful to select an indexable + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- key : str @@ -912,6 +953,14 @@ def select_as_multiple( """ Retrieve pandas objects from multiple tables. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. 
+ Parameters ---------- keys : a list of the tables From 5904807a6cc1d2ccccd6d326ac2dd5d768f7f24e Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 7 Aug 2020 01:32:54 -0700 Subject: [PATCH 12/83] Backport PR #35590: BUG: validate index/data length match in DataFrame construction (#35597) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.1.rst | 4 ++++ pandas/core/internals/blocks.py | 3 --- pandas/core/internals/managers.py | 2 +- pandas/tests/frame/test_constructors.py | 6 ++++++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 5e36bfe6b6307..7db609fba5d68 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -50,6 +50,10 @@ Categorical - +**DataFrame** +- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) +- + .. --------------------------------------------------------------------------- .. 
_whatsnew_111.contributors: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6ca6eca1ff829..f4f4a3666a84e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -105,7 +105,6 @@ class Block(PandasObject): is_extension = False _can_hold_na = False _can_consolidate = True - _verify_integrity = True _validate_ndim = True @classmethod @@ -1525,7 +1524,6 @@ class ExtensionBlock(Block): """ _can_consolidate = False - _verify_integrity = False _validate_ndim = False is_extension = True @@ -2613,7 +2611,6 @@ def _replace_coerce( class CategoricalBlock(ExtensionBlock): __slots__ = () is_categorical = True - _verify_integrity = True _can_hold_na = True should_store = Block.should_store diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 895385b170c91..0ce2408eb003e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -312,7 +312,7 @@ def _verify_integrity(self) -> None: mgr_shape = self.shape tot_items = sum(len(x.mgr_locs) for x in self.blocks) for block in self.blocks: - if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: + if block.shape[1:] != mgr_shape[1:]: raise construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: raise AssertionError( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a4ed548264d39..b78bb1c492ef4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2619,6 +2619,12 @@ class DatetimeSubclass(datetime): data = pd.DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]}) assert data.datetime.dtype == "datetime64[ns]" + def test_with_mismatched_index_length_raises(self): + # GH#33437 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + with pytest.raises(ValueError, match="Shape of passed values"): + DataFrame(dti, index=range(4)) + class 
TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): From 33ec020c8d5cf61eeba6b2712d4edbe83df9708d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 7 Aug 2020 05:29:44 -0700 Subject: [PATCH 13/83] Backport PR #35547: Bug fix one element series truncate (#35600) Co-authored-by: gabicca <33315687+gabicca@users.noreply.github.com> --- doc/source/whatsnew/v1.1.1.rst | 2 +- pandas/core/generic.py | 2 +- pandas/tests/series/methods/test_truncate.py | 11 +++++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 7db609fba5d68..45f1015a8e7bd 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -48,7 +48,7 @@ Categorical **Indexing** -- +- Bug in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) **DataFrame** - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e46fde1f59f16..a11ee6b5d9846 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9397,7 +9397,7 @@ def truncate( if before > after: raise ValueError(f"Truncate: {after} must be after {before}") - if ax.is_monotonic_decreasing: + if len(ax) > 1 and ax.is_monotonic_decreasing: before, after = after, before slicer = [slice(None, None)] * self._AXIS_LEN diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index 7c82edbaec177..45592f8d99b93 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -141,3 +141,14 @@ def test_truncate_multiindex(self): expected = df.col tm.assert_series_equal(result, expected) + + def test_truncate_one_element_series(self): + # GH 35544 + series = pd.Series([0.1], 
index=pd.DatetimeIndex(["2020-08-04"])) + before = pd.Timestamp("2020-08-02") + after = pd.Timestamp("2020-08-04") + + result = series.truncate(before=before, after=after) + + # the input Series and the expected Series are the same + tm.assert_series_equal(result, series) From 3af8044bdfd9c8aeadab848c192493ede2d1e022 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 7 Aug 2020 05:52:47 -0700 Subject: [PATCH 14/83] Backport PR #35582: BUG: to_timedelta fails on Int64 Series with null values (#35602) Co-authored-by: Eric Goddard --- doc/source/whatsnew/v1.1.1.rst | 5 +++++ pandas/core/arrays/timedeltas.py | 4 +++- pandas/tests/tools/test_to_timedelta.py | 13 +++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 45f1015a8e7bd..d3d96c41483d3 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -37,6 +37,11 @@ Categorical - - +**Timedelta** + +- Bug in :meth:`to_timedelta` fails when arg is a :class:`Series` with `Int64` dtype containing null values (:issue:`35574`) + + **Numeric** - diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a378423df788b..a30e1060c64f1 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -29,7 +29,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr -from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays import IntegerArray, datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com from pandas.core.construction import extract_array @@ -921,6 +921,8 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): inferred_freq = data.freq data = data._data + elif isinstance(data, IntegerArray): + data = 
data.to_numpy("int64", na_value=tslibs.iNaT) # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 1e193f22a6698..f68d83f7f4d58 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -166,3 +166,16 @@ def test_to_timedelta_ignore_strings_unit(self): arr = np.array([1, 2, "error"], dtype=object) result = pd.to_timedelta(arr, unit="ns", errors="ignore") tm.assert_numpy_array_equal(result, arr) + + def test_to_timedelta_nullable_int64_dtype(self): + # GH 35574 + expected = Series([timedelta(days=1), timedelta(days=2)]) + result = to_timedelta(Series([1, 2], dtype="Int64"), unit="days") + + tm.assert_series_equal(result, expected) + + # IntegerArray Series with nulls + expected = Series([timedelta(days=1), None]) + result = to_timedelta(Series([1, None], dtype="Int64"), unit="days") + + tm.assert_series_equal(result, expected) From 3b006521147ac14faf91451cbe90540ef14d63b2 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 7 Aug 2020 06:34:24 -0700 Subject: [PATCH 15/83] Backport PR #35578: BUG: df.shift(n, axis=1) with multiple blocks (#35601) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/internals/managers.py | 25 ++++++++++++++++++++-- pandas/tests/frame/methods/test_shift.py | 27 ++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index d3d96c41483d3..6b315e0a9d016 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). 
- Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) +- Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0ce2408eb003e..4693cc193c27c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -551,6 +551,24 @@ def interpolate(self, **kwargs) -> "BlockManager": return self.apply("interpolate", **kwargs) def shift(self, periods: int, axis: int, fill_value) -> "BlockManager": + if axis == 0 and self.ndim == 2 and self.nblocks > 1: + # GH#35488 we need to watch out for multi-block cases + ncols = self.shape[0] + if periods > 0: + indexer = [-1] * periods + list(range(ncols - periods)) + else: + nper = abs(periods) + indexer = list(range(nper, ncols)) + [-1] * nper + result = self.reindex_indexer( + self.items, + indexer, + axis=0, + fill_value=fill_value, + allow_dups=True, + consolidate=False, + ) + return result + return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) def fillna(self, value, limit, inplace: bool, downcast) -> "BlockManager": @@ -1213,6 +1231,7 @@ def reindex_indexer( fill_value=None, allow_dups: bool = False, copy: bool = True, + consolidate: bool = True, ) -> T: """ Parameters @@ -1223,7 +1242,8 @@ def reindex_indexer( fill_value : object, default None allow_dups : bool, default False copy : bool, default True - + consolidate: bool, default True + Whether to consolidate inplace before reindexing. pandas-indexer with -1's only. 
""" @@ -1236,7 +1256,8 @@ def reindex_indexer( result.axes[axis] = new_axis return result - self._consolidate_inplace() + if consolidate: + self._consolidate_inplace() # some axes don't allow reindexing with dups if not allow_dups: diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 9ec029a6c4304..8f6902eca816f 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -145,6 +145,33 @@ def test_shift_duplicate_columns(self): tm.assert_frame_equal(shifted[0], shifted[1]) tm.assert_frame_equal(shifted[0], shifted[2]) + def test_shift_axis1_multiple_blocks(self): + # GH#35488 + df1 = pd.DataFrame(np.random.randint(1000, size=(5, 3))) + df2 = pd.DataFrame(np.random.randint(1000, size=(5, 2))) + df3 = pd.concat([df1, df2], axis=1) + assert len(df3._mgr.blocks) == 2 + + result = df3.shift(2, axis=1) + + expected = df3.take([-1, -1, 0, 1, 2], axis=1) + expected.iloc[:, :2] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + # Case with periods < 0 + # rebuild df3 because `take` call above consolidated + df3 = pd.concat([df1, df2], axis=1) + assert len(df3._mgr.blocks) == 2 + result = df3.shift(-2, axis=1) + + expected = df3.take([2, 3, 4, -1, -1], axis=1) + expected.iloc[:, -2:] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_tshift(self, datetime_frame): # TODO: remove this test when tshift deprecation is enforced From c1676ce796a6e40d57589d0881158f3da48817d6 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 7 Aug 2020 12:00:46 -0700 Subject: [PATCH 16/83] Backport PR #35562: BUG: Ensure rolling groupby doesn't segfault with center=True (#35610) Co-authored-by: Matthew Roeschke --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/window/indexers.py | 6 +++ 
pandas/tests/window/test_grouper.py | 65 +++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 6b315e0a9d016..a044a4aab284e 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 0898836ed2e0e..bc36bdca982e8 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -319,4 +319,10 @@ def get_window_bounds( end_arrays.append(window_indicies.take(end)) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) + # GH 35552: Need to adjust start and end based on the nans appended to values + # when center=True + if num_values > len(start): + offset = num_values - len(start) + start = np.concatenate([start, np.array([end[-1]] * offset)]) + end = np.concatenate([end, np.array([end[-1]] * offset)]) return start, end diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index ca5a9eccea4f5..5241b9548a442 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -215,6 +215,71 @@ def foo(x): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_center_center(self): + # GH 35552 + series = Series(range(1, 6)) + result = 
series.groupby(series).rolling(center=True, window=3).mean() + expected = Series( + [np.nan] * 5, + index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3), (5, 4))), + ) + tm.assert_series_equal(result, expected) + + series = Series(range(1, 5)) + result = series.groupby(series).rolling(center=True, window=3).mean() + expected = Series( + [np.nan] * 4, + index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3))), + ) + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)}) + result = df.groupby("a").rolling(center=True, window=3).mean() + expected = pd.DataFrame( + [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan], + index=pd.MultiIndex.from_tuples( + ( + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ("a", 4), + ("b", 5), + ("b", 6), + ("b", 7), + ("b", 8), + ("b", 9), + ("b", 10), + ), + names=["a", None], + ), + columns=["b"], + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)}) + result = df.groupby("a").rolling(center=True, window=3).mean() + expected = pd.DataFrame( + [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan], + index=pd.MultiIndex.from_tuples( + ( + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ("a", 4), + ("b", 5), + ("b", 6), + ("b", 7), + ("b", 8), + ("b", 9), + ), + names=["a", None], + ), + columns=["b"], + ) + tm.assert_frame_equal(result, expected) + def test_groupby_subselect_rolling(self): # GH 35486 df = DataFrame( From 6f186f77d45ba82b771764fe68cab7c6b0181891 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 8 Aug 2020 02:21:18 -0700 Subject: [PATCH 17/83] Backport PR #35588: BUG: fix styler cell_ids arg so that blank style is ignored on False (#35619) Co-authored-by: attack68 <24256554+attack68@users.noreply.github.com> --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/io/formats/style.py | 2 +- pandas/tests/io/formats/test_style.py | 6 ++++++ 3 
files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index a044a4aab284e..ade88a6127014 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -27,6 +27,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`). Categorical ^^^^^^^^^^^ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index fd1efa2d1b668..584f42a6cab12 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -390,7 +390,7 @@ def format_attr(pair): "is_visible": (c not in hidden_columns), } # only add an id if the cell has a style - if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""): + if self.cell_ids or (r, c) in ctx: row_dict["id"] = "_".join(cs[1:]) row_es.append(row_dict) props = [] diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 9c6910637fa7e..3ef5157655e78 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1682,6 +1682,12 @@ def f(a, b, styler): result = styler.pipe((f, "styler"), a=1, b=2) assert result == (1, 2, styler) + def test_no_cell_ids(self): + # GH 35588 + df = pd.DataFrame(data=[[0]]) + s = Styler(df, uuid="_", cell_ids=False).render() + assert s.find('') != -1 + @td.skip_if_no_mpl class TestStylerMatplotlibDep: From c5ded4d6d7609df602f1664c2c35d11b1e29512a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 8 Aug 2020 02:46:38 -0700 Subject: [PATCH 18/83] Backport PR #35473: REGR: Fix conversion of mixed dtype DataFrame to numpy str (#35617) Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/frame.py | 2 ++ pandas/core/internals/managers.py | 2 ++ pandas/tests/frame/test_api.py | 7 +++++++ 4 files changed, 12 
insertions(+) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index ade88a6127014..f0ad9d1ca3b0f 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e4c9393f74de..b7286ce86d24e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1371,6 +1371,8 @@ def to_numpy( result = self._mgr.as_array( transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value ) + if result.dtype is not dtype: + result = np.array(result, dtype=dtype, copy=False) return result diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4693cc193c27c..e6e2b06e1873e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -867,6 +867,8 @@ def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray: dtype = dtype.subtype elif is_extension_array_dtype(dtype): dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" result = np.empty(self.shape, dtype=dtype) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2b79fc8cd3406..cc57a3970d18b 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -367,6 +367,13 @@ def test_to_numpy_copy(self): assert df.to_numpy(copy=False).base is arr assert 
df.to_numpy(copy=True).base is not arr + def test_to_numpy_mixed_dtype_to_str(self): + # https://github.com/pandas-dev/pandas/issues/35455 + df = pd.DataFrame([[pd.Timestamp("2020-01-01 00:00:00"), 100.0]]) + result = df.to_numpy(dtype=str) + expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str) + tm.assert_numpy_array_equal(result, expected) + def test_swapaxes(self): df = DataFrame(np.random.randn(10, 5)) tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) From 396ba93e629763e0d1c13dd5385f11eedb048dce Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 8 Aug 2020 05:23:06 -0700 Subject: [PATCH 19/83] Backport PR #35621: CI: Linux py36_locale failures with pytest DeprecationWarning (#35623) Co-authored-by: Simon Hawkins --- ci/deps/azure-36-locale.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index a9b9a5a47ccf5..3034ed3dc43af 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0 # https://github.com/pandas-dev/pandas/issues/35620 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 From a057e74f3fe4db176ba39db0e86973156fd95ab1 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 10 Aug 2020 07:48:33 -0700 Subject: [PATCH 20/83] Backport PR #35639: BUG: RollingGroupby with closed and column selection no longer raises ValueError (#35651) Co-authored-by: Matthew Roeschke --- doc/source/whatsnew/v1.1.1.rst | 4 +++ pandas/core/window/common.py | 2 +- pandas/core/window/rolling.py | 10 ++---- pandas/tests/window/test_grouper.py | 51 +++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index f0ad9d1ca3b0f..7f5182e3eaa6f 100644 --- 
a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -51,6 +51,10 @@ Categorical - - +**Groupby/resample/rolling** + +- Bug in :class:`pandas.core.groupby.RollingGroupby` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) + **Plotting** - diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 58e7841d4dde5..51a067427e867 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -52,7 +52,7 @@ def __init__(self, obj, *args, **kwargs): kwargs.pop("parent", None) groupby = kwargs.pop("groupby", None) if groupby is None: - groupby, obj = obj, obj.obj + groupby, obj = obj, obj._selected_obj self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 87bcaa7d9512f..ea03a7f2f8162 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2196,7 +2196,7 @@ def _apply( # Cannot use _wrap_outputs because we calculate the result all at once # Compose MultiIndex result from grouping levels then rolling level # Aggregate the MultiIndex data as tuples then the level names - grouped_object_index = self._groupby._selected_obj.index + grouped_object_index = self.obj.index grouped_index_name = [grouped_object_index.name] groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] result_index_names = groupby_keys + grouped_index_name @@ -2220,10 +2220,6 @@ def _apply( def _constructor(self): return Rolling - @cache_readonly - def _selected_obj(self): - return self._groupby._selected_obj - def _create_blocks(self, obj: FrameOrSeries): """ Split data into blocks & return conformed data. 
@@ -2262,7 +2258,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]] if self.is_freq_type: rolling_indexer = VariableWindowIndexer - index_array = self._groupby._selected_obj.index.asi8 + index_array = self.obj.index.asi8 else: rolling_indexer = FixedWindowIndexer index_array = None @@ -2279,7 +2275,7 @@ def _gotitem(self, key, ndim, subset=None): # here so our index is carried thru to the selected obj # when we do the splitting for the groupby if self.on is not None: - self._groupby.obj = self._groupby.obj.set_index(self._on) + self.obj = self.obj.set_index(self._on) self.on = None return super()._gotitem(key, ndim, subset=subset) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 5241b9548a442..e1dcac06c39cc 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -304,3 +304,54 @@ def test_groupby_subselect_rolling(self): name="b", ) tm.assert_series_equal(result, expected) + + def test_groupby_rolling_subset_with_closed(self): + # GH 35549 + df = pd.DataFrame( + { + "column1": range(6), + "column2": range(6), + "group": 3 * ["A", "B"], + "date": [pd.Timestamp("2019-01-01")] * 6, + } + ) + result = ( + df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() + ) + expected = Series( + [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], + index=pd.MultiIndex.from_tuples( + [("A", pd.Timestamp("2019-01-01"))] * 3 + + [("B", pd.Timestamp("2019-01-01"))] * 3, + names=["group", "date"], + ), + name="column1", + ) + tm.assert_series_equal(result, expected) + + def test_groupby_subset_rolling_subset_with_closed(self): + # GH 35549 + df = pd.DataFrame( + { + "column1": range(6), + "column2": range(6), + "group": 3 * ["A", "B"], + "date": [pd.Timestamp("2019-01-01")] * 6, + } + ) + + result = ( + df.groupby("group")[["column1", "date"]] + .rolling("1D", on="date", closed="left")["column1"] 
+ .sum() + ) + expected = Series( + [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], + index=pd.MultiIndex.from_tuples( + [("A", pd.Timestamp("2019-01-01"))] * 3 + + [("B", pd.Timestamp("2019-01-01"))] * 3, + names=["group", "date"], + ), + name="column1", + ) + tm.assert_series_equal(result, expected) From 7957254a838f6ec9de4ed1866270df82fdc8182d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 10 Aug 2020 07:53:32 -0700 Subject: [PATCH 21/83] =?UTF-8?q?Backport=20PR=20#35522:=20BUG:=20Fix=20as?= =?UTF-8?q?sert=5Fequal=20when=20check=5Fexact=3DTrue=20for=20non-numeric?= =?UTF-8?q?=20dtypes=20#3=E2=80=A6=20(#35652)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Isaac Virshup --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/_testing.py | 6 ++---- pandas/tests/util/test_assert_series_equal.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 7f5182e3eaa6f..e5860644fa371 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). 
+- Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) diff --git a/pandas/_testing.py b/pandas/_testing.py index a020fbff3553a..713f29466f097 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1339,10 +1339,8 @@ def assert_series_equal( else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact: - if not is_numeric_dtype(left.dtype): - raise AssertionError("check_exact may only be used with numeric Series") - + if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + # Only check exact if dtype is numeric assert_numpy_array_equal( left._values, right._values, diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 1284cc9d4f49b..a7b5aeac560e4 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -281,3 +281,18 @@ class MySeries(Series): with pytest.raises(AssertionError, match="Series classes are different"): tm.assert_series_equal(s3, s1, check_series_type=True) + + +def test_series_equal_exact_for_nonnumeric(): + # https://github.com/pandas-dev/pandas/issues/35446 + s1 = Series(["a", "b"]) + s2 = Series(["a", "b"]) + s3 = Series(["b", "a"]) + + tm.assert_series_equal(s1, s2, check_exact=True) + tm.assert_series_equal(s2, s1, check_exact=True) + + with pytest.raises(AssertionError): + tm.assert_series_equal(s1, s3, check_exact=True) + with pytest.raises(AssertionError): + tm.assert_series_equal(s3, s1, 
check_exact=True) From ac400436225fe777b653ca242944a12611de0efa Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 11 Aug 2020 02:41:02 -0700 Subject: [PATCH 22/83] Backport PR #35633: BUG: DataFrame.apply with func altering row in-place (#35666) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/apply.py | 2 ++ pandas/tests/frame/apply/test_frame_apply.py | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index e5860644fa371..415f9e508feb8 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) +- Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6b8d7dc35fe95..6d44cf917a07a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -389,6 +389,8 @@ def series_generator(self): blk = mgr.blocks[0] for (arr, name) in zip(values, self.index): + # GH#35462 re-pin mgr in case setitem changed it + ser._mgr = mgr blk.values = arr ser.name = name yield ser diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 3a32278e2a4b1..538978358c8e7 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1522,3 +1522,22 @@ def test_apply_dtype(self, col): expected = df.dtypes tm.assert_series_equal(result, expected) + + +def test_apply_mutating(): + # GH#35462 case where applied func pins a new BlockManager to a row + df = pd.DataFrame({"a": range(100), "b": range(100, 200)}) + + def func(row): + mgr = row._mgr + row.loc["a"] += 1 + assert row._mgr is not mgr + return row + + expected = df.copy() + expected["a"] += 1 + + result = df.apply(func, axis=1) + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, result) From 273f41600acbe03f05b870f3eeb4ce726b690504 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 13 Aug 2020 03:14:50 -0700 Subject: [PATCH 23/83] Backport PR #35647: BUG: Support custom BaseIndexers in groupby.rolling (#35699) Co-authored-by: Matthew Roeschke --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/window/indexers.py | 14 ++++++++++---- pandas/core/window/rolling.py | 15 +++++++++++---- pandas/tests/window/test_grouper.py | 23 +++++++++++++++++++++++ 4 files changed, 45 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 415f9e508feb8..cdc244ca193b4 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ 
b/doc/source/whatsnew/v1.1.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index bc36bdca982e8..7cbe34cdebf9f 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -1,6 +1,6 @@ """Indexer objects for computing start/end window bounds for rolling operations""" from datetime import timedelta -from typing import Dict, Optional, Tuple, Type, Union +from typing import Dict, Optional, Tuple, Type import numpy as np @@ -265,7 +265,8 @@ def __init__( index_array: Optional[np.ndarray], window_size: int, groupby_indicies: Dict, - rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]], + rolling_indexer: Type[BaseIndexer], + indexer_kwargs: Optional[Dict], **kwargs, ): """ @@ -276,7 +277,10 @@ def __init__( """ self.groupby_indicies = groupby_indicies self.rolling_indexer = rolling_indexer - super().__init__(index_array, window_size, **kwargs) + self.indexer_kwargs = indexer_kwargs or {} + super().__init__( + index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs + ) @Appender(get_window_bounds_doc) def get_window_bounds( @@ -298,7 +302,9 @@ def get_window_bounds( else: index_array = self.index_array indexer = self.rolling_indexer( - index_array=index_array, window_size=self.window_size, + index_array=index_array, + window_size=self.window_size, + 
**self.indexer_kwargs, ) start, end = indexer.get_window_bounds( len(indicies), min_periods, center, closed diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ea03a7f2f8162..d727881f8285a 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -145,7 +145,7 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): def __init__( self, - obj, + obj: FrameOrSeries, window=None, min_periods: Optional[int] = None, center: bool = False, @@ -2255,10 +2255,16 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: ------- GroupbyRollingIndexer """ - rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]] - if self.is_freq_type: + rolling_indexer: Type[BaseIndexer] + indexer_kwargs: Optional[Dict] = None + index_array = self.obj.index.asi8 + if isinstance(self.window, BaseIndexer): + rolling_indexer = type(self.window) + indexer_kwargs = self.window.__dict__ + # We'll be using the index of each group later + indexer_kwargs.pop("index_array", None) + elif self.is_freq_type: rolling_indexer = VariableWindowIndexer - index_array = self.obj.index.asi8 else: rolling_indexer = FixedWindowIndexer index_array = None @@ -2267,6 +2273,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: window_size=window, groupby_indicies=self._groupby.indices, rolling_indexer=rolling_indexer, + indexer_kwargs=indexer_kwargs, ) return window_indexer diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index e1dcac06c39cc..a9590c7e1233a 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -305,6 +305,29 @@ def test_groupby_subselect_rolling(self): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_custom_indexer(self): + # GH 35557 + class SimpleIndexer(pd.api.indexers.BaseIndexer): + def get_window_bounds( + self, num_values=0, min_periods=None, center=None, closed=None + ): + min_periods = 
self.window_size if min_periods is None else 0 + end = np.arange(num_values, dtype=np.int64) + 1 + start = end.copy() - self.window_size + start[start < 0] = min_periods + return start, end + + df = pd.DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0, 5.0] * 3}, index=[0] * 5 + [1] * 5 + [2] * 5 + ) + result = ( + df.groupby(df.index) + .rolling(SimpleIndexer(window_size=3), min_periods=1) + .sum() + ) + expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum() + tm.assert_frame_equal(result, expected) + def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = pd.DataFrame( From 48e812b212317b40f51c427099c424d59643298d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 13 Aug 2020 04:04:50 -0700 Subject: [PATCH 24/83] Backport PR #35654: BUG: GH-35558 merge_asof tolerance error (#35702) Co-authored-by: Yutaro Ikeda --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/merge/test_merge_asof.py | 22 +++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index cdc244ca193b4..b37103910afab 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) +- Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored 
(:issue:`35557`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 27b331babe692..2349cb1dcc0c7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1667,7 +1667,7 @@ def _get_merge_keys(self): msg = ( f"incompatible tolerance {self.tolerance}, must be compat " - f"with type {repr(lk.dtype)}" + f"with type {repr(lt.dtype)}" ) if needs_i8_conversion(lt): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 9b09f0033715d..895de2b748c34 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1339,3 +1339,25 @@ def test_merge_index_column_tz(self): index=pd.Index([0, 1, 2, 3, 4]), ) tm.assert_frame_equal(result, expected) + + def test_left_index_right_index_tolerance(self): + # https://github.com/pandas-dev/pandas/issues/35558 + dr1 = pd.date_range( + start="1/1/2020", end="1/20/2020", freq="2D" + ) + pd.Timedelta(seconds=0.4) + dr2 = pd.date_range(start="1/1/2020", end="2/1/2020") + + df1 = pd.DataFrame({"val1": "foo"}, index=pd.DatetimeIndex(dr1)) + df2 = pd.DataFrame({"val2": "bar"}, index=pd.DatetimeIndex(dr2)) + + expected = pd.DataFrame( + {"val1": "foo", "val2": "bar"}, index=pd.DatetimeIndex(dr1) + ) + result = pd.merge_asof( + df1, + df2, + left_index=True, + right_index=True, + tolerance=pd.Timedelta(seconds=0.5), + ) + tm.assert_frame_equal(result, expected) From e8a8264c49b7e2e3167cf989335a8b36df9672d0 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 14 Aug 2020 04:15:47 -0700 Subject: [PATCH 25/83] Backport PR #35673: REGR: Dataframe.reset_index() on empty DataFrame with MI and datatime level (#35716) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/frame.py | 2 +- .../tests/frame/methods/test_reset_index.py | 30 
+++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index b37103910afab..98d67e930ccc0 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) +- Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b7286ce86d24e..041121d60ad33 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4778,7 +4778,7 @@ def _maybe_casted_values(index, labels=None): # we can have situations where the whole mask is -1, # meaning there is nothing found in labels, so make all nan's - if mask.all(): + if mask.size > 0 and mask.all(): dtype = index.dtype fill_value = na_value_for_dtype(dtype) values = construct_1d_arraylike_from_scalar( diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index da4bfa9be4881..b88ef0e6691cb 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -318,3 +318,33 @@ def 
test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): result = DataFrame(index=idx)[:0].reset_index().dtypes expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex(): + # https://github.com/pandas-dev/pandas/issues/35606 + idx = MultiIndex( + levels=[[pd.Timestamp("2020-07-20 00:00:00")], [3, 4]], + codes=[[], []], + names=["a", "b"], + ) + df = DataFrame(index=idx, columns=["c", "d"]) + result = df.reset_index() + expected = DataFrame( + columns=list("abcd"), index=RangeIndex(start=0, stop=0, step=1) + ) + expected["a"] = expected["a"].astype("datetime64[ns]") + expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): + # https://github.com/pandas-dev/pandas/issues/35657 + df = DataFrame(dict(c1=[10.0], c2=["a"], c3=pd.to_datetime("2020-01-01"))) + df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum() + result = df.reset_index() + expected = DataFrame( + columns=["c2", "c3", "c1"], index=RangeIndex(start=0, stop=0, step=1) + ) + expected["c3"] = expected["c3"].astype("datetime64[ns]") + expected["c1"] = expected["c1"].astype("float64") + tm.assert_frame_equal(result, expected) From f15a60a106842d4defa02d40d5fb3176ed55ce75 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 14 Aug 2020 13:12:39 +0100 Subject: [PATCH 26/83] CI: doctest failure for read_hdf on 1.1.x (fixed in #35214 on master) (#35718) --- pandas/io/pytables.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6497067e3930c..ac5d32c60dc5e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -320,6 +320,10 @@ def read_hdf( mode : {'r', 'r+', 'a'}, default 'r' Mode to use when opening the file. Ignored if path_or_buf is a :class:`pandas.HDFStore`. 
Default is 'r'. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. where : list, optional A list of Term (or convertible) objects. start : int, optional @@ -332,10 +336,6 @@ def read_hdf( Return an iterator object. chunksize : int, optional Number of rows to include in an iteration when using an iterator. - errors : str, default 'strict' - Specifies how encoding and decoding errors are to be handled. - See the errors argument for :func:`open` for a full list - of options. **kwargs Additional keyword arguments passed to HDFStore. From a0a65492e3b4bed74e1b31484a131266f9396ed6 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 14 Aug 2020 08:38:20 -0700 Subject: [PATCH 27/83] Backport PR #35707: REGR: fix DataFrame.diff with read-only data (#35721) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.1.1.rst | 3 ++- pandas/_libs/algos.pyx | 7 ++++--- pandas/tests/frame/methods/test_diff.py | 9 +++++++++ setup.py | 3 +++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 98d67e930ccc0..22d34bef65aa9 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -16,10 +16,11 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) -- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). 
+- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) - Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) +- Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7e90a8cc681ef..0a70afda893cf 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1200,14 +1200,15 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) def diff_2d( - diff_t[:, :] arr, - out_t[:, :] out, + ndarray[diff_t, ndim=2] arr, # TODO(cython 3) update to "const diff_t[:, :] arr" + ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, ): cdef: Py_ssize_t i, j, sx, sy, start, stop - bint f_contig = arr.is_f_contig() + bint f_contig = arr.flags.f_contiguous + # bint f_contig = arr.is_f_contig() # TODO(cython 3) # Disable for unsupported dtype combinations, # see https://github.com/cython/cython/issues/2646 diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 45f134a93a23a..0486fb2d588b6 100644 --- 
a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -214,3 +214,12 @@ def test_diff_integer_na(self, axis, expected): # Test case for default behaviour of diff result = df.diff(axis=axis) tm.assert_frame_equal(result, expected) + + def test_diff_readonly(self): + # https://github.com/pandas-dev/pandas/issues/35559 + arr = np.random.randn(5, 2) + arr.flags.writeable = False + df = pd.DataFrame(arr) + result = df.diff() + expected = pd.DataFrame(np.array(df)).diff() + tm.assert_frame_equal(result, expected) diff --git a/setup.py b/setup.py index aebbdbf4d1e96..22da02360619e 100755 --- a/setup.py +++ b/setup.py @@ -457,6 +457,9 @@ def run(self): if sys.version_info[:2] == (3, 8): # GH 33239 extra_compile_args.append("-Wno-error=deprecated-declarations") + # https://github.com/pandas-dev/pandas/issues/35559 + extra_compile_args.append("-Wno-error=unreachable-code") + # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled From 3ef3617c2e784b98db83ab437e7607459f68a1cc Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 14 Aug 2020 08:39:00 -0700 Subject: [PATCH 28/83] Backport PR #35664: BUG: Styler cell_ids fails on multiple renders (#35722) Co-authored-by: attack68 <24256554+attack68@users.noreply.github.com> --- doc/source/whatsnew/v1.1.1.rst | 2 +- pandas/io/formats/style.py | 14 +++++++------- pandas/tests/io/formats/test_style.py | 5 ++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 22d34bef65aa9..85e2a335c55c6 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -34,7 +34,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes 
(:issue:`35588`). +- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`). Categorical ^^^^^^^^^^^ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 584f42a6cab12..3bbb5271bce61 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -390,16 +390,16 @@ def format_attr(pair): "is_visible": (c not in hidden_columns), } # only add an id if the cell has a style + props = [] if self.cell_ids or (r, c) in ctx: row_dict["id"] = "_".join(cs[1:]) + for x in ctx[r, c]: + # have to handle empty styles like [''] + if x.count(":"): + props.append(tuple(x.split(":"))) + else: + props.append(("", "")) row_es.append(row_dict) - props = [] - for x in ctx[r, c]: - # have to handle empty styles like [''] - if x.count(":"): - props.append(tuple(x.split(":"))) - else: - props.append(("", "")) cellstyle_map[tuple(props)].append(f"row{r}_col{c}") body.append(row_es) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 3ef5157655e78..6025649e9dbec 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1684,8 +1684,11 @@ def f(a, b, styler): def test_no_cell_ids(self): # GH 35588 + # GH 35663 df = pd.DataFrame(data=[[0]]) - s = Styler(df, uuid="_", cell_ids=False).render() + styler = Styler(df, uuid="_", cell_ids=False) + styler.render() + s = styler.render() # render twice to ensure ctx is not updated assert s.find('') != -1 From c5031fab696e45917672c75013e81289f70a9635 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 15 Aug 2020 09:59:36 -0700 Subject: [PATCH 29/83] Backport PR #35723: agg with list of non-aggregating functions (#35738) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/groupby/generic.py | 25 +++++++++++-------- 
pandas/core/groupby/groupby.py | 10 +++++--- .../tests/groupby/aggregate/test_aggregate.py | 13 ++++++++++ 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 85e2a335c55c6..565b4a014bd0c 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) +- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. (:issue:`35490`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c50b753cf3293..f5858c5c54f1d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -322,11 +322,14 @@ def _aggregate_multiple_funcs(self, arg): # let higher level handle return results - output = self._wrap_aggregated_output(results) + output = self._wrap_aggregated_output(results, index=None) return self.obj._constructor_expanddim(output, columns=columns) + # TODO: index should not be Optional - see GH 35490 def _wrap_series_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. 
@@ -335,7 +338,7 @@ def _wrap_series_output( ---------- output : Mapping[base.OutputKey, Union[Series, np.ndarray]] Data to wrap. - index : pd.Index + index : pd.Index or None Index to apply to the output. Returns @@ -363,8 +366,11 @@ def _wrap_series_output( return result + # TODO: Remove index argument, use self.grouper.result_index, see GH 35490 def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -383,9 +389,7 @@ def _wrap_aggregated_output( In the vast majority of cases output will only contain one element. The exception is operations that expand dimensions, like ohlc. """ - result = self._wrap_series_output( - output=output, index=self.grouper.result_index - ) + result = self._wrap_series_output(output=output, index=index) return self._reindex_output(result) def _wrap_transformed_output( @@ -1714,7 +1718,9 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. 
@@ -1739,8 +1745,7 @@ def _wrap_aggregated_output( self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: - index = self.grouper.result_index - result.index = index + result.index = self.grouper.result_index if self.axis == 1: result = result.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ac45222625569..11d0c8e42f745 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -973,7 +973,9 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, np.ndarray], index: Optional[Index] + ): raise AbstractMethodError(self) def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): @@ -1048,7 +1050,7 @@ def _cython_agg_general( if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) def _python_agg_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs @@ -1101,7 +1103,7 @@ def _python_agg_general( output[key] = maybe_cast_result(values[mask], result) - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat @@ -2521,7 +2523,7 @@ def _get_cythonized_result( raise TypeError(error_msg) if aggregate: - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) else: return self._wrap_transformed_output(output) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 
40a20c8210052..ce9d4b892d775 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1061,3 +1061,16 @@ def test_groupby_get_by_index(): res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A") pd.testing.assert_frame_equal(res, expected) + + +def test_nonagg_agg(): + # GH 35490 - Single/Multiple agg of non-agg function give same results + # TODO: agg should raise for functions that don't aggregate + df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}) + g = df.groupby("a") + + result = g.agg(["cumsum"]) + result.columns = result.columns.droplevel(-1) + expected = g.agg("cumsum") + + tm.assert_frame_equal(result, expected) From a49bfcd3c559e4cca98f8ad0753a5b45a01ce3ad Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 17 Aug 2020 02:39:02 -0700 Subject: [PATCH 30/83] Backport PR #35754: CI: Min Pytest Cov Version/Restrict xdist version (#35761) Co-authored-by: Ali McMaster --- ci/deps/azure-windows-36.yaml | 2 +- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/travis-36-cov.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 548660cabaa67..21b4e86918f3b 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21 + - pytest-xdist>=1.21,<2.0.0 # GH 35737 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 5bbd0e2795d7e..287d6877b9810 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21 + - pytest-xdist>=1.21,<2.0.0 # GH 35737 - hypothesis>=3.58.0 - pytest-azurepipelines diff 
--git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 177e0d3f4c0af..2457c04e67759 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -10,7 +10,7 @@ dependencies: - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - - pytest-cov # this is only needed in the coverage build + - pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 # pandas dependencies - beautifulsoup4 From ac8845b1f907021d9820379e1f545801dff42d65 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 17 Aug 2020 05:21:40 -0700 Subject: [PATCH 31/83] Backport PR #35697: REGR: Don't ignore compiled patterns in replace (#35765) Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/internals/managers.py | 23 ++++++++++++++++----- pandas/tests/frame/methods/test_replace.py | 8 +++++++ pandas/tests/series/methods/test_replace.py | 10 +++++++++ 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 565b4a014bd0c..d93cd6edb983a 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) +- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) - Fixed regression in 
:meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. (:issue:`35490`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e6e2b06e1873e..4c3805f812bb0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2,7 +2,17 @@ import itertools import operator import re -from typing import DefaultDict, Dict, List, Optional, Sequence, Tuple, TypeVar, Union +from typing import ( + DefaultDict, + Dict, + List, + Optional, + Pattern, + Sequence, + Tuple, + TypeVar, + Union, +) import warnings import numpy as np @@ -1922,7 +1932,10 @@ def _merge_blocks( def _compare_or_regex_search( - a: ArrayLike, b: Scalar, regex: bool = False, mask: Optional[ArrayLike] = None + a: ArrayLike, + b: Union[Scalar, Pattern], + regex: bool = False, + mask: Optional[ArrayLike] = None, ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -1933,7 +1946,7 @@ def _compare_or_regex_search( Parameters ---------- a : array_like - b : scalar + b : scalar or regex pattern regex : bool, default False mask : array_like or None (default) @@ -1943,7 +1956,7 @@ def _compare_or_regex_search( """ def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Scalar, + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern], ): """ Raises an error if the two arrays (a,b) cannot be compared. 
@@ -1964,7 +1977,7 @@ def _check_comparison_types( else: op = np.vectorize( lambda x: bool(re.search(b, x)) - if isinstance(x, str) and isinstance(b, str) + if isinstance(x, str) and isinstance(b, (str, Pattern)) else False ) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index a3f056dbf9648..8603bff0587b6 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1573,3 +1573,11 @@ def test_replace_dict_category_type(self, input_category_df, expected_category_d result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) + + def test_replace_with_compiled_regex(self): + # https://github.com/pandas-dev/pandas/issues/35680 + df = pd.DataFrame(["a", "b", "c"]) + regex = re.compile("^a$") + result = df.replace({regex: "z"}, regex=True) + expected = pd.DataFrame(["z", "b", "c"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 11802c59a29da..f78a28c66e946 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -415,3 +417,11 @@ def test_replace_extension_other(self): # https://github.com/pandas-dev/pandas/issues/34530 ser = pd.Series(pd.array([1, 2, 3], dtype="Int64")) ser.replace("", "") # no exception + + def test_replace_with_compiled_regex(self): + # https://github.com/pandas-dev/pandas/issues/35680 + s = pd.Series(["a", "b", "c"]) + regex = re.compile("^a$") + result = s.replace({regex: "z"}, regex=True) + expected = pd.Series(["z", "b", "c"]) + tm.assert_series_equal(result, expected) From 66d08dc7447099d0e5e8f1cb1c37b28f6c5c19d2 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 17 Aug 2020 08:32:35 -0700 Subject: [PATCH 32/83] Backport PR 
#35519: REF: StringArray._from_sequence, use less memory (#35770) Co-authored-by: Terji Petersen --- asv_bench/benchmarks/strings.py | 15 +++++++ doc/source/whatsnew/v1.1.1.rst | 5 +++ pandas/_libs/lib.pyx | 51 ++++++++++++++-------- pandas/core/arrays/string_.py | 25 +++-------- pandas/core/dtypes/cast.py | 16 ++----- pandas/tests/arrays/string_/test_string.py | 14 +++--- 6 files changed, 73 insertions(+), 53 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d7fb2775376c0..2023858181baa 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -7,6 +7,21 @@ from .pandas_vb_common import tm +class Construction: + + params = ["str", "string"] + param_names = ["dtype"] + + def setup(self, dtype): + self.data = tm.rands_array(nchars=10 ** 5, size=10) + + def time_construction(self, dtype): + Series(self.data, dtype=dtype) + + def peakmem_construction(self, dtype): + Series(self.data, dtype=dtype) + + class Methods: def setup(self): self.s = Series(tm.makeStringIndex(10 ** 5)) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index d93cd6edb983a..10bdfdc10c87a 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -75,6 +75,11 @@ Categorical - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) - +**Strings** + +- fix memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) + + .. --------------------------------------------------------------------------- .. 
_whatsnew_111.contributors: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5fa91ffee8ea8..eadfcefaac73d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype): @cython.wraparound(False) @cython.boundscheck(False) -def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: - """ - Convert all elements in an array to string. +cpdef ndarray[object] ensure_string_array( + arr, + object na_value=np.nan, + bint convert_na_value=True, + bint copy=True, + bint skipna=True, +): + """Returns a new numpy array with object dtype and only strings and na values. Parameters ---------- - arr : ndarray - The array whose elements we are casting. - skipna : bool, default False + arr : array-like + The values to be converted to str, if needed. + na_value : Any + The value to use for na. For example, np.nan or pd.NA. + convert_na_value : bool, default True + If False, existing na values will be used unchanged in the new array. + copy : bool, default True + Whether to ensure that a new array is returned. + skipna : bool, default True Whether or not to coerce nulls to their stringified form - (e.g. NaN becomes 'nan'). + (e.g. if False, NaN becomes 'nan'). Returns ------- ndarray - A new array with the input array's elements casted. + An array with the input array's elements casted to str or nan-like. 
""" cdef: - object arr_i - Py_ssize_t i, n = arr.size - ndarray[object] result = np.empty(n, dtype=object) - - for i in range(n): - arr_i = arr[i] + Py_ssize_t i = 0, n = len(arr) - if not (skipna and checknull(arr_i)): - arr_i = str(arr_i) + result = np.asarray(arr, dtype="object") + if copy and result is arr: + result = result.copy() - result[i] = arr_i + for i in range(n): + val = result[i] + if not checknull(val): + result[i] = str(val) + else: + if convert_na_value: + val = na_value + if skipna: + result[i] = val + else: + result[i] = str(val) return result diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fddd3af858f77..a4778869aee24 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -178,11 +178,10 @@ class StringArray(PandasArray): def __init__(self, values, copy=False): values = extract_array(values) - skip_validation = isinstance(values, type(self)) super().__init__(values, copy=copy) self._dtype = StringDtype() - if not skip_validation: + if not isinstance(values, type(self)): self._validate() def _validate(self): @@ -201,23 +200,11 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): assert dtype == "string" result = np.asarray(scalars, dtype="object") - if copy and result is scalars: - result = result.copy() - - # Standardize all missing-like values to NA - # TODO: it would be nice to do this in _validate / lib.is_string_array - # We are already doing a scan over the values there. 
- na_values = isna(result) - has_nans = na_values.any() - if has_nans and result is scalars: - # force a copy now, if we haven't already - result = result.copy() - - # convert to str, then to object to avoid dtype like '>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str')) array(['1.0', '2.0', None], dtype=object) """ - subarr = np.array(values, dtype=dtype, copy=copy) if dtype is not None and dtype.kind == "U": - # GH-21083 - # We can't just return np.array(subarr, dtype='str') since - # NumPy will convert the non-string objects into strings - # Including NA values. Se we have to go - # string -> object -> update NA, which requires an - # additional pass over the data. - na_values = isna(values) - subarr2 = subarr.astype(object) - subarr2[na_values] = np.asarray(values, dtype=object)[na_values] - subarr = subarr2 + subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy) + else: + subarr = np.array(values, dtype=dtype, copy=copy) return subarr diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 6f9a1a5be4c43..efd5d29ae0717 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -206,12 +206,16 @@ def test_constructor_raises(): @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy): - a = np.array(["a", np.nan], dtype=object) - original = a.copy() - result = pd.arrays.StringArray._from_sequence(a, copy=copy) - expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object)) + nan_arr = np.array(["a", np.nan], dtype=object) + na_arr = np.array(["a", pd.NA], dtype=object) + + result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy) + expected = pd.arrays.StringArray(na_arr) + tm.assert_extension_array_equal(result, expected) - tm.assert_numpy_array_equal(a, original) + + expected = nan_arr if copy else na_arr + tm.assert_numpy_array_equal(nan_arr, expected) def 
test_astype_int(): From 56e95ad2f78ee92dae03ca493d7948b5616abb3b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 17 Aug 2020 13:37:00 -0700 Subject: [PATCH 33/83] Backport PR #35750: Pass check_dtype to assert_extension_array_equal (#35773) Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/_testing.py | 10 ++++++++-- pandas/tests/util/test_assert_extension_array_equal.py | 9 +++++++++ pandas/tests/util/test_assert_frame_equal.py | 8 ++++++++ pandas/tests/util/test_assert_series_equal.py | 8 ++++++++ 5 files changed, 34 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 10bdfdc10c87a..9c92c803fa677 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -37,6 +37,7 @@ Bug fixes ~~~~~~~~~ - Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`). +- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`). 
Categorical ^^^^^^^^^^^ diff --git a/pandas/_testing.py b/pandas/_testing.py index 713f29466f097..ef6232fa6d575 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1377,12 +1377,18 @@ def assert_series_equal( ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype): assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index d9fdf1491c328..f9259beab5d13 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas import array import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -102,3 +103,11 @@ def test_assert_extension_array_equal_non_extension_array(side): with pytest.raises(AssertionError, match=msg): tm.assert_extension_array_equal(*args) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_extension_array_equal_ignore_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = array([1, 2, 3], dtype="Int64") + right = array([1, 2, 3], dtype=right_dtype) + tm.assert_extension_array_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index fe3e1ff906919..3aa3c64923b14 100644 --- 
a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -260,3 +260,11 @@ def test_assert_frame_equal_interval_dtype_mismatch(): with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64") + right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) + tm.assert_frame_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index a7b5aeac560e4..f3c66052b1904 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -296,3 +296,11 @@ def test_series_equal_exact_for_nonnumeric(): tm.assert_series_equal(s1, s3, check_exact=True) with pytest.raises(AssertionError): tm.assert_series_equal(s3, s1, check_exact=True) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = pd.Series([1, 2, 3], dtype="Int64") + right = pd.Series([1, 2, 3], dtype=right_dtype) + tm.assert_series_equal(left, right, check_dtype=False) From 7a5d186722805befc63f16143beaaf6c07592920 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 17 Aug 2020 13:37:25 -0700 Subject: [PATCH 34/83] Backport PR #35543: REGR: Fix interpolation on empty dataframe (#35764) Co-authored-by: sanderland <48946947+sanderland@users.noreply.github.com> --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/generic.py | 3 +++ pandas/tests/frame/methods/test_interpolate.py | 7 +++++++ 3 files changed, 11 insertions(+) diff --git 
a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 9c92c803fa677..ff5bbccf63ffe 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) - Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) +- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`). - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a11ee6b5d9846..3a386a8df7075 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6799,6 +6799,9 @@ def interpolate( obj = self.T if should_transpose else self + if obj.empty: + return self + if method not in fillna_methods: axis = self._info_axis_number diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index ddb5723e7bd3e..3c9d79397e4bd 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -34,6 +34,13 @@ def test_interp_basic(self): expected.loc[5, "B"] = 9 tm.assert_frame_equal(result, expected) + def test_interp_empty(self): + # https://github.com/pandas-dev/pandas/issues/35598 + df = DataFrame() + result = df.interpolate() + expected = df + 
tm.assert_frame_equal(result, expected) + def test_interp_bad_method(self): df = DataFrame( { From 4fc0330c514b27d48664dbc03b4743159f169789 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 18 Aug 2020 07:12:19 -0700 Subject: [PATCH 35/83] Backport PR #35774: REGR: follow-up to return copy with df.interpolate on empty DataFrame (#35789) Co-authored-by: Simon Hawkins --- pandas/core/generic.py | 2 +- pandas/tests/frame/methods/test_interpolate.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3a386a8df7075..8bd1dbea4696f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6800,7 +6800,7 @@ def interpolate( obj = self.T if should_transpose else self if obj.empty: - return self + return self.copy() if method not in fillna_methods: axis = self._info_axis_number diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 3c9d79397e4bd..6b86a13fcf1b9 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -38,6 +38,7 @@ def test_interp_empty(self): # https://github.com/pandas-dev/pandas/issues/35598 df = DataFrame() result = df.interpolate() + assert result is not df expected = df tm.assert_frame_equal(result, expected) From 92a6c3aee0a4d40061414c612684087b8e7dfda1 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 19 Aug 2020 04:34:39 -0700 Subject: [PATCH 36/83] Backport PR #35787: DOC: clean v1.1.1 release notes (#35800) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.1.1.rst | 60 +++++++--------------------------- 1 file changed, 12 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index ff5bbccf63ffe..43ffed273adbc 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ 
b/doc/source/whatsnew/v1.1.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_111: -What's new in 1.1.1 (?) ------------------------ +What's new in 1.1.1 (August XX, 2020) +------------------------------------- These are the changes in pandas 1.1.1. See :ref:`release` for a full changelog including other versions of pandas. @@ -15,20 +15,23 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) +- Fixed regression in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) - Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) - Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) -- Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) -- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`). 
+- Fixed regression in ``.groupby(..).rolling(..)`` where column selection was ignored (:issue:`35486`) +- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) -- Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) +- Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) - Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) -- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. 
(:issue:`35490`) +- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`) +- Fixed memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) .. --------------------------------------------------------------------------- @@ -37,50 +40,11 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`). -- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`). - -Categorical -^^^^^^^^^^^ - -- Bug in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) - - -**Datetimelike** - -- -- - -**Timedelta** - +- Bug in :class:`~pandas.io.formats.style.Styler` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`) +- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`) - Bug in :meth:`to_timedelta` fails when arg is a :class:`Series` with `Int64` dtype containing null values (:issue:`35574`) - - -**Numeric** - -- -- - -**Groupby/resample/rolling** - -- Bug in :class:`pandas.core.groupby.RollingGroupby` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) - -**Plotting** - -- - -**Indexing** - -- Bug in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) - -**DataFrame** +- Bug in ``.groupby(..).rolling(..)`` where passing ``closed`` with 
column selection would raise a ``ValueError`` (:issue:`35549`) - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) -- - -**Strings** - -- fix memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) - .. --------------------------------------------------------------------------- From 827e606a506648a67ba881af55862bc7f713f71b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 19 Aug 2020 07:25:27 -0700 Subject: [PATCH 37/83] Backport PR #35776: Changed 'int' type to 'integer' in to_numeric docstring (#35806) Co-authored-by: edwardkong <33737404+edwardkong@users.noreply.github.com> --- pandas/core/arrays/sparse/array.py | 2 +- pandas/core/tools/numeric.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d8db196e4b92f..1531f7b292365 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -234,7 +234,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` is not a ``SparseDtype`` and `data` is a ``SparseArray``. - kind : {'int', 'block'}, default 'int' + kind : {'integer', 'block'}, default 'integer' The type of storage for sparse locations. * 'block': Stores a `block` and `block_length` for each diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 41548931f17f8..cff4695603d06 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -40,13 +40,13 @@ def to_numeric(arg, errors="raise", downcast=None): - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. 
- downcast : {'int', 'signed', 'unsigned', 'float'}, default None + downcast : {'integer', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - - 'int' or 'signed': smallest signed int dtype (min.: np.int8) + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) From 6d44a7309350702f7c254e70238036837d45c625 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 20 Aug 2020 06:32:04 -0700 Subject: [PATCH 38/83] Backport PR #35809: CI: more xfail failing 32-bit tests (#35821) Co-authored-by: Simon Hawkins --- pandas/tests/window/test_grouper.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index a9590c7e1233a..d0a62374d0888 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -215,6 +215,7 @@ def foo(x): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling_center_center(self): # GH 35552 series = Series(range(1, 6)) @@ -280,6 +281,7 @@ def test_groupby_rolling_center_center(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_subselect_rolling(self): # GH 35486 df = DataFrame( @@ -305,6 +307,7 @@ def test_groupby_subselect_rolling(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling_custom_indexer(self): # GH 35557 class SimpleIndexer(pd.api.indexers.BaseIndexer): @@ -328,6 +331,7 @@ def get_window_bounds( expected = 
df.groupby(df.index).rolling(window=3, min_periods=1).sum() tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = pd.DataFrame( @@ -352,6 +356,7 @@ def test_groupby_rolling_subset_with_closed(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = pd.DataFrame( From 311fcde2b6f710fe1900ce98e037b64ebe2c8d39 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 20 Aug 2020 14:41:49 +0100 Subject: [PATCH 39/83] CI: test_chunks_have_consistent_numerical_type periodically fails on 1.1.x (#35808) --- pandas/tests/io/parser/test_common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 5154a9ba6fdf0..3d5f6ae3a4af9 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1138,6 +1138,7 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_chunks_have_consistent_numerical_type(all_parsers): parser = all_parsers integers = [str(i) for i in range(499999)] @@ -1151,6 +1152,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers): assert result.a.dtype == float +@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers From 665e61bd3b5df438305e2404e4740a132887cea7 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 20 Aug 2020 09:36:02 -0700 Subject: [PATCH 40/83] Backport PR #35801: DOC: another pass of v1.1.1 release notes (#35822) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.1.1.rst | 10 +++++----- 1 file 
changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 43ffed273adbc..721f07c865409 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -1,6 +1,6 @@ .. _whatsnew_111: -What's new in 1.1.1 (August XX, 2020) +What's new in 1.1.1 (August 20, 2020) ------------------------------------- These are the changes in pandas 1.1.1. See :ref:`release` for a full changelog @@ -27,7 +27,7 @@ Fixed regressions - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) -- Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) +- Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) - Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) - Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`) @@ -40,11 +40,11 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in :class:`~pandas.io.formats.style.Styler` whereby `cell_ids` argument had no effect due to other recent changes 
(:issue:`35588`) (:issue:`35663`) +- Bug in :class:`~pandas.io.formats.style.Styler` whereby ``cell_ids`` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`) - Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`) -- Bug in :meth:`to_timedelta` fails when arg is a :class:`Series` with `Int64` dtype containing null values (:issue:`35574`) +- Bug in :meth:`to_timedelta` fails when ``arg`` is a :class:`Series` with ``Int64`` dtype containing null values (:issue:`35574`) - Bug in ``.groupby(..).rolling(..)`` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) -- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) +- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when ``data`` and ``index`` have mismatched lengths (:issue:`33437`) .. 
--------------------------------------------------------------------------- From f2ca0a2665b2d169c97de87b8e778dbed86aea07 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Thu, 20 Aug 2020 16:54:47 +0000 Subject: [PATCH 41/83] RLS: 1.1.1 From 39ec5cec4d16d1412d054f450f3af856abe4ea15 Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Fri, 21 Aug 2020 09:23:56 +0100 Subject: [PATCH 42/83] MAINT: Manual Backport PR #35825 on branch 1.1.x (#35835) --- ci/deps/travis-36-locale.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 03a1e751b6a86..8f7e29abc5f3b 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -28,6 +28,7 @@ dependencies: - openpyxl - pandas-gbq=0.12.0 - psycopg2=2.6.2 + - pyarrow>=0.13.0 # GH #35813 - pymysql=0.7.11 - pytables - python-dateutil From 10c9c023361bf5e763e2b9b807db9d6693a0f746 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 21 Aug 2020 02:07:33 -0700 Subject: [PATCH 43/83] Backport PR #35825: DOC: Start 1.1.2 (#35834) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.1.1.rst | 2 +- doc/source/whatsnew/v1.1.2.rst | 38 ++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v1.1.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 8ce10136dd2bb..1b5e63dfcf359 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.2 v1.1.1 v1.1.0 diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 721f07c865409..77ea67f76f655 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -53,4 +53,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.0..v1.1.1|HEAD +.. 
contributors:: v1.1.0..v1.1.1 diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst new file mode 100644 index 0000000000000..81acd567027e5 --- /dev/null +++ b/doc/source/whatsnew/v1.1.2.rst @@ -0,0 +1,38 @@ +.. _whatsnew_112: + +What's new in 1.1.2 (??) +------------------------ + +These are the changes in pandas 1.1.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_112.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_112.bug_fixes: + +Bug fixes +~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_112.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.1.1..v1.1.2|HEAD From 4ca2bbddeb9c30bc5153926fdfb4029fd764da8b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 21 Aug 2020 02:08:07 -0700 Subject: [PATCH 44/83] Backport PR #35757: CI: Unpin Pytest + Pytest Asyncio Min Version (#35833) Co-authored-by: Ali McMaster --- ci/deps/azure-36-locale.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 3034ed3dc43af..536bb6f899773 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -7,9 +7,9 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0 # https://github.com/pandas-dev/pandas/issues/35620 + - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-asyncio + - pytest-asyncio>=0.12.0 - hypothesis>=3.58.0 - pytest-azurepipelines From 4ed14923a51da4c0445500e50c78297877f3f53d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 24 Aug 2020 23:27:55 -0700 Subject: [PATCH 45/83] Backport PR 
#35877: REGR: DatetimeIndex.intersection incorrectly raising AssertionError (#35879) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/core/indexes/datetimelike.py | 4 ++-- pandas/tests/indexes/datetimes/test_setops.py | 7 +++++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 81acd567027e5..97bd4dccdcd84 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - +- Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - - diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 15a7e25238983..ab0b3a394446d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -700,16 +700,16 @@ def intersection(self, other, sort=False): if result.freq is None: # TODO: no tests rely on this; needed? result = result._with_freq("infer") - assert result.name == res_name + result.name = res_name return result elif not self._can_fast_intersect(other): result = Index.intersection(self, other, sort=sort) - assert result.name == res_name # We need to invalidate the freq because Index.intersection # uses _shallow_copy on a view of self._data, which will preserve # self.freq if we're not careful. 
result = result._with_freq(None)._with_freq("infer") + result.name = res_name return result # to make our life easier, "sort" the two ranges diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 6670b079ddd29..f19e78323ab23 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -470,6 +470,13 @@ def test_intersection_bug(self): tm.assert_index_equal(result, b) assert result.freq == b.freq + def test_intersection_list(self): + # GH#35876 + values = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")] + idx = pd.DatetimeIndex(values, name="a") + res = idx.intersection(values) + tm.assert_index_equal(res, idx) + def test_month_range_union_tz_pytz(self, sort): from pytz import timezone From 54ea0cd9b23fbf9a418e3527c0e2af0e637fbc54 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 25 Aug 2020 12:04:01 +0200 Subject: [PATCH 46/83] TST: Fix test_parquet failures for pyarrow 1.0 (#35814) (#35887) Co-authored-by: Ali McMaster --- pandas/tests/io/test_parquet.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 82157f3d722a9..306b2a7849586 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -557,13 +557,23 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa): @pytest.mark.parametrize("partition_col", [["A"], []]) def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): # GH #26388 - # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716 - # As per pyarrow partitioned columns become 'categorical' dtypes - # and are added to back of dataframe on read - expected_df = df_compat.copy() - if partition_col: - expected_df[partition_col] = expected_df[partition_col].astype("category") + + # GH #35791 + # read_table uses the new Arrow Datasets 
API since pyarrow 1.0.0 + # Previous behaviour was pyarrow partitioned columns become 'category' dtypes + # These are added to back of dataframe on read. In new API category dtype is + # only used if partition field is string. + legacy_read_table = LooseVersion(pyarrow.__version__) < LooseVersion("1.0.0") + if partition_col and legacy_read_table: + partition_col_type = "category" + else: + partition_col_type = "int32" + + expected_df[partition_col] = expected_df[partition_col].astype( + partition_col_type + ) + check_round_trip( df_compat, pa, From c9fb7525abdc682e87df677f4d8315e1500a094f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 26 Aug 2020 14:32:26 +0100 Subject: [PATCH 47/83] BUG: DataFrame.apply with result_type=reduce incorrect index (#35905) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/core/apply.py | 5 ++++- pandas/tests/frame/apply/test_frame_apply.py | 9 +++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 97bd4dccdcd84..748937deb5a9b 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -24,7 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - +- Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - - diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6d44cf917a07a..99a9e1377563c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -340,7 +340,10 @@ def wrap_results_for_axis( if self.result_type == "reduce": # e.g. 
test_apply_dict GH#8735 - return self.obj._constructor_sliced(results) + res = self.obj._constructor_sliced(results) + res.index = res_index + return res + elif self.result_type is None and all( isinstance(x, dict) for x in results.values() ): diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 538978358c8e7..5a1e448beb40f 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1541,3 +1541,12 @@ def func(row): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df, result) + + +def test_apply_empty_list_reduce(): + # GH#35683 get columns correct + df = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], columns=["a", "b"]) + + result = df.apply(lambda x: [], result_type="reduce") + expected = pd.Series({"a": [], "b": []}, dtype=object) + tm.assert_series_equal(result, expected) From f86b129c81a1172f645fcecbadfd9750561ea5d6 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 26 Aug 2020 15:40:21 +0100 Subject: [PATCH 48/83] Backport PR #35712: PERF: RangeIndex.format performance (#35904) Co-authored-by: Terji Petersen --- doc/source/whatsnew/v0.25.0.rst | 2 +- doc/source/whatsnew/v1.1.2.rst | 4 ++-- pandas/core/indexes/base.py | 4 +++- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimelike.py | 11 ++++++++--- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/range.py | 11 ++++++++++- pandas/tests/indexes/common.py | 10 ++++++++-- pandas/tests/indexes/period/test_period.py | 6 ++++++ pandas/tests/indexes/ranges/test_range.py | 12 ++++++++++++ 10 files changed, 52 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3cd920158f774..0f0f009307c75 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -540,7 +540,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 .. 
ipython:: python - df.describe() + df.describe() ``__str__`` methods now call ``__repr__`` rather than vice versa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 748937deb5a9b..d60119f28c053 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) -- +- Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - .. --------------------------------------------------------------------------- @@ -25,7 +25,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) -- +- Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1be381e38b157..32bbdf425acab 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -924,7 +924,9 @@ def format( return self._format_with_header(header, na_rep=na_rep) - def _format_with_header(self, header, na_rep="NaN") -> List[str_t]: + def _format_with_header( + self, header: List[str_t], na_rep: str_t = "NaN" + ) -> List[str_t]: from pandas.io.formats.format import format_array values = self._values diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 74b235655e345..8af6ee555306a 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -347,7 +347,7 @@ def _format_attrs(self): attrs.append(("length", len(self))) return attrs - def _format_with_header(self, header, na_rep="NaN") -> List[str]: + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: from pandas.io.formats.printing import pprint_thing result = [ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index ab0b3a394446d..9b57a25f1b0e9 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -350,15 +350,20 @@ def format( """ header = [] if name: - fmt_name = ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) - header.append(fmt_name) + header.append( + ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) if formatter is not None: return header + list(self.map(formatter)) return self._format_with_header(header, na_rep=na_rep, date_format=date_format) - def _format_with_header(self, header, na_rep="NaT", date_format=None) -> List[str]: + def _format_with_header( + self, header: List[str], na_rep: str = "NaT", date_format: Optional[str] = None + ) -> List[str]: return header + list( 
self._format_native_types(na_rep=na_rep, date_format=date_format) ) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9548ebbd9c3b2..446e57d58a779 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -948,7 +948,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): # Rendering Methods # __repr__ associated methods are based on MultiIndex - def _format_with_header(self, header, na_rep="NaN") -> List[str]: + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: return header + list(self._format_native_types(na_rep=na_rep)) def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index eee610681087d..dcc0bdd86a98b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, Optional +from typing import Any, List, Optional import warnings import numpy as np @@ -195,6 +195,15 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: + if not len(self._range): + return header + first_val_str = str(self._range[0]) + last_val_str = str(self._range[-1]) + max_length = max(len(first_val_str), len(last_val_str)) + + return header + [f"{x:<{max_length}}" for x in self._range] + # -------------------------------------------------------------------- _deprecation_message = ( "RangeIndex.{} is deprecated and will be " diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 3b41c4bfacf73..5f82203d92dc3 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1,5 +1,5 @@ import gc -from typing import Optional, Type +from typing import Type import numpy as np import pytest @@ 
-33,7 +33,7 @@ class Base: """ base class for index sub-class tests """ - _holder: Optional[Type[Index]] = None + _holder: Type[Index] _compat_props = ["shape", "ndim", "size", "nbytes"] def create_index(self) -> Index: @@ -648,6 +648,12 @@ def test_format(self): expected = [str(x) for x in idx] assert idx.format() == expected + def test_format_empty(self): + # GH35712 + empty_idx = self._holder([]) + assert empty_idx.format() == [] + assert empty_idx.format(name=True) == [""] + def test_hasnans_isnans(self, index): # GH 11343, added tests for hasnans / isnans if isinstance(index, MultiIndex): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 15a88ab3819ce..085d41aaa5b76 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -536,6 +536,12 @@ def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key): with pytest.raises(KeyError, match=msg): df.loc[key] + def test_format_empty(self): + # GH35712 + empty_idx = self._holder([], freq="A") + assert empty_idx.format() == [] + assert empty_idx.format(name=True) == [""] + def test_maybe_convert_timedelta(): pi = PeriodIndex(["2000", "2001"], freq="D") diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 5b6f9cb358b7d..3bd3f6cc09db7 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -166,8 +166,14 @@ def test_cached_data(self): idx.any() assert idx._cached_data is None + idx.format() + assert idx._cache == {} + df = pd.DataFrame({"a": range(10)}, index=idx) + str(df) + assert idx._cache == {} + df.loc[50] assert idx._cached_data is None @@ -506,3 +512,9 @@ def test_engineless_lookup(self): idx.get_loc("a") assert "_engine" not in idx._cache + + def test_format_empty(self): + # GH35712 + empty_idx = self._holder(0) + assert empty_idx.format() == [] + assert empty_idx.format(name=True) == 
[""] From d7a387bcd8ecb06fd6600dfd8a4f4dfc32db591a Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Thu, 27 Aug 2020 04:22:19 -0500 Subject: [PATCH 49/83] "Backport PR #35838 on branch 1.1.x" (#35915) --- doc/source/whatsnew/v1.1.2.rst | 2 ++ pandas/core/construction.py | 6 ++++-- pandas/core/dtypes/cast.py | 7 +++++-- pandas/tests/series/test_constructors.py | 15 +++++++++++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index d60119f28c053..5c4e770c7b33c 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -24,6 +24,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ + +- Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) - diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 47f10f1f65f4a..e8c9f28e50084 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -35,6 +35,7 @@ is_iterator, is_list_like, is_object_dtype, + is_sparse, is_timedelta64_ns_dtype, ) from pandas.core.dtypes.generic import ( @@ -535,9 +536,10 @@ def _try_cast( if maybe_castable(arr) and not copy and dtype is None: return arr - if isinstance(dtype, ExtensionDtype) and dtype.kind != "M": + if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)): # create an extension array from its dtype - # DatetimeTZ case needs to go through maybe_cast_to_datetime + # DatetimeTZ case needs to go through maybe_cast_to_datetime but + # SparseDtype does not array_type = dtype.construct_array_type()._from_sequence subarr = array_type(arr, dtype=dtype, 
copy=copy) return subarr diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2697f42eb05a4..e6b4cb598989b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -50,6 +50,7 @@ is_numeric_dtype, is_object_dtype, is_scalar, + is_sparse, is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, @@ -1323,7 +1324,9 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): f"Please pass in '{dtype.name}[ns]' instead." ) - if is_datetime64 and not is_dtype_equal(dtype, DT64NS_DTYPE): + if is_datetime64 and not is_dtype_equal( + getattr(dtype, "subtype", dtype), DT64NS_DTYPE + ): # pandas supports dtype whose granularity is less than [ns] # e.g., [ps], [fs], [as] @@ -1355,7 +1358,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): if is_scalar(value): if value == iNaT or isna(value): value = iNaT - else: + elif not is_sparse(value): value = np.array(value, copy=False) # have a scalar array-like (e.g. NaT) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1dd410ad02ee0..bcf7039ec9039 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1449,3 +1449,18 @@ def test_constructor_datetimelike_scalar_to_string_dtype(self): result = Series("M", index=[1, 2, 3], dtype="string") expected = pd.Series(["M", "M", "M"], index=[1, 2, 3], dtype="string") tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + [np.datetime64("2012-01-01"), np.datetime64("2013-01-01")], + ["2012-01-01", "2013-01-01"], + ], + ) + def test_constructor_sparse_datetime64(self, values): + # https://github.com/pandas-dev/pandas/issues/35762 + dtype = pd.SparseDtype("datetime64[ns]") + result = pd.Series(values, dtype=dtype) + arr = pd.arrays.SparseArray(values, dtype=dtype) + expected = pd.Series(arr) + tm.assert_series_equal(result, expected) From 08a090e0a27c9c813daca0d7b0a49513960c8a12 Mon Sep 17 
00:00:00 2001 From: Simon Hawkins Date: Thu, 27 Aug 2020 11:37:06 +0100 Subject: [PATCH 50/83] Backport PR #35794: BUG: issubclass check with dtype instead of type, closes GH#24883 (#35919) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/core/computation/ops.py | 14 +++++++++++--- pandas/tests/frame/test_query_eval.py | 7 +++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 5c4e770c7b33c..a87e06678faad 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -24,7 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - +- Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index bc9ff7c44b689..e55df1e1d8155 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -481,13 +481,21 @@ def stringify(value): self.lhs.update(v) def _disallow_scalar_only_bool_ops(self): + rhs = self.rhs + lhs = self.lhs + + # GH#24883 unwrap dtype if necessary to ensure we have a type object + rhs_rt = rhs.return_type + rhs_rt = getattr(rhs_rt, "type", rhs_rt) + lhs_rt = lhs.return_type + lhs_rt = getattr(lhs_rt, "type", lhs_rt) if ( - (self.lhs.is_scalar or self.rhs.is_scalar) + (lhs.is_scalar or rhs.is_scalar) and self.op in _bool_ops_dict and ( not ( - issubclass(self.rhs.return_type, (bool, np.bool_)) - and issubclass(self.lhs.return_type, (bool, np.bool_)) + issubclass(rhs_rt, (bool, np.bool_)) + and 
issubclass(lhs_rt, (bool, np.bool_)) ) ) ): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 628b955a1de92..56d178daee7fd 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -160,6 +160,13 @@ def test_eval_resolvers_as_list(self): assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] + def test_eval_object_dtype_binop(self): + # GH#24883 + df = pd.DataFrame({"a1": ["Y", "N"]}) + res = df.eval("c = ((a1 == 'Y') & True)") + expected = pd.DataFrame({"a1": ["Y", "N"], "c": [True, False]}) + tm.assert_frame_equal(res, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): From aa407178d3cb578597fe33b19629a0c1822ef009 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 1 Sep 2020 07:14:38 -0700 Subject: [PATCH 51/83] Backport PR #36011: CI: suppress another setuptools warning (#36013) Co-authored-by: jbrockmendel --- pandas/tests/util/test_show_versions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 04e841c05e44a..fe5fc3e21d960 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -25,6 +25,7 @@ # https://github.com/pandas-dev/pandas/issues/35252 "ignore:Distutils:UserWarning" ) +@pytest.mark.filterwarnings("ignore:Setuptools is replacing distutils:UserWarning") def test_show_versions(capsys): # gh-32041 pd.show_versions() From 9ae9673489d89fc3771ff8934aefbf2ae0b212d7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 1 Sep 2020 15:16:35 +0100 Subject: [PATCH 52/83] Backport PR #35936: (REGR: Fix inplace updates on column to set correct values) (#36009) Co-authored-by: Joris Van den Bossche --- 
doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/internals/managers.py | 1 + pandas/tests/extension/test_numpy.py | 6 ++++++ pandas/tests/frame/test_block_internals.py | 14 ++++++++++++++ 4 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index a87e06678faad..b0d375a52f8ac 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) +- Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4c3805f812bb0..c4a866edba490 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1046,6 +1046,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items """ + value = extract_array(value, extract_numpy=True) # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical if self._blklocs is None and self.ndim > 1: diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 78000c0252375..11dc3069ee08e 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -354,6 +354,12 @@ def test_fillna_frame(self, data_missing): # Non-scalar "scalar" values. 
super().test_fillna_frame(data_missing) + @pytest.mark.skip("Invalid test") + def test_fillna_fill_other(self, data): + # inplace update doesn't work correctly with patched extension arrays + # extract_array returns PandasArray, while dtype is a numpy dtype + super().test_fillna_fill_other(data_missing) + class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): @pytest.mark.skip("Incorrect parent test") diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c9fec3215d57f..b8183eb9f4185 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -626,3 +626,17 @@ def test_add_column_with_pandas_array(self): assert type(df["c"]._mgr.blocks[0]) == ObjectBlock assert type(df2["c"]._mgr.blocks[0]) == ObjectBlock tm.assert_frame_equal(df, df2) + + +def test_update_inplace_sets_valid_block_values(): + # https://github.com/pandas-dev/pandas/issues/33457 + df = pd.DataFrame({"a": pd.Series([1, 2, None], dtype="category")}) + + # inplace update of a single column + df["a"].fillna(1, inplace=True) + + # check we havent put a Series into any block.values + assert isinstance(df._mgr.blocks[0].values, pd.Categorical) + + # smoketest for OP bug from GH#35731 + assert df.isnull().sum().sum() == 0 From 1253599cd01ce11ab10aaf749bdbfff21611b49c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 1 Sep 2020 16:26:04 +0100 Subject: [PATCH 53/83] CI: pin s3fs for Windows py37_np18 on 1.1.x (#36035) --- ci/deps/azure-windows-37.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 287d6877b9810..4d134b43760fe 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -29,7 +29,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs>=0.4.0 + - s3fs>=0.4.0,<0.5.0 - scipy - sqlalchemy - xlrd From 9c38e495269e8c9c3ba8dad486a77694b28794db Mon Sep 17 00:00:00 2001 From: 
MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 1 Sep 2020 09:57:16 -0700 Subject: [PATCH 54/83] Backport PR #35999: BUG: None in Float64Index raising TypeError, should return False (#36041) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/_libs/index.pyx | 6 +++++- pandas/tests/indexes/numeric/test_indexing.py | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index b0d375a52f8ac..ad5f647928738 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -29,7 +29,7 @@ Bug fixes - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) -- +- Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index d6659cc1895b1..569562f5b5037 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -80,7 +80,11 @@ cdef class IndexEngine: values = self._get_index_values() self._check_type(val) - loc = _bin_search(values, val) # .searchsorted(val, side='left') + try: + loc = _bin_search(values, val) # .searchsorted(val, side='left') + except TypeError: + # GH#35788 e.g. 
val=None with float64 values + raise KeyError(val) if loc >= len(values): raise KeyError(val) if values[loc] != val: diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 473e370c76f8b..508bd2f566507 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -228,6 +228,12 @@ def test_take_fill_value_ints(self, klass): class TestContains: + @pytest.mark.parametrize("klass", [Float64Index, Int64Index, UInt64Index]) + def test_contains_none(self, klass): + # GH#35788 should return False, not raise TypeError + index = klass([0, 1, 2, 3, 4]) + assert None not in index + def test_contains_float64_nans(self): index = Float64Index([1.0, 2.0, np.nan]) assert np.nan in index From 736550d7277ac925015ad7652dee618c4d95d7f5 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 1 Sep 2020 11:29:07 -0700 Subject: [PATCH 55/83] Backport PR #35938: REGR: Fix comparison broadcasting over array of Intervals (#36039) Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/_libs/interval.pyx | 5 +++++ pandas/tests/frame/methods/test_replace.py | 7 +++++++ pandas/tests/scalar/interval/test_arithmetic.py | 12 ++++++++++++ pandas/tests/scalar/interval/test_interval.py | 9 +++++++++ 5 files changed, 34 insertions(+) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index ad5f647928738..9be5b5f0ad2dc 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. 
using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) +- Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 6867e8aba7411..40bd5ad8f5a1f 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -358,6 +358,11 @@ cdef class Interval(IntervalMixin): self_tuple = (self.left, self.right, self.closed) other_tuple = (other.left, other.right, other.closed) return PyObject_RichCompare(self_tuple, other_tuple, op) + elif util.is_array(other): + return np.array( + [PyObject_RichCompare(self, x, op) for x in other], + dtype=bool, + ) return NotImplemented diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 8603bff0587b6..83dfd42ae2a6e 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1581,3 +1581,10 @@ def test_replace_with_compiled_regex(self): result = df.replace({regex: "z"}, regex=True) expected = pd.DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) + + def test_replace_intervals(self): + # https://github.com/pandas-dev/pandas/issues/35931 + df = pd.DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + expected = pd.DataFrame({"a": ["x", "x"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/interval/test_arithmetic.py b/pandas/tests/scalar/interval/test_arithmetic.py index 5252f1a4d5a24..b4c2b448e252a 100644 --- a/pandas/tests/scalar/interval/test_arithmetic.py +++ b/pandas/tests/scalar/interval/test_arithmetic.py @@ -45,3 +45,15 @@ def test_numeric_interval_add_timedelta_raises(interval, delta): with 
pytest.raises((TypeError, ValueError), match=msg): delta + interval + + +@pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta]) +def test_timdelta_add_timestamp_interval(klass): + delta = klass(0) + expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01")) + + result = delta + expected + assert result == expected + + result = expected + delta + assert result == expected diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index a0151bb9ac7bf..8ad9a2c7a9c70 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -2,6 +2,7 @@ import pytest from pandas import Interval, Period, Timedelta, Timestamp +import pandas._testing as tm import pandas.core.common as com @@ -267,3 +268,11 @@ def test_constructor_errors_tz(self, tz_left, tz_right): msg = "left and right must have the same time zone" with pytest.raises(error, match=msg): Interval(left, right) + + def test_equality_comparison_broadcasts_over_array(self): + # https://github.com/pandas-dev/pandas/issues/35931 + interval = Interval(0, 1) + arr = np.array([interval, interval]) + result = interval == arr + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) From f9fed6a2ef90cdca7bb58ebbaaf8766a5fd9b491 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Sep 2020 00:27:10 +0100 Subject: [PATCH 56/83] CI: pin setuptools on 1.1.x (#36048) --- ci/setup_env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/setup_env.sh b/ci/setup_env.sh index aa43d8b7dd00a..065f9e56ea171 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -148,7 +148,7 @@ python setup.py build_ext -q -i -j2 # - py35_compat # - py36_32bit echo "[Updating pip]" -python -m pip install --no-deps -U pip wheel setuptools +python -m pip install --no-deps -U pip wheel "setuptools<50.0.0" echo "[Install pandas]" python -m pip install --no-build-isolation -e . 
From 9bc223e025d829460f7848381b98311d4890ae90 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 2 Sep 2020 09:37:25 -0700 Subject: [PATCH 57/83] Backport PR #35852: API: replace dropna=False option with na_sentinel=None in factorize (#36071) Co-authored-by: Kaiqi Dong --- doc/source/whatsnew/v1.1.2.rst | 8 ++++++ pandas/core/algorithms.py | 33 +++++++++++++++++++--- pandas/core/base.py | 2 +- pandas/core/groupby/grouper.py | 7 ++++- pandas/tests/base/test_factorize.py | 13 +++++++++ pandas/tests/test_algos.py | 44 ++++++----------------------- 6 files changed, 66 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 9be5b5f0ad2dc..dc7adf6d9d00e 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -34,6 +34,14 @@ Bug fixes .. --------------------------------------------------------------------------- +.. _whatsnew_112.other: + +Other +~~~~~ +- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize`(:issue:`35667`) + +.. --------------------------------------------------------------------------- + .. _whatsnew_112.contributors: Contributors diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9e3ca4cc53363..856b4ead3f3cc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -525,9 +525,8 @@ def _factorize_array( def factorize( values, sort: bool = False, - na_sentinel: int = -1, + na_sentinel: Optional[int] = -1, size_hint: Optional[int] = None, - dropna: bool = True, ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: """ Encode the object as an enumerated type or categorical variable. @@ -540,8 +539,11 @@ def factorize( Parameters ---------- {values}{sort} - na_sentinel : int, default -1 - Value to mark "not found". 
+ na_sentinel : int or None, default -1 + Value to mark "not found". If None, will not drop the NaN + from the uniques of the values. + + .. versionchanged:: 1.1.2 {size_hint}\ Returns @@ -619,6 +621,22 @@ def factorize( array([0, 0, 1]...) >>> uniques Index(['a', 'c'], dtype='object') + + If NaN is in the values, and we want to include NaN in the uniques of the + values, it can be achieved by setting ``na_sentinel=None``. + + >>> values = np.array([1, 2, 1, np.nan]) + >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 + >>> codes + array([ 0, 1, 0, -1]) + >>> uniques + array([1., 2.]) + + >>> codes, uniques = pd.factorize(values, na_sentinel=None) + >>> codes + array([0, 1, 0, 2]) + >>> uniques + array([ 1., 2., nan]) """ # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) @@ -632,6 +650,13 @@ def factorize( values = _ensure_arraylike(values) original = values + # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques + # of values, assign na_sentinel=-1 to replace code value for NaN. 
+ dropna = True + if na_sentinel is None: + na_sentinel = -1 + dropna = False + if is_extension_array_dtype(values.dtype): values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) diff --git a/pandas/core/base.py b/pandas/core/base.py index b62ef668df5e1..1926803d8f04b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1398,7 +1398,7 @@ def memory_usage(self, deep=False): """ ), ) - def factorize(self, sort=False, na_sentinel=-1): + def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1): return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) _shared_docs[ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8239a792c65dd..272afe7335c6a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -585,8 +585,13 @@ def _make_codes(self) -> None: codes = self.grouper.codes_info uniques = self.grouper.result_index else: + # GH35667, replace dropna=False with na_sentinel=None + if not self.dropna: + na_sentinel = None + else: + na_sentinel = -1 codes, uniques = algorithms.factorize( - self.grouper, sort=self.sort, dropna=self.dropna + self.grouper, sort=self.sort, na_sentinel=na_sentinel ) uniques = Index(uniques, name=self.name) self._codes = codes diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py index 415a8b7e4362f..9fad9856d53cc 100644 --- a/pandas/tests/base/test_factorize.py +++ b/pandas/tests/base/test_factorize.py @@ -26,3 +26,16 @@ def test_factorize(index_or_series_obj, sort): tm.assert_numpy_array_equal(result_codes, expected_codes) tm.assert_index_equal(result_uniques, expected_uniques) + + +def test_series_factorize_na_sentinel_none(): + # GH35667 + values = np.array([1, 2, 1, np.nan]) + ser = pd.Series(values) + codes, uniques = ser.factorize(na_sentinel=None) + + expected_codes = np.array([0, 1, 0, 2], dtype="int64") + expected_uniques = pd.Index([1.0, 2.0, np.nan]) + + 
tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_index_equal(uniques, expected_uniques) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a080bf0feaebc..326c926238f89 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,73 +326,47 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): tm.assert_extension_array_equal(uniques, expected_uniques) @pytest.mark.parametrize( - "data, dropna, expected_codes, expected_uniques", + "data, expected_codes, expected_uniques", [ ( ["a", None, "b", "a"], - True, - np.array([0, -1, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b"], dtype=object), - ), - ( - ["a", np.nan, "b", "a"], - True, - np.array([0, -1, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b"], dtype=object), - ), - ( - ["a", None, "b", "a"], - False, np.array([0, 2, 1, 0], dtype=np.dtype("intp")), np.array(["a", "b", np.nan], dtype=object), ), ( ["a", np.nan, "b", "a"], - False, np.array([0, 2, 1, 0], dtype=np.dtype("intp")), np.array(["a", "b", np.nan], dtype=object), ), ], ) - def test_object_factorize_dropna( - self, data, dropna, expected_codes, expected_uniques + def test_object_factorize_na_sentinel_none( + self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, dropna=dropna) + codes, uniques = algos.factorize(data, na_sentinel=None) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) @pytest.mark.parametrize( - "data, dropna, expected_codes, expected_uniques", + "data, expected_codes, expected_uniques", [ ( [1, None, 1, 2], - True, - np.array([0, -1, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2], dtype="O"), - ), - ( - [1, np.nan, 1, 2], - True, - np.array([0, -1, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2], dtype=np.float64), - ), - ( - [1, None, 1, 2], - False, np.array([0, 2, 0, 1], dtype=np.dtype("intp")), np.array([1, 2, np.nan], dtype="O"), ), ( [1, np.nan, 1, 2], 
- False, np.array([0, 2, 0, 1], dtype=np.dtype("intp")), np.array([1, 2, np.nan], dtype=np.float64), ), ], ) - def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques): - codes, uniques = algos.factorize(data, dropna=dropna) + def test_int_factorize_na_sentinel_none( + self, data, expected_codes, expected_uniques + ): + codes, uniques = algos.factorize(data, na_sentinel=None) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) From 6c2a9d888f7cd56cd67d05fb022af3d5815b0dc6 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 2 Sep 2020 10:09:17 -0700 Subject: [PATCH 58/83] Backport PR #36051: BUG: frame._item_cache not cleared when Series is altered (#36072) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/generic.py | 4 ++++ pandas/tests/frame/test_missing.py | 15 +++++++++++---- .../tests/indexing/test_chaining_and_caching.py | 15 +++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index dc7adf6d9d00e..17d51c9121f43 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -31,6 +31,7 @@ Bug fixes - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) +- Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`36051`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8bd1dbea4696f..67e5759b39808 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3233,6 +3233,10 @@ def _maybe_update_cacher( if len(self) == len(ref): # otherwise, either self or ref has swapped in new arrays ref._maybe_cache_changed(cacher[0], self) + else: + # GH#33675 we have swapped in a new array, so parent + # reference to self is now invalid + ref._item_cache.pop(cacher[0], None) if verify_is_copy: self._check_setitem_copy(stacklevel=5, t="referant") diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 9bf5d24085697..b4f91590e09d1 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -135,13 +135,20 @@ def test_drop_and_dropna_caching(self): df2 = df.copy() df["A"].dropna() tm.assert_series_equal(df["A"], original) - return_value = df["A"].dropna(inplace=True) - tm.assert_series_equal(df["A"], expected) + + ser = df["A"] + return_value = ser.dropna(inplace=True) + tm.assert_series_equal(ser, expected) + tm.assert_series_equal(df["A"], original) assert return_value is None + df2["A"].drop([1]) tm.assert_series_equal(df2["A"], original) - return_value = df2["A"].drop([1], inplace=True) - tm.assert_series_equal(df2["A"], original.drop([1])) + + ser = df2["A"] + return_value = ser.drop([1], inplace=True) + tm.assert_series_equal(ser, original.drop([1])) + tm.assert_series_equal(df2["A"], original) assert return_value is None def test_dropna_corner(self, float_frame): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index fa5fe5ba5c384..9910ef1b04b1a 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -81,6 +81,21 @@ def test_setitem_cache_updating(self): tm.assert_frame_equal(out, expected) 
tm.assert_series_equal(out["A"], expected["A"]) + def test_altering_series_clears_parent_cache(self): + # GH #33675 + df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"]) + ser = df["A"] + + assert "A" in df._item_cache + + # Adding a new entry to ser swaps in a new array, so "A" needs to + # be removed from df._item_cache + ser["c"] = 5 + assert len(ser) == 3 + assert "A" not in df._item_cache + assert df["A"] is not ser + assert len(df["A"]) == 2 + class TestChaining: def test_setitem_chained_setfault(self): From 59a4f780d0f266be2a84f4e06f11d49f9a9c0d35 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 3 Sep 2020 11:02:49 -0700 Subject: [PATCH 59/83] Backport PR #36086: DOC: minor fixes to whatsnew\v1.1.2.rst (#36095) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.1.2.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 17d51c9121f43..8695ff8d11e6d 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -29,9 +29,9 @@ Bug fixes - Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) -- Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) +- Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) -- Bug in :class:`DataFrame` indexing returning an 
incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`36051`) +- Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) .. --------------------------------------------------------------------------- @@ -39,7 +39,7 @@ Bug fixes Other ~~~~~ -- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize`(:issue:`35667`) +- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize` (:issue:`35667`) .. --------------------------------------------------------------------------- From 635361b282eca3abb9267f88bcef0f87897364f9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 4 Sep 2020 12:44:21 +0100 Subject: [PATCH 60/83] DOC: sync doc/source/whatsnew/v1.1.2.rst on 1.1.x (#36112) --- doc/source/whatsnew/v1.1.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 8695ff8d11e6d..ac9fe9d2fca26 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -20,6 +20,7 @@ Fixed regressions - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - + .. --------------------------------------------------------------------------- .. 
_whatsnew_112.bug_fixes: From 8d057b22ec04acf3c316ef5f1861f1ecd5580e5d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 4 Sep 2020 08:14:25 -0700 Subject: [PATCH 61/83] Backport PR #36061: BUG: groupby and agg on read-only array gives ValueError: buffer source array is read-only (#36117) Co-authored-by: Jeet Parekh <12874561+jeet-parekh@users.noreply.github.com> --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/_libs/groupby.pyx | 32 ++++++++------- pandas/tests/groupby/aggregate/test_cython.py | 41 +++++++++++++++++++ 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index ac9fe9d2fca26..7195f3d7a3885 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -18,7 +18,7 @@ Fixed regressions - Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) -- +- Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 38cb973d6dde9..a83634aad3ce2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -229,7 +229,7 @@ def group_cumprod_float64(float64_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_cumsum(numeric[:, :] out, - numeric[:, :] values, + ndarray[numeric, ndim=2] values, const int64_t[:] labels, int ngroups, is_datetimelike, @@ -472,7 +472,7 @@ ctypedef fused complexfloating_t: @cython.boundscheck(False) def _group_add(complexfloating_t[:, :] out, int64_t[:] counts, - complexfloating_t[:, :] values, + ndarray[complexfloating_t, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=0): """ @@ -483,8 +483,9 @@ def _group_add(complexfloating_t[:, :] out, complexfloating_t val, count complexfloating_t[:, :] sumx int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) - if len(values) != len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -530,7 +531,7 @@ group_add_complex128 = _group_add['double complex'] @cython.boundscheck(False) def _group_prod(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=0): """ @@ -541,8 +542,9 @@ def _group_prod(floating[:, :] out, floating val, count floating[:, :] prodx int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) - if not len(values) == len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -582,7 +584,7 @@ group_prod_float64 = _group_prod['double'] @cython.cdivision(True) def _group_var(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t 
min_count=-1, int64_t ddof=1): @@ -591,10 +593,11 @@ def _group_var(floating[:, :] out, floating val, ct, oldmean floating[:, :] mean int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -639,7 +642,7 @@ group_var_float64 = _group_var['double'] @cython.boundscheck(False) def _group_mean(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): cdef: @@ -647,10 +650,11 @@ def _group_mean(floating[:, :] out, floating val, count floating[:, :] sumx int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -689,7 +693,7 @@ group_mean_float64 = _group_mean['double'] @cython.boundscheck(False) def _group_ohlc(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): """ @@ -740,7 +744,7 @@ group_ohlc_float64 = _group_ohlc['double'] @cython.boundscheck(False) @cython.wraparound(False) def group_quantile(ndarray[float64_t] out, - numeric[:] values, + ndarray[numeric, ndim=1] values, ndarray[int64_t] labels, ndarray[uint8_t] mask, float64_t q, @@ -1072,7 +1076,7 @@ def group_nth(rank_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_rank(float64_t[:, :] out, - rank_t[:, :] values, + ndarray[rank_t, ndim=2] values, const int64_t[:] labels, int ngroups, bint is_datetimelike, object ties_method="average", @@ -1424,7 +1428,7 @@ def group_min(groupby_t[:, :] out, 
@cython.boundscheck(False) @cython.wraparound(False) def group_cummin(groupby_t[:, :] out, - groupby_t[:, :] values, + ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, int ngroups, bint is_datetimelike): @@ -1484,7 +1488,7 @@ def group_cummin(groupby_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_cummax(groupby_t[:, :] out, - groupby_t[:, :] values, + ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, int ngroups, bint is_datetimelike): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5ddda264642de..87ebd8b5a27fb 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -236,3 +236,44 @@ def test_cython_with_timestamp_and_nat(op, data): result = df.groupby("a").aggregate(op) tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + "agg", + [ + "min", + "max", + "count", + "sum", + "prod", + "var", + "mean", + "median", + "ohlc", + "cumprod", + "cumsum", + "shift", + "any", + "all", + "quantile", + "first", + "last", + "rank", + "cummin", + "cummax", + ], +) +def test_read_only_buffer_source_agg(agg): + # https://github.com/pandas-dev/pandas/issues/36014 + df = DataFrame( + { + "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0], + "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], + } + ) + df._mgr.blocks[0].values.flags.writeable = False + + result = df.groupby(["species"]).agg({"sepal_length": agg}) + expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) + + tm.assert_equal(result, expected) From 820218de154df7fd4a56180e88c2a720dd98c875 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 4 Sep 2020 10:49:49 -0700 Subject: [PATCH 62/83] Backport PR #36118: REGR: ensure closed attribute of IntervalIndex is preserved in pickle roundtrip (#36119) Co-authored-by: Joris Van den Bossche --- 
doc/source/whatsnew/v1.1.2.rst | 3 ++- pandas/core/indexes/interval.py | 2 +- pandas/tests/indexes/interval/test_interval.py | 7 +++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 7195f3d7a3885..232d0c4b4bbcd 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -18,8 +18,9 @@ Fixed regressions - Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) +- Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) - +- .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 446e57d58a779..dcf89f24ebaf2 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -190,7 +190,7 @@ def func(intvidx_self, other, sort=False): class IntervalIndex(IntervalMixin, ExtensionIndex): _typ = "intervalindex" _comparables = ["name"] - _attributes = ["name"] + _attributes = ["name", "closed"] # we would like our indexing holder to defer to us _defer_to_indexing = True diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 2755b186f3eae..a20e542b1edd7 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -874,6 +874,13 @@ def test_get_value_non_scalar_errors(self, key): with tm.assert_produces_warning(FutureWarning): idx.get_value(s, key) + @pytest.mark.parametrize("closed", ["left", "right", "both"]) + def test_pickle_round_trip_closed(self, closed): + # https://github.com/pandas-dev/pandas/issues/35658 + idx = IntervalIndex.from_tuples([(1, 2), (2, 3)], closed=closed) + result = tm.round_trip_pickle(idx) + tm.assert_index_equal(result, idx) + def test_dir(): # GH#27571 dir(interval_index) should not raise From 45bf9110cb60ea0d6fe3db0d2053c46ad3eea721 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 4 Sep 2020 14:58:13 -0700 Subject: [PATCH 63/83] Backport PR #36050: BUG: incorrect year returned in isocalendar for certain dates (#36127) Co-authored-by: Asish Mahapatra --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/_libs/tslibs/ccalendar.pyx | 4 ++-- pandas/tests/series/test_datetime_values.py | 3 +++ pandas/tests/tslibs/test_ccalendar.py | 15 +++++++++++++++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst 
index 232d0c4b4bbcd..39850905f60fa 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -33,6 +33,7 @@ Bug fixes - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) +- Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 6cce2f5e1fd95..d8c83daa661a3 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -201,10 +201,10 @@ cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil: iso_week = 1 iso_year = year - if iso_week == 1 and doy > 7: + if iso_week == 1 and month == 12: iso_year += 1 - elif iso_week >= 52 and doy < 7: + elif iso_week >= 52 and month == 1: iso_year -= 1 return iso_year, iso_week, dow + 1 diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index d2ad9c8c398ea..723bd303b1974 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -682,6 +682,9 @@ def test_setitem_with_different_tz(self): [[pd.NaT], [[np.NaN, np.NaN, np.NaN]]], [["2019-12-31", "2019-12-29"], [[2020, 1, 2], [2019, 52, 7]]], [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.NaN, np.NaN, np.NaN]]], + # see GH#36032 + [["2016-01-08", 
"2016-01-04"], [[2016, 1, 5], [2016, 1, 1]]], + [["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]], ], ) def test_isocalendar(self, input_series, expected_output): diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py index aab86d3a2df69..1ff700fdc23a3 100644 --- a/pandas/tests/tslibs/test_ccalendar.py +++ b/pandas/tests/tslibs/test_ccalendar.py @@ -1,10 +1,13 @@ from datetime import date, datetime +from hypothesis import given, strategies as st import numpy as np import pytest from pandas._libs.tslibs import ccalendar +import pandas as pd + @pytest.mark.parametrize( "date_tuple,expected", @@ -48,3 +51,15 @@ def test_dt_correct_iso_8601_year_week_and_day(input_date_tuple, expected_iso_tu expected_from_date_isocalendar = date(*input_date_tuple).isocalendar() assert result == expected_from_date_isocalendar assert result == expected_iso_tuple + + +@given( + st.datetimes( + min_value=pd.Timestamp.min.to_pydatetime(warn=False), + max_value=pd.Timestamp.max.to_pydatetime(warn=False), + ) +) +def test_isocalendar(dt): + expected = dt.isocalendar() + result = ccalendar.get_iso_calendar(dt.year, dt.month, dt.day) + assert result == expected From 869e15a0384c1f9ff5a4906f231d5fe1beea5cdb Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 5 Sep 2020 00:56:13 -0700 Subject: [PATCH 64/83] Backport PR #36114: REGR: fix consolidation/cache issue with take operation (#36135) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/generic.py | 2 ++ pandas/tests/frame/test_block_internals.py | 23 ++++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 39850905f60fa..d1a66256454ca 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Regression in :meth:`DatetimeIndex.intersection` incorrectly 
raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) +- Fix regression in invalid cache after an indexing operation; this can manifest when setting which does not update the data (:issue:`35521`) - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 67e5759b39808..935bad2624637 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3342,6 +3342,8 @@ class max_speed nv.validate_take(tuple(), kwargs) + self._consolidate_inplace() + new_data = self._mgr.take( indices, axis=self._get_block_manager_axis(axis), verify=True ) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index b8183eb9f4185..217409d56c3ab 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -640,3 +640,26 @@ def test_update_inplace_sets_valid_block_values(): # smoketest for OP bug from GH#35731 assert df.isnull().sum().sum() == 0 + + +def test_nonconsolidated_item_cache_take(): + # https://github.com/pandas-dev/pandas/issues/35521 + + # create non-consolidated dataframe with object dtype columns + df = pd.DataFrame() + df["col1"] = pd.Series(["a"], dtype=object) + df["col2"] = pd.Series([0], dtype=object) + + # access column (item cache) + df["col1"] == "A" + # take operation + # (regression was that this consolidated but 
didn't reset item cache, + # resulting in an invalid cache and the .at operation not working properly) + df[df["col2"] == 0] + + # now setting value should update actual dataframe + df.at[0, "col1"] = "A" + + expected = pd.DataFrame({"col1": ["A"], "col2": [0]}, dtype=object) + tm.assert_frame_equal(df, expected) + assert df.at[0, "col1"] == "A" From 169a69237a4e3621ee8098293767c1cab05897a6 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 6 Sep 2020 08:13:32 -0700 Subject: [PATCH 65/83] Backport PR #35914: Make MultiIndex.get_loc raise for unhashable type (#36163) Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/indexes/multi.py | 5 +++-- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/indexing/multiindex/test_multiindex.py | 8 ++++++++ pandas/tests/series/indexing/test_setitem.py | 11 ++++++++++- 5 files changed, 23 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index d1a66256454ca..0fa5dd30f8cd9 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. 
using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) +- Regression where :meth:`MultiIndex.get_loc` would return a slice spanning the full index when passed an empty list (:issue:`35878`) - Fix regression in invalid cache after an indexing operation; this can manifest when setting which does not update the data (:issue:`35521`) - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 235da89083d0a..09504d50bbf40 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2681,6 +2681,8 @@ def get_loc(self, key, method=None): "currently supported for MultiIndex" ) + hash(key) + def _maybe_to_slice(loc): """convert integer indexer to boolean mask or slice if possible""" if not isinstance(loc, np.ndarray) or loc.dtype != "int64": @@ -2695,8 +2697,7 @@ def _maybe_to_slice(loc): mask[loc] = True return mask - if not isinstance(key, (tuple, list)): - # not including list here breaks some indexing, xref #30892 + if not isinstance(key, tuple): loc = self._get_level_indexer(key, level=0) return _maybe_to_slice(loc) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index d27487dfb8aaa..e4549dfb3e68d 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -2111,7 +2111,7 @@ def test_type_error_multiindex(self): ) dg = df.pivot_table(index="i", columns="c", values=["x", "y"]) - with pytest.raises(TypeError, match="is an invalid key"): + with pytest.raises(TypeError, match="unhashable type"): dg[:, 0] index = Index(range(2), name="i") diff --git 
a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 5e5fcd3db88d8..4565d79c632de 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas._libs.index as _index from pandas.errors import PerformanceWarning @@ -83,3 +84,10 @@ def test_nested_tuples_duplicates(self): df3 = df.copy(deep=True) df3.loc[[(dti[0], "a")], "c2"] = 1.0 tm.assert_frame_equal(df3, expected) + + def test_multiindex_get_loc_list_raises(self): + # https://github.com/pandas-dev/pandas/issues/35878 + idx = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)]) + msg = "unhashable type" + with pytest.raises(TypeError, match=msg): + idx.get_loc([]) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 3463de25ad91b..593d1c78a19e2 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1,6 +1,7 @@ import numpy as np -from pandas import NaT, Series, date_range +from pandas import MultiIndex, NaT, Series, date_range +import pandas.testing as tm class TestSetitemDT64Values: @@ -17,3 +18,11 @@ def test_setitem_none_nan(self): series[5:7] = np.nan assert series[6] is NaT + + def test_setitem_multiindex_empty_slice(self): + # https://github.com/pandas-dev/pandas/issues/35878 + idx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + result = Series([1, 2], index=idx) + expected = result.copy() + result.loc[[]] = 0 + tm.assert_series_equal(result, expected) From a61f1ada98ce2546647d9bd4c333ddc5c9125e22 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 6 Sep 2020 08:13:54 -0700 Subject: [PATCH 66/83] Backport PR #36054: BUG: Don't raise when constructing Series from ordered set (#36162) Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- 
doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/construction.py | 9 ++++++--- pandas/tests/series/test_constructors.py | 10 ++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 0fa5dd30f8cd9..dea3cb6f567ac 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -35,6 +35,7 @@ Bug fixes - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) +- Bug in :class:`Series` constructor incorrectly raising a ``TypeError`` when passed an ordered set (:issue:`36044`) - Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e8c9f28e50084..d1c174d48f04b 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -437,7 +437,12 @@ def sanitize_array( subarr = subarr.copy() return subarr - elif isinstance(data, (list, tuple)) and len(data) > 0: + elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: + if isinstance(data, set): + # Raise only for unordered sets, e.g., not for dict_keys + raise TypeError("Set type is unordered") + data = list(data) + if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: @@ -449,8 +454,6 @@ def sanitize_array( # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr 
= _try_cast(arr, dtype, copy, raise_cast_failure) - elif isinstance(data, abc.Set): - raise TypeError("Set type is unordered") elif lib.is_scalar(data) and index is not None and dtype is not None: data = maybe_cast_to_datetime(data, dtype) if not lib.is_scalar(data): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index bcf7039ec9039..ce078059479b4 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1464,3 +1464,13 @@ def test_constructor_sparse_datetime64(self, values): arr = pd.arrays.SparseArray(values, dtype=dtype) expected = pd.Series(arr) tm.assert_series_equal(result, expected) + + def test_construction_from_ordered_collection(self): + # https://github.com/pandas-dev/pandas/issues/36044 + result = Series({"a": 1, "b": 2}.keys()) + expected = Series(["a", "b"]) + tm.assert_series_equal(result, expected) + + result = Series({"a": 1, "b": 2}.values()) + expected = Series([1, 2]) + tm.assert_series_equal(result, expected) From fdfe66cfac7421b85605e699daeca1e9b89959ef Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Sun, 6 Sep 2020 12:17:53 -0500 Subject: [PATCH 67/83] CLN: backport mpl warning fix to 1.1.2 (#36155) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/plotting/_matplotlib/core.py | 2 +- pandas/tests/plotting/test_frame.py | 10 ++++++---- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index dea3cb6f567ac..6a735c38e2af1 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -46,6 +46,7 @@ Bug fixes Other ~~~~~ - :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize` (:issue:`35667`) +- :meth:`DataFrame.plot` and :meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and
FixedLocator (:issue:`35684` and :issue:`35945`) .. --------------------------------------------------------------------------- diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index b490e07e43753..646ed09331278 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1232,8 +1232,8 @@ def get_label(i): if self._need_to_set_index: xticks = ax.get_xticks() xticklabels = [get_label(x) for x in xticks] - ax.set_xticklabels(xticklabels) ax.xaxis.set_major_locator(FixedLocator(xticks)) + ax.set_xticklabels(xticklabels) condition = ( not self._use_dynamic_x() diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index ee43e5d7072fe..9ab697cb57690 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2796,10 +2796,12 @@ def test_table(self): _check_plot_works(df.plot, table=True) _check_plot_works(df.plot, table=df) - ax = df.plot() - assert len(ax.tables) == 0 - plotting.table(ax, df.T) - assert len(ax.tables) == 1 + # GH 35945 UserWarning + with tm.assert_produces_warning(None): + ax = df.plot() + assert len(ax.tables) == 0 + plotting.table(ax, df.T) + assert len(ax.tables) == 1 def test_errorbar_scatter(self): df = DataFrame(np.random.randn(5, 2), index=range(5), columns=["x", "y"]) From ce2f879e0f19bfee1dc386808e7e607b293861ac Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 6 Sep 2020 11:06:23 -0700 Subject: [PATCH 68/83] Backport PR #35979: BUG: Respect errors="ignore" during extension astype (#36167) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/internals/blocks.py | 9 ++++++-- pandas/tests/frame/methods/test_astype.py | 22 +++++++++++++++++++ pandas/tests/series/methods/test_astype.py | 25 +++++++++++++++++++++- 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 
6a735c38e2af1..46696c335b1a8 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -33,6 +33,7 @@ Bug fixes - Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) +- Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` not respecting the ``errors`` argument when set to ``"ignore"`` for extension dtypes (:issue:`35471`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) - Bug in :class:`Series` constructor incorrectly raising a ``TypeError`` when passed an ordered set (:issue:`36044`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f4f4a3666a84e..ea2b0c972d9aa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -565,8 +565,13 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # force the copy here if self.is_extension: - # TODO: Should we try/except this astype? 
- values = self.values.astype(dtype) + try: + values = self.values.astype(dtype) + except (ValueError, TypeError): + if errors == "ignore": + values = self.values + else: + raise else: if issubclass(dtype.type, str): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index b0fd0496ea81e..d3f256259b15f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -8,6 +8,7 @@ CategoricalDtype, DataFrame, DatetimeTZDtype, + Interval, IntervalDtype, NaT, Series, @@ -565,3 +566,24 @@ def test_astype_empty_dtype_dict(self): result = df.astype(dict()) tm.assert_frame_equal(result, df) assert result is not df + + @pytest.mark.parametrize( + "df", + [ + DataFrame(Series(["x", "y", "z"], dtype="string")), + DataFrame(Series(["x", "y", "z"], dtype="category")), + DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])), + DataFrame(Series(3 * [Interval(0, 1)])), + ], + ) + @pytest.mark.parametrize("errors", ["raise", "ignore"]) + def test_astype_ignores_errors_for_extension_dtypes(self, df, errors): + # https://github.com/pandas-dev/pandas/issues/35471 + if errors == "ignore": + expected = df + result = df.astype(float, errors=errors) + tm.assert_frame_equal(result, expected) + else: + msg = "(Cannot cast)|(could not convert)" + with pytest.raises((ValueError, TypeError), match=msg): + df.astype(float, errors=errors) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 9fdc4179de2e1..b9d90a9fc63dd 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -1,4 +1,6 @@ -from pandas import Series, date_range +import pytest + +from pandas import Interval, Series, Timestamp, date_range import pandas._testing as tm @@ -23,3 +25,24 @@ def test_astype_dt64tz_to_str(self): dtype=object, ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + Series(["x", 
"y", "z"], dtype="string"), + Series(["x", "y", "z"], dtype="category"), + Series(3 * [Timestamp("2020-01-01", tz="UTC")]), + Series(3 * [Interval(0, 1)]), + ], + ) + @pytest.mark.parametrize("errors", ["raise", "ignore"]) + def test_astype_ignores_errors_for_extension_dtypes(self, values, errors): + # https://github.com/pandas-dev/pandas/issues/35471 + if errors == "ignore": + expected = values + result = values.astype(float, errors="ignore") + tm.assert_series_equal(result, expected) + else: + msg = "(Cannot cast)|(could not convert)" + with pytest.raises((ValueError, TypeError), match=msg): + values.astype(float, errors=errors) From d0a39d1157300cbeafddae6130c59770f0574d9a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 6 Sep 2020 11:07:55 -0700 Subject: [PATCH 69/83] Backport PR #36115: REGR: append tz-aware DataFrame with tz-naive values (#36166) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/dtypes/concat.py | 6 ++++-- pandas/core/internals/concat.py | 8 ++++++-- pandas/tests/reshape/test_concat.py | 17 +++++++++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 46696c335b1a8..a0e57eb1729e1 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. 
using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) +- Fix regression in :meth:`DataFrame.append` mixing tz-aware and tz-naive datetime columns (:issue:`35460`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - Regression where :meth:`MultiIndex.get_loc` would return a slice spanning the full index when passed an empty list (:issue:`35878`) - Fix regression in invalid cache after an indexing operation; this can manifest when setting which does not update the data (:issue:`35521`) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9902016475b22..dd005752a4832 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -148,15 +148,17 @@ def is_nonempty(x) -> bool: any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) if any_ea: + # we ignore axis here, as internally concatting with EAs is always + # for axis=0 if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] - if isinstance(to_concat[0], ExtensionArray) and axis == 0: + if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: - return np.concatenate(to_concat, axis=axis) + return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs: return concat_datetime(to_concat, axis=axis, typs=typs) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 2c0d4931a7bf2..5d06cb4c48ac3 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -23,7 +23,7 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import DatetimeArray, ExtensionArray from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager @@ -334,9 +334,13 @@ def 
_concatenate_join_units(join_units, concat_axis, copy): # the non-EA values are 2D arrays with shape (1, n) to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] concat_values = concat_compat(to_concat, axis=0) - if not isinstance(concat_values, ExtensionArray): + if not isinstance(concat_values, ExtensionArray) or ( + isinstance(concat_values, DatetimeArray) and concat_values.tz is None + ): # if the result of concat is not an EA but an ndarray, reshape to # 2D to put it a non-EA Block + # special case DatetimeArray, which *is* an EA, but is put in a + # consolidated 2D block concat_values = np.atleast_2d(concat_values) else: concat_values = concat_compat(to_concat, axis=concat_axis,) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 0159fabd04d59..53d3a35328b68 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1110,6 +1110,23 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): result = df.append([s, s], ignore_index=True) tm.assert_frame_equal(result, expected) + def test_append_empty_tz_frame_with_datetime64ns(self): + # https://github.com/pandas-dev/pandas/issues/35460 + df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + + # pd.NaT gets inferred as tz-naive, so append result is tz-naive + result = df.append({"a": pd.NaT}, ignore_index=True) + expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) + + # also test with typed value to append + df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + result = df.append( + pd.Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True + ) + expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) + class TestConcatenate: def test_concat_copy(self): From a8ef12bb4aab4c02c70feabe2b2a672121a31c67 Mon Sep 17 00:00:00 2001 From: Thomas Dickson Date: Sun, 6 Sep 2020 
19:24:30 +0100 Subject: [PATCH 70/83] BUG GH31355 fixed by adding more relevant error message --- pandas/core/reshape/util.py | 5 +++++ pandas/tests/reshape/test_util.py | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index a1bf3f8ee4119..fc1eb6d488b91 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -40,6 +40,11 @@ def cartesian_product(X): cumprodX = np.cumproduct(lenX) a = np.roll(cumprodX, 1) + + msg = "Product space too large to allocate arrays!" + if np.any(cumprodX < 0): + raise ValueError(msg) + a[0] = 1 if cumprodX[-1] != 0: diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index 9d074b5ade425..0acadc54cec0c 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -65,3 +65,13 @@ def test_invalid_input(self, X): with pytest.raises(TypeError, match=msg): cartesian_product(X=X) + + def test_exceed_product_space(self): + # GH31355: raise useful error when product space is too large + msg = "Product space too large to allocate arrays!" + + with pytest.raises(ValueError, match=msg): + dims = [np.arange(0, 22, dtype=np.int16) for i in range(12)] + [ + (np.arange(15128, dtype=np.int16)), + ] + cartesian_product(X=dims) From b3dca88d31d0f463932713bab92a0953f4adf683 Mon Sep 17 00:00:00 2001 From: Thomas Dickson Date: Sun, 6 Sep 2020 19:46:36 +0100 Subject: [PATCH 71/83] formatted code --- pandas/core/reshape/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index fc1eb6d488b91..eac28e79ff897 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -40,11 +40,11 @@ def cartesian_product(X): cumprodX = np.cumproduct(lenX) a = np.roll(cumprodX, 1) - + msg = "Product space too large to allocate arrays!"
if np.any(cumprodX < 0): raise ValueError(msg) - + a[0] = 1 if cumprodX[-1] != 0: From a71596491cb630a00c2da3dd961d26d957b0ab69 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 7 Sep 2020 12:06:14 -0700 Subject: [PATCH 72/83] Backport PR #35882: BUG: item_cache invalidation in get_numeric_data (#36188) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/internals/managers.py | 1 - pandas/tests/frame/methods/test_cov_corr.py | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index a0e57eb1729e1..8a4574af8f6a9 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -40,6 +40,7 @@ Bug fixes - Bug in :class:`Series` constructor incorrectly raising a ``TypeError`` when passed an ordered set (:issue:`36044`) - Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) +- Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c4a866edba490..3f3f0c68cb1ed 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -732,7 +732,6 @@ def get_numeric_data(self, copy: bool = False) -> "BlockManager": copy : bool, default False Whether to copy the blocks """ - self._consolidate_inplace() return self._combine([b for b in self.blocks if b.is_numeric], copy) def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index d3548b639572d..f307acd8c2178 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -191,6 +191,23 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) + def test_corr_item_cache(self): + # Check that corr does not lead to incorrect entries in item_cache + + df = pd.DataFrame({"A": range(10)}) + df["B"] = range(10)[::-1] + + ser = df["A"] # populate item_cache + assert len(df._mgr.blocks) == 2 + + _ = df.corr() + + # Check that the corr didn't break link between ser and df + ser.values[0] = 99 + assert df.loc[0, "A"] == 99 + assert df["A"] is ser + assert df.values[0, 0] == 99 + class TestDataFrameCorrWith: def test_corrwith(self, datetime_frame): From 89057f458c189f361f1d8107dbebbb1fd51a04ee Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 7 Sep 2020 13:35:06 -0700 Subject: [PATCH 73/83] Backport PR #36191: TST: update test_series_factorize_na_sentinel_none for 32bit (#36200) Co-authored-by: Simon Hawkins --- pandas/tests/base/test_factorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py index 9fad9856d53cc..f8cbadb987d29 100644 --- a/pandas/tests/base/test_factorize.py +++ b/pandas/tests/base/test_factorize.py @@ -34,7 +34,7 @@ def test_series_factorize_na_sentinel_none(): ser = pd.Series(values) codes, uniques = ser.factorize(na_sentinel=None) - expected_codes = np.array([0, 1, 0, 2], dtype="int64") + expected_codes = np.array([0, 1, 0, 2], dtype=np.intp) expected_uniques = pd.Index([1.0, 2.0, np.nan]) tm.assert_numpy_array_equal(codes, expected_codes) From bb5cf8ed84635544a844f38d852b1aea10b3c92b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 7 Sep 2020 21:35:18 +0100 Subject: [PATCH 74/83] =?UTF-8?q?BUG:=20shows=20correct=20package=20name?= =?UTF-8?q?=20when=20import=5Foptional=5Fdependency=20is=20ca=E2=80=A6=20(?= =?UTF-8?q?#36134)=20(#36199)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Harsh Sharma <51477130+hs2361@users.noreply.github.com> --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/compat/_optional.py | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 8a4574af8f6a9..e9cba3de56920 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -41,6 +41,7 @@ Bug fixes - Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) - Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) +- Bug in :meth:`import_optional_dependency` returning incorrect package names in cases where package name is different from import name (:issue:`35948`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6423064732def..9a5e54dd31171 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -33,6 +33,19 @@ "numba": "0.46.0", } +# A mapping from import name to package name (on PyPI) for packages where +# these two names are different. + +INSTALL_MAPPING = { + "bs4": "beautifulsoup4", + "bottleneck": "Bottleneck", + "lxml.etree": "lxml", + "odf": "odfpy", + "pandas_gbq": "pandas-gbq", + "sqlalchemy": "SQLAlchemy", + "jinja2": "Jinja2", +} + def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) @@ -82,9 +95,13 @@ def import_optional_dependency( is False, or when the package's version is too old and `on_version` is ``'warn'``. """ + + package_name = INSTALL_MAPPING.get(name) + install_name = package_name if package_name is not None else name + msg = ( - f"Missing optional dependency '{name}'. {extra} " - f"Use pip or conda to install {name}." + f"Missing optional dependency '{install_name}'. {extra} " + f"Use pip or conda to install {install_name}." 
) try: module = importlib.import_module(name) From c89924776a175c965edad3e5edb2e2a81edb4444 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 7 Sep 2020 15:10:02 -0700 Subject: [PATCH 75/83] Backport PR #36152: Fix compressed multiindex for output of groupby.rolling (#36203) Co-authored-by: patrick <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/window/rolling.py | 10 +++++----- pandas/tests/window/test_grouper.py | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index e9cba3de56920..28ce49c11b3f0 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) +- Fixed regression in :meth:`Series.groupby.rolling` number of levels of :class:`MultiIndex` in input was compressed to one (:issue:`36018`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index d727881f8285a..0c940e5d1b82d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2197,17 +2197,17 @@ def _apply( # Compose MultiIndex result from grouping levels then rolling level # Aggregate the MultiIndex data as tuples then the level names grouped_object_index = self.obj.index - grouped_index_name = [grouped_object_index.name] + grouped_index_name = [*grouped_object_index.names] groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] result_index_names = groupby_keys + grouped_index_name result_index_data = [] for key, values in self._groupby.grouper.indices.items(): for value in values: - if not is_list_like(key): - data = [key, grouped_object_index[value]] - else: - data = [*key, grouped_object_index[value]] + data = [ + *com.maybe_make_list(key), + *com.maybe_make_list(grouped_object_index[value]), + ] result_index_data.append(tuple(data)) result_index = MultiIndex.from_tuples( diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index d0a62374d0888..a46a72bde8d6d 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -383,3 +383,24 @@ def test_groupby_subset_rolling_subset_with_closed(self): name="column1", ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", ["max", "min"]) + def test_groupby_rolling_index_changed(self, func): + # GH: #36018 nlevels of MultiIndex changed + ds = Series( + [1, 2, 2], + index=pd.MultiIndex.from_tuples( + [("a", "x"), ("a", "y"), ("c", "z")], names=["1", "2"] + ), + name="a", + ) + + result = getattr(ds.groupby(ds).rolling(2), func)() + expected = Series( + [np.nan, np.nan, 2.0], + index=pd.MultiIndex.from_tuples( + [(1, "a", "x"), (2, "a", "y"), (2, "c", "z")], names=["a", "1", "2"] + ), + name="a", + ) + 
tm.assert_series_equal(result, expected) From 9bb20344ed8c7a61f0d59925656cf4ecf6e0b087 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 8 Sep 2020 03:49:22 -0700 Subject: [PATCH 76/83] Backport PR #36208: BUG: GroupbyRolling with an empty frame (#36213) Co-authored-by: Matthew Roeschke --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/core/window/rolling.py | 10 ++++++---- pandas/tests/window/test_grouper.py | 12 ++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 28ce49c11b3f0..f13d38d1f8f76 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -24,7 +24,7 @@ Fixed regressions - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) - Fixed regression in :meth:`Series.groupby.rolling` number of levels of :class:`MultiIndex` in input was compressed to one (:issue:`36018`) -- +- Fixed regression in :class:`DataFrameGroupBy` on an empty :class:`DataFrame` (:issue:`36197`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 0c940e5d1b82d..f7bcd1e795fd3 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2226,10 +2226,12 @@ def _create_blocks(self, obj: FrameOrSeries): """ # Ensure the object we're rolling over is monotonically sorted relative # to the groups - groupby_order = np.concatenate( - list(self._groupby.grouper.indices.values()) - ).astype(np.int64) - obj = obj.take(groupby_order) + # GH 36197 + if not obj.empty: + groupby_order = np.concatenate( + list(self._groupby.grouper.indices.values()) + ).astype(np.int64) + obj = obj.take(groupby_order) return super()._create_blocks(obj) def _get_cython_func_type(self, func: str) -> Callable: diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index a46a72bde8d6d..97f3e50edf211 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -404,3 +404,15 @@ def test_groupby_rolling_index_changed(self, func): name="a", ) tm.assert_series_equal(result, expected) + + def test_groupby_rolling_empty_frame(self): + # GH 36197 + expected = pd.DataFrame({"s1": []}) + result = expected.groupby("s1").rolling(window=1).sum() + expected.index = pd.MultiIndex.from_tuples([], names=["s1", None]) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({"s1": [], "s2": []}) + result = expected.groupby(["s1", "s2"]).rolling(window=1).sum() + expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None]) + tm.assert_frame_equal(result, expected) From d8da3200a922a5066bb60d9f43f8f822cace5972 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 8 Sep 2020 04:05:51 -0700 Subject: [PATCH 77/83] Backport PR #36205: DOC: doc fix (#36215) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.1.2.rst | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index f13d38d1f8f76..0e4a88f3ee56b 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -51,7 +51,7 @@ Bug fixes Other ~~~~~ - :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize` (:issue:`35667`) -- :meth:`DataFrame.plot` and meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and FixedLocator (:issue:`35684` and :issue:`35945`) +- :meth:`DataFrame.plot` and :meth:`Series.plot` raise ``UserWarning`` about usage of ``FixedFormatter`` and ``FixedLocator`` (:issue:`35684` and :issue:`35945`) .. --------------------------------------------------------------------------- From e6fd1b6564b6f08fe08f19ddb1d8b58fed08d83a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 8 Sep 2020 05:34:56 -0700 Subject: [PATCH 78/83] Backport PR #36182: DOC: release date for 1.1.2 (#36217) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.1.2.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 0e4a88f3ee56b..a214ad9762733 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_112: -What's new in 1.1.2 (??) ------------------------- +What's new in 1.1.2 (September 8, 2020) +--------------------------------------- These are the changes in pandas 1.1.2. See :ref:`release` for a full changelog including other versions of pandas. 
From 2afdb3da2591342af5d0c13adee7564d5eb8800b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 8 Sep 2020 15:20:26 +0100 Subject: [PATCH 79/83] TST: skip test_groupby_rolling_index_changed on 32bit (#36220) --- pandas/tests/window/test_grouper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 97f3e50edf211..806c22c60b48f 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -384,6 +384,7 @@ def test_groupby_subset_rolling_subset_with_closed(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") @pytest.mark.parametrize("func", ["max", "min"]) def test_groupby_rolling_index_changed(self, func): # GH: #36018 nlevels of MultiIndex changed From ee6e91c154ee33d8a23e86df4450d1ad1585715c Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 8 Sep 2020 07:55:07 -0700 Subject: [PATCH 80/83] Backport PR #36141: BUG: copying series into empty dataframe does not preserve dataframe index name (#36221) Co-authored-by: Irv Lustig --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/frame.py | 8 +++++--- pandas/tests/indexing/test_partial.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index a214ad9762733..c6a08f4fb852a 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -43,6 +43,7 @@ Bug fixes - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) - Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) - Bug in :meth:`import_optional_dependency` returning incorrect package names in cases where package name is different from import name (:issue:`35948`) +- Bug 
when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`31368`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 041121d60ad33..9af9c19392ef7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3170,9 +3170,11 @@ def _ensure_valid_index(self, value): "and a value that cannot be converted to a Series" ) from err - self._mgr = self._mgr.reindex_axis( - value.index.copy(), axis=1, fill_value=np.nan - ) + # GH31368 preserve name of index + index_copy = value.index.copy() + index_copy.name = self.index.name + + self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) def _box_col_values(self, values, loc: int) -> Series: """ diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 350f86b4e9fd0..7afbbc2b9ab2b 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -660,3 +660,15 @@ def test_indexing_timeseries_regression(self): expected = Series(rng, index=rng) tm.assert_series_equal(result, expected) + + def test_index_name_empty(self): + # GH 31368 + df = pd.DataFrame({}, index=pd.RangeIndex(0, name="df_index")) + series = pd.Series(1.23, index=pd.RangeIndex(4, name="series_index")) + + df["series"] = series + expected = pd.DataFrame( + {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index") + ) + + tm.assert_frame_equal(df, expected) From 2a7d3326dee660824a8433ffd01065f8ac37f7d6 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Tue, 8 Sep 2020 16:28:24 +0000 Subject: [PATCH 81/83] RLS: 1.1.2 From f757f62df93f69c53e412f362a12a1b67f27fd77 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 9 Sep 2020 02:28:46 -0700 Subject: [PATCH 82/83] Backport PR #36183: DOC: Start 1.1.3 (#36242) Co-authored-by: Simon Hawkins --- 
doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.1.2.rst | 2 +- doc/source/whatsnew/v1.1.3.rst | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v1.1.3.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 1b5e63dfcf359..33c0750c1dc16 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.3 v1.1.2 v1.1.1 v1.1.0 diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index c6a08f4fb852a..81b8e7df11625 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -61,4 +61,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.1..v1.1.2|HEAD +.. contributors:: v1.1.1..v1.1.2 diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst new file mode 100644 index 0000000000000..e3161012da5d1 --- /dev/null +++ b/doc/source/whatsnew/v1.1.3.rst @@ -0,0 +1,42 @@ +.. _whatsnew_113: + +What's new in 1.1.3 (??) +------------------------ + +These are the changes in pandas 1.1.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_113.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_113.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_113.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_113.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v1.1.2..v1.1.3|HEAD From 7a66dbbf8c68434e720b26e5fd5e1e6806da1483 Mon Sep 17 00:00:00 2001 From: Thomas Dickson Date: Sun, 13 Sep 2020 12:50:25 +0100 Subject: [PATCH 83/83] alter position of error message being raised --- pandas/core/reshape/util.py | 4 ++++ pandas/tests/reshape/test_util.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index 6949270317f7c..a80cb795a1c40 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -39,6 +39,10 @@ def cartesian_product(X): lenX = np.fromiter((len(x) for x in X), dtype=np.intp) cumprodX = np.cumproduct(lenX) + msg = "Product space too large to allocate arrays!" + if np.any(cumprodX < 0): + raise ValueError(msg) + a = np.roll(cumprodX, 1) a[0] = 1 diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index 9d074b5ade425..0acadc54cec0c 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -65,3 +65,13 @@ def test_invalid_input(self, X): with pytest.raises(TypeError, match=msg): cartesian_product(X=X) + + def test_exceed_product_space(self): + # GH31355: raise useful error when product space is too large + msg = "Product space too large to allocate arrays!" + + with pytest.raises(ValueError, match=msg): + dims = [np.arange(0, 22, dtype=np.int16) for i in range(12)] + [ + (np.arange(15128, dtype=np.int16)), + ] + cartesian_product(X=dims)