From 9aa2645ec31fe59d9efcd4c384bc2767f3fd4c97 Mon Sep 17 00:00:00 2001 From: clement Date: Thu, 9 Apr 2020 23:16:28 +0100 Subject: [PATCH 1/3] Modify heading capitalizations of files in doc/source/whatsnew (part4) Add exceptions to the list in 'scripts/validate_rst_title_capitalization.py' --- doc/source/whatsnew/v0.14.0.rst | 8 ++++---- doc/source/whatsnew/v0.14.1.rst | 4 ++-- doc/source/whatsnew/v0.15.0.rst | 12 ++++++------ doc/source/whatsnew/v0.15.1.rst | 4 ++-- doc/source/whatsnew/v0.15.2.rst | 4 ++-- doc/source/whatsnew/v0.16.0.rst | 6 +++--- doc/source/whatsnew/v0.16.1.rst | 4 ++-- doc/source/whatsnew/v0.16.2.rst | 4 ++-- doc/source/whatsnew/v0.17.0.rst | 12 ++++++------ doc/source/whatsnew/v0.17.1.rst | 4 ++-- scripts/validate_rst_title_capitalization.py | 10 ++++++++++ 11 files changed, 41 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 25a75492d78fb..e8dc153d2b92b 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0140: -v0.14.0 (May 31 , 2014) ------------------------ +Version 0.14.0 (May 31 , 2014) +------------------------------ {{ header }} @@ -321,7 +321,7 @@ Text parsing API changes .. _whatsnew_0140.groupby: -Groupby API changes +GroupBy API changes ~~~~~~~~~~~~~~~~~~~ More consistent behavior for some groupby methods: @@ -904,7 +904,7 @@ There are no experimental changes in 0.14.0 .. _whatsnew_0140.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in Series ValueError when index doesn't match data (:issue:`6532`) diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 26018c5745a11..3dfc4272681df 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0141: -v0.14.1 (July 11, 2014) ------------------------ +Version 0.14.1 (July 11, 2014) +------------------------------ {{ header }} diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index 95e354e425143..b80ed7446f805 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0150: -v0.15.0 (October 18, 2014) --------------------------- +Version 0.15.0 (October 18, 2014) +--------------------------------- {{ header }} @@ -105,7 +105,7 @@ For full docs, see the :ref:`categorical introduction ` and the .. _whatsnew_0150.timedeltaindex: -TimedeltaIndex/Scalar +TimedeltaIndex/scalar ^^^^^^^^^^^^^^^^^^^^^ We introduce a new scalar type ``Timedelta``, which is a subclass of ``datetime.timedelta``, and behaves in a similar manner, @@ -247,8 +247,8 @@ Additionally :meth:`~pandas.DataFrame.memory_usage` is an available method for a .. _whatsnew_0150.dt: -.dt accessor -^^^^^^^^^^^^ +Series.dt accessor +^^^^^^^^^^^^^^^^^^ ``Series`` has gained an accessor to succinctly return datetime like properties for the *values* of the Series, if its a datetime/period like Series. (:issue:`7207`) This will return a Series, indexed like the existing Series. See the :ref:`docs ` @@ -600,7 +600,7 @@ Rolling/expanding moments improvements .. _whatsnew_0150.sql: -Improvements in the sql io module +Improvements in the SQL IO module ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added support for a ``chunksize`` parameter to ``to_sql`` function. This allows DataFrame to be written in chunks and avoid packet-size overflow errors (:issue:`8062`). 
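As context for the ``chunksize`` bullet that closes the hunk above, here is a minimal sketch of a chunked write, assuming SQLAlchemy is installed (the in-memory engine URL and the table name ``demo`` are made up for the example):

.. code-block:: python

    import numpy as np
    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine("sqlite://")  # illustrative in-memory database
    df = pd.DataFrame({"a": np.arange(10_000), "b": np.random.randn(10_000)})

    # Writing 1,000 rows per INSERT instead of one giant statement is what
    # avoids the packet-size overflow errors mentioned in the bullet above.
    df.to_sql("demo", engine, chunksize=1000, index=False)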
diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index 2e036267b5804..f9c17058dc3ee 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0151: -v0.15.1 (November 9, 2014) --------------------------- +Version 0.15.1 (November 9, 2014) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index 292351c709940..a4eabb97471de 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_0152: -v0.15.2 (December 12, 2014) ---------------------------- +Version 0.15.2 (December 12, 2014) +---------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index 855d0b8695bb1..4ad533e68e275 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0160: -v0.16.0 (March 22, 2015) ------------------------- +Version 0.16.0 (March 22, 2015) +------------------------------- {{ header }} @@ -218,7 +218,7 @@ Backwards incompatible API changes .. _whatsnew_0160.api_breaking.timedelta: -Changes in Timedelta +Changes in timedelta ^^^^^^^^^^^^^^^^^^^^ In v0.15.0 a new scalar type ``Timedelta`` was introduced, that is a diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index 502c1287efdbe..8dcac4c1044be 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0161: -v0.16.1 (May 11, 2015) ----------------------- +Version 0.16.1 (May 11, 2015) +----------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index 543f9c6bbf300..a3c34db09f555 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_0162: -v0.16.2 (June 12, 2015) ------------------------ +Version 0.16.2 (June 12, 2015) +------------------------------ {{ header }} diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index 67abad659dc8d..11c252192be6b 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0170: -v0.17.0 (October 9, 2015) -------------------------- +Version 0.17.0 (October 9, 2015) +-------------------------------- {{ header }} @@ -181,8 +181,8 @@ Each method signature only includes relevant arguments. Currently, these are lim Additional methods for ``dt`` accessor ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -strftime -"""""""" +Series.dt.strftime +"""""""""""""""""" We are now supporting a ``Series.dt.strftime`` method for datetime-likes to generate a formatted string (:issue:`10110`). Examples: @@ -202,8 +202,8 @@ We are now supporting a ``Series.dt.strftime`` method for datetime-likes to gene The string format is as the python standard library and details can be found `here `_ -total_seconds -""""""""""""" +Series.dt.total_seconds +""""""""""""""""""""""" ``pd.Series`` of type ``timedelta64`` has new method ``.dt.total_seconds()`` returning the duration of the timedelta in seconds (:issue:`10817`) diff --git a/doc/source/whatsnew/v0.17.1.rst b/doc/source/whatsnew/v0.17.1.rst index 55080240f2a55..5d15a01aee5a0 100644 --- a/doc/source/whatsnew/v0.17.1.rst +++ b/doc/source/whatsnew/v0.17.1.rst @@ -1,7 +1,7 @@ .. 
_whatsnew_0171: -v0.17.1 (November 21, 2015) ---------------------------- +Version 0.17.1 (November 21, 2015) +---------------------------------- {{ header }} diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 3d19e37ac7a1d..5e62a98de548d 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -42,6 +42,7 @@ "Arrow", "Parquet", "MultiIndex", + "MultiIndexing", "NumFOCUS", "sklearn", "Docker", @@ -58,6 +59,7 @@ "DatetimeIndex", "IntervalIndex", "CategoricalIndex", + "Categorical", "GroupBy", "SPSS", "ORC", @@ -99,6 +101,14 @@ "BusinessHour", "BusinessDay", "DateOffset", + "TZ", + "GIL", + "strftime", + "XPORT", + "Unicode", + "East", + "Asian", + "None", } CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS} From aaf4396be7b75b320bfd3f54b8c9e240c05edb98 Mon Sep 17 00:00:00 2001 From: clement Date: Tue, 14 Apr 2020 18:17:07 +0100 Subject: [PATCH 2/3] Merge master, fix merging conflicts --- README.md | 1 + asv_bench/asv.conf.json | 2 +- asv_bench/benchmarks/array.py | 18 + ci/code_checks.sh | 8 +- ci/deps/azure-36-minimum_versions.yaml | 2 +- ci/deps/azure-37-numpydev.yaml | 3 +- ci/deps/azure-macos-36.yaml | 2 +- doc/source/conf.py | 1 + doc/source/getting_started/install.rst | 2 +- doc/source/user_guide/io.rst | 31 +- doc/source/user_guide/timeseries.rst | 9 + doc/source/whatsnew/index.rst | 2 +- doc/source/whatsnew/v0.10.0.rst | 6 +- doc/source/whatsnew/v0.10.1.rst | 4 +- doc/source/whatsnew/v0.11.0.rst | 4 +- doc/source/whatsnew/v0.12.0.rst | 8 +- doc/source/whatsnew/v0.13.0.rst | 4 +- doc/source/whatsnew/v0.13.1.rst | 4 +- doc/source/whatsnew/v0.14.0.rst | 2 +- doc/source/whatsnew/v0.18.0.rst | 2 +- doc/source/whatsnew/v0.18.1.rst | 2 +- doc/source/whatsnew/v0.19.0.rst | 6 +- doc/source/whatsnew/v0.20.0.rst | 6 +- doc/source/whatsnew/v0.24.0.rst | 4 +- doc/source/whatsnew/v0.25.0.rst | 4 +- doc/source/whatsnew/v0.4.x.rst | 4 +- doc/source/whatsnew/v0.5.0.rst | 4 +- doc/source/whatsnew/v0.6.0.rst | 4 +- doc/source/whatsnew/v0.6.1.rst | 4 +- doc/source/whatsnew/v0.7.0.rst | 4 +- doc/source/whatsnew/v0.7.1.rst | 4 +- doc/source/whatsnew/v0.7.2.rst | 4 +- doc/source/whatsnew/v0.7.3.rst | 6 +- doc/source/whatsnew/v0.8.0.rst | 6 +- doc/source/whatsnew/v0.8.1.rst | 4 +- doc/source/whatsnew/v0.9.0.rst | 4 +- doc/source/whatsnew/v0.9.1.rst | 4 +- doc/source/whatsnew/v1.0.0.rst | 8 +- doc/source/whatsnew/v1.1.0.rst | 48 +- environment.yml | 2 +- pandas/_config/config.py | 18 +- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- pandas/_libs/lib.pyx | 64 +- pandas/_libs/reshape.pyx | 2 +- pandas/_libs/tslibs/__init__.py | 19 +- pandas/_libs/tslibs/c_timestamp.pyx | 12 + pandas/_libs/tslibs/ccalendar.pxd | 2 + pandas/_libs/tslibs/ccalendar.pyx | 54 +- pandas/_libs/tslibs/fields.pyx | 43 +- pandas/_libs/tslibs/timedeltas.pyx | 52 +- pandas/_typing.py | 4 + pandas/compat/__init__.py | 4 +- pandas/core/array_algos/masked_reductions.py | 48 +- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/boolean.py | 26 +- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/datetimelike.py | 4 +- pandas/core/arrays/datetimes.py | 53 +- pandas/core/arrays/integer.py | 22 +- pandas/core/arrays/interval.py | 12 +- pandas/core/arrays/masked.py | 11 + pandas/core/arrays/period.py | 4 +- pandas/core/computation/expr.py | 3 +- pandas/core/frame.py | 109 ++- pandas/core/generic.py | 173 ++-- pandas/core/groupby/generic.py | 58 +- pandas/core/groupby/groupby.py | 8 +- 
pandas/core/groupby/grouper.py | 55 +- pandas/core/indexes/accessors.py | 31 + pandas/core/indexes/base.py | 44 +- pandas/core/indexes/datetimes.py | 1 + pandas/core/indexes/multi.py | 22 +- pandas/core/indexing.py | 7 +- pandas/core/internals/__init__.py | 2 - pandas/core/internals/blocks.py | 107 +-- pandas/core/internals/managers.py | 133 +-- pandas/core/ops/__init__.py | 46 +- pandas/core/ops/docstrings.py | 19 +- pandas/core/resample.py | 4 +- pandas/core/reshape/concat.py | 12 +- pandas/core/reshape/reshape.py | 45 +- pandas/core/series.py | 91 +- pandas/core/strings.py | 2 +- pandas/io/common.py | 27 +- pandas/io/feather_format.py | 9 +- pandas/io/json/_normalize.py | 4 +- pandas/io/pytables.py | 4 +- pandas/io/stata.py | 5 +- pandas/plotting/_matplotlib/core.py | 6 +- pandas/plotting/_matplotlib/hist.py | 13 +- pandas/tests/arithmetic/test_datetime64.py | 2 +- pandas/tests/arrays/boolean/test_reduction.py | 5 +- .../arrays/categorical/test_analytics.py | 8 + .../tests/arrays/integer/test_construction.py | 2 +- pandas/tests/arrays/integer/test_dtypes.py | 2 +- pandas/tests/arrays/interval/test_interval.py | 13 +- pandas/tests/arrays/test_datetimelike.py | 17 +- pandas/tests/arrays/test_period.py | 1 + pandas/tests/extension/base/casting.py | 14 +- pandas/tests/extension/base/getitem.py | 28 + pandas/tests/extension/base/reshaping.py | 8 + pandas/tests/extension/test_integer.py | 5 +- pandas/tests/extension/test_numpy.py | 25 +- pandas/tests/extension/test_sparse.py | 22 +- pandas/tests/frame/methods/test_diff.py | 64 +- pandas/tests/frame/test_analytics.py | 53 -- pandas/tests/frame/test_query_eval.py | 5 + pandas/tests/generic/methods/test_dot.py | 128 +++ pandas/tests/generic/test_finalize.py | 782 ++++++++++++++++++ pandas/tests/groupby/test_apply.py | 21 + pandas/tests/groupby/test_apply_mutate.py | 70 ++ pandas/tests/groupby/test_categorical.py | 12 + pandas/tests/groupby/test_groupby.py | 45 - .../tests/indexes/datetimes/test_to_period.py | 24 + pandas/tests/indexes/multi/test_indexing.py | 4 - .../tests/indexes/ranges/test_constructors.py | 2 +- .../tests/indexing/multiindex/test_getitem.py | 4 +- .../tests/indexing/multiindex/test_partial.py | 31 +- .../tests/indexing/multiindex/test_setitem.py | 7 +- .../tests/io/json/test_json_table_schema.py | 18 + pandas/tests/io/test_compression.py | 41 + pandas/tests/io/test_feather.py | 21 +- pandas/tests/io/test_html.py | 8 +- pandas/tests/plotting/test_frame.py | 33 + pandas/tests/reductions/test_reductions.py | 24 + .../tests/scalar/timedelta/test_arithmetic.py | 23 +- .../scalar/timestamp/test_comparisons.py | 52 ++ .../series/methods/test_convert_dtypes.py | 9 +- pandas/tests/series/methods/test_replace.py | 17 + pandas/tests/series/test_analytics.py | 32 - pandas/tests/series/test_datetime_values.py | 21 +- pandas/tests/test_strings.py | 9 + pandas/tests/tslibs/test_ccalendar.py | 25 +- pandas/util/_decorators.py | 21 +- requirements-dev.txt | 2 +- scripts/validate_rst_title_capitalization.py | 13 + scripts/validate_unwanted_patterns.py | 16 +- setup.cfg | 2 + setup.py | 4 +- web/pandas/about/citing.md | 46 +- 140 files changed, 2713 insertions(+), 877 deletions(-) create mode 100644 pandas/tests/generic/methods/test_dot.py create mode 100644 pandas/tests/generic/test_finalize.py create mode 100644 pandas/tests/groupby/test_apply_mutate.py diff --git a/README.md b/README.md index d66a5bc4a7ef1..33dfbf10ff743 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ # pandas: powerful Python data analysis toolkit [![PyPI Latest 
Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/) [![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) [![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/) [![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/master/LICENSE) [![Travis Build Status](https://travis-ci.org/pandas-dev/pandas.svg?branch=master)](https://travis-ci.org/pandas-dev/pandas) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 7886b63e9983e..7c10a2d17775a 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -39,7 +39,7 @@ // followed by the pip installed packages). "matrix": { "numpy": [], - "Cython": [], + "Cython": ["0.29.16"], "matplotlib": [], "sqlalchemy": [], "scipy": [], diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index 8cbf8c8592661..103df0fd94847 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -9,6 +9,11 @@ def setup(self): self.values_float = np.array([1.0, 0.0, 1.0, 0.0]) self.values_integer = np.array([1, 0, 1, 0]) self.values_integer_like = [1, 0, 1, 0] + self.data = np.array([True, False, True, False]) + self.mask = np.array([False, False, True, False]) + + def time_constructor(self): + pd.arrays.BooleanArray(self.data, self.mask) def time_from_bool_array(self): pd.array(self.values_bool, dtype="boolean") @@ -21,3 +26,16 @@ def time_from_integer_like(self): def time_from_float_array(self): pd.array(self.values_float, dtype="boolean") + + +class IntegerArray: + def setup(self): + self.values_integer = np.array([1, 0, 1, 0]) + self.data = np.array([1, 2, 3, 4], dtype="int64") + self.mask = np.array([False, False, True, False]) + + def time_constructor(self): + pd.arrays.IntegerArray(self.data, self.mask) + + def time_from_integer_array(self): + pd.array(self.values_integer, dtype="Int64") diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8901efad56f79..45b7db74fa409 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -292,10 +292,6 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pytest -q --doctest-modules pandas/core/generic.py RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests groupby.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/groupby/groupby.py -k"-cumcount -describe -pipe" - RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests series.py' ; echo $MSG pytest -q --doctest-modules pandas/core/series.py RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -318,6 +314,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then pytest -q --doctest-modules pandas/core/dtypes/ RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Doctests groupby' ; echo $MSG + pytest -q --doctest-modules pandas/core/groupby/ + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Doctests indexes' ; echo $MSG pytest -q --doctest-modules pandas/core/indexes/ RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index 0e0ebe5c75218..e553330b962a2 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -22,7 +22,7 @@ dependencies: - numpy=1.13.3 - openpyxl=2.5.7 - pytables=3.4.2 - - python-dateutil=2.6.1 + - python-dateutil=2.7.3 - pytz=2017.2 - scipy=0.19.0 - xlrd=1.1.0 diff --git 
a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 29ebfe2639e32..17c3d318ce54d 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -14,7 +14,8 @@ dependencies: - pytz - pip - pip: - - cython>=0.29.16 + - cython==0.29.16 + # GH#33507 cython 3.0a1 is causing TypeErrors 2020-04-13 - "git+git://github.com/dateutil/dateutil.git" - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com" - "--pre" diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 279f44b06bd02..93885afbc4114 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -23,7 +23,7 @@ dependencies: - openpyxl - pyarrow>=0.13.0 - pytables - - python-dateutil==2.6.1 + - python-dateutil==2.7.3 - pytz - xarray - xlrd diff --git a/doc/source/conf.py b/doc/source/conf.py index d24483abd28e1..d2404b757ca11 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -416,6 +416,7 @@ "python": ("https://docs.python.org/3/", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), "statsmodels": ("https://www.statsmodels.org/devel/", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), } # extlinks alias diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index bc1be527696a5..7fa2233e79fc0 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -221,7 +221,7 @@ Package Minimum support ================================================================ ========================== `setuptools `__ 24.2.0 `NumPy `__ 1.13.3 -`python-dateutil `__ 2.6.1 +`python-dateutil `__ 2.7.3 `pytz `__ 2017.2 ================================================================ ========================== diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d721e00a0a0b6..df6b44ac654ce 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -285,14 +285,18 @@ chunksize : int, default ``None`` Quoting, compression, and file format +++++++++++++++++++++++++++++++++++++ -compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'`` +compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'`` For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. - Set to ``None`` for no decompression. + Set to ``None`` for no decompression. Can also be a dict with key ``'method'`` + set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to + compression settings. As an example, the following could be passed for + faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``. .. versionchanged:: 0.24.0 'infer' option added and set to default. + .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` @@ -3347,6 +3351,12 @@ The compression type can be an explicit parameter or be inferred from the file e If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or ``'.xz'``, respectively. +The compression parameter can also be a ``dict`` in order to pass options to the +compression protocol. 
It must have a ``'method'`` key set to the name +of the compression protocol, which must be one of +{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to +the underlying compression library. + .. ipython:: python df = pd.DataFrame({ @@ -3383,6 +3393,15 @@ The default is to 'infer': rt = pd.read_pickle("s1.pkl.bz2") rt +Passing options to the compression protocol in order to speed up compression: + +.. ipython:: python + + df.to_pickle( + "data.pkl.gz", + compression={"method": "gzip", 'compresslevel': 1} + ) + .. ipython:: python :suppress: @@ -4583,17 +4602,15 @@ frames efficient, and to make sharing data across data analysis languages easy. Feather is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas dtypes, including extension dtypes such as categorical and datetime with tz. -Several caveats. +Several caveats: -* This is a newer library, and the format, though stable, is not guaranteed to be backward compatible - to the earlier versions. * The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an error if a non-default one is provided. You can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to ignore it. * Duplicate column names and non-string columns names are not supported -* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message - on an attempt at serialization. +* Actual Python objects in object dtype columns are not supported. These will + raise a helpful error message on an attempt at serialization. See the `Full Documentation `__. diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 0d49a2d8db77c..6ba58310000cb 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -786,6 +786,15 @@ Furthermore, if you have a ``Series`` with datetimelike values, then you can access these properties via the ``.dt`` accessor, as detailed in the section on :ref:`.dt accessors`. +.. versionadded:: 1.1.0 + +You may obtain the year, week and day components of the ISO year from the ISO 8601 standard: + +.. ipython:: python + + idx = pd.date_range(start='2019-12-29', freq='D', periods=4) + idx.to_series().dt.isocalendar() + .. _timeseries.offsets: DateOffset objects diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 50333b54ca903..b5ac96752536e 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -3,7 +3,7 @@ {{ header }} ************* -Release Notes +Release notes ************* This is the list of changes to pandas between each release. For full details, diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 2e0442364b2f3..443250592a4a7 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0100: -v0.10.0 (December 17, 2012) ---------------------------- +Version 0.10.0 (December 17, 2012) +---------------------------------- {{ header }} @@ -490,7 +490,7 @@ Updated PyTables support however, query terms using the prior (undocumented) methodology are unsupported. You must read in the entire file and write it out using the new format to take advantage of the updates. -N dimensional Panels (experimental) +N dimensional panels (experimental) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Adding experimental support for Panel4D and factory functions to create n-dimensional named panels. 
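The compression ``dict`` documented in the ``io.rst`` hunk above is not limited to pickle; a small sketch of a CSV round trip under the same protocol, assuming pandas 1.1+ (the file name ``out.csv.gz`` is illustrative):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": range(5)})

    # Every key other than 'method' is forwarded to the underlying
    # compression library, here gzip (compresslevel=1 trades ratio for speed).
    df.to_csv("out.csv.gz", compression={"method": "gzip", "compresslevel": 1})

    # Reading back can rely on the default compression='infer'
    # picking gzip from the '.gz' suffix.
    roundtrip = pd.read_csv("out.csv.gz", index_col=0)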
diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index c4251f70d85b6..1e9eafd2700e9 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0101: -v0.10.1 (January 22, 2013) ---------------------------- +Version 0.10.1 (January 22, 2013) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index 148ee349b049c..6c13a125a4e54 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0110: -v0.11.0 (April 22, 2013) ------------------------- +Version 0.11.0 (April 22, 2013) +------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 823e177f3e05e..9e864f63c43e0 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0120: -v0.12.0 (July 24, 2013) ------------------------- +Version 0.12.0 (July 24, 2013) +------------------------------ {{ header }} @@ -177,8 +177,8 @@ API changes ``__repr__``). Plus string safety throughout. Now employed in many places throughout the pandas library. (:issue:`4090`, :issue:`4092`) -I/O enhancements -~~~~~~~~~~~~~~~~ +IO enhancements +~~~~~~~~~~~~~~~ - ``pd.read_html()`` can now parse HTML strings, files or urls and return DataFrames, courtesy of @cpcloud. (:issue:`3477`, :issue:`3605`, :issue:`3606`, :issue:`3616`). diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index de5e1986744fe..5a904d6c85c61 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0130: -v0.13.0 (January 3, 2014) ---------------------------- +Version 0.13.0 (January 3, 2014) +-------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 4f9ab761334e7..6fe010be8fb2d 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0131: -v0.13.1 (February 3, 2014) --------------------------- +Version 0.13.1 (February 3, 2014) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index e8dc153d2b92b..9bdb1ecb544b7 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -473,7 +473,7 @@ Some other enhancements to the sql functions include: .. _whatsnew_0140.slicers: -MultiIndexing using slicers +Multiindexing using slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ In 0.14.0 we added a new way to slice MultiIndexed objects. diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index d3f96d4185d65..e371f1d9fe69a 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -1197,7 +1197,7 @@ Performance improvements .. _whatsnew_0180.bug_fixes: -Bug Fixes +Bug fixes ~~~~~~~~~ - Bug in ``GroupBy.size`` when data-frame is empty. (:issue:`11699`) diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index f786ce513f6fe..2c6e8f0e27154 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -380,7 +380,7 @@ New behavior: .. _whatsnew_0181.numpy_compatibility: -numpy function compatibility +NumPy function compatibility ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Compatibility between pandas array-like methods (e.g. 
``sum`` and ``take``) and their ``numpy`` diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 6eb509a258430..7390b80217b2c 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -377,7 +377,7 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci .. _whatsnew_0190.gbq: -Google BigQuery Enhancements +Google BigQuery enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The :func:`read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the `docs `__ for more details (:issue:`13615`). @@ -385,7 +385,7 @@ Google BigQuery Enhancements .. _whatsnew_0190.errstate: -Fine-grained numpy errstate +Fine-grained NumPy errstate ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Previous versions of pandas would permanently silence numpy's ufunc error handling when ``pandas`` was imported. Pandas did this in order to silence the warnings that would arise from using numpy ufuncs on missing data, which are usually represented as ``NaN`` s. Unfortunately, this silenced legitimate warnings arising in non-pandas code in the application. Starting with 0.19.0, pandas will use the ``numpy.errstate`` context manager to silence these warnings in a more fine-grained manner, only around where these operations are actually used in the pandas code base. (:issue:`13109`, :issue:`13145`) @@ -1185,7 +1185,7 @@ the result of calling :func:`read_csv` without the ``chunksize=`` argument .. _whatsnew_0190.sparse: -Sparse Changes +Sparse changes ^^^^^^^^^^^^^^ These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index ceb1c7f27231b..06bbd9679bb4d 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -356,7 +356,7 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you .. _whatsnew_0200.enhancements.style_excel: -Excel output for styled DataFrames +Excel output for styled dataframes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Experimental support has been added to export ``DataFrame.style`` formats to Excel using the ``openpyxl`` engine. (:issue:`15530`) @@ -813,7 +813,7 @@ New behavior: .. _whatsnew_0200.api_breaking.gbq: -Pandas Google BigQuery support has moved +pandas Google BigQuery support has moved ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ pandas has split off Google BigQuery support into a separate package ``pandas-gbq``. You can ``conda install pandas-gbq -c conda-forge`` or @@ -1289,7 +1289,7 @@ A new public ``pandas.plotting`` module has been added that holds plotting funct .. _whatsnew_0200.privacy.development: -Other Development Changes +Other development changes ^^^^^^^^^^^^^^^^^^^^^^^^^ - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index c756bc87e9b89..45399792baecf 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -733,7 +733,7 @@ is the case with :attr:`Period.end_time`, for example .. 
_whatsnew_0240.api_breaking.datetime_unique: -Series.unique for Timezone-Aware Data +Series.unique for timezone-aware data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The return type of :meth:`Series.unique` for datetime with timezone values has changed @@ -1131,7 +1131,7 @@ data is incompatible with a passed ``dtype=`` (:issue:`15832`) .. _whatsnew_0240.api.concat_categorical: -Concatenation Changes +Concatenation changes ^^^^^^^^^^^^^^^^^^^^^ Calling :func:`pandas.concat` on a ``Categorical`` of ints with NA values now diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b18d022349001..44558fd63ba15 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -85,7 +85,7 @@ See :ref:`groupby.aggregate.named` for more. .. _whatsnew_0250.enhancements.multiple_lambdas: -Groupby Aggregation with multiple lambdas +Groupby aggregation with multiple lambdas ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can now provide multiple lambda functions to a list-like aggregation in @@ -1243,7 +1243,7 @@ Sparse - Bug in :func:`numpy.modf` on a :class:`SparseArray`. Now a tuple of :class:`SparseArray` is returned (:issue:`26946`). -Build Changes +Build changes ^^^^^^^^^^^^^ - Fix install error with PyPy on macOS (:issue:`26536`) diff --git a/doc/source/whatsnew/v0.4.x.rst b/doc/source/whatsnew/v0.4.x.rst index 8e41e528f5b75..0ed7bb396674e 100644 --- a/doc/source/whatsnew/v0.4.x.rst +++ b/doc/source/whatsnew/v0.4.x.rst @@ -1,7 +1,7 @@ .. _whatsnew_04x: -v.0.4.1 through v0.4.3 (September 25 - October 9, 2011) -------------------------------------------------------- +Versions 0.4.1 through 0.4.3 (September 25 - October 9, 2011) +------------------------------------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.5.0.rst b/doc/source/whatsnew/v0.5.0.rst index 37c52ac7bb34e..7ccb141260f18 100644 --- a/doc/source/whatsnew/v0.5.0.rst +++ b/doc/source/whatsnew/v0.5.0.rst @@ -1,8 +1,8 @@ .. _whatsnew_050: -v.0.5.0 (October 24, 2011) --------------------------- +Version 0.5.0 (October 24, 2011) +-------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index 973ba897b3234..f984b9ad71b63 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_060: -v.0.6.0 (November 25, 2011) ---------------------------- +Version 0.6.0 (November 25, 2011) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.6.1.rst b/doc/source/whatsnew/v0.6.1.rst index d01757775d694..8eea0a07f1f79 100644 --- a/doc/source/whatsnew/v0.6.1.rst +++ b/doc/source/whatsnew/v0.6.1.rst @@ -1,8 +1,8 @@ .. _whatsnew_061: -v.0.6.1 (December 13, 2011) ---------------------------- +Version 0.6.1 (December 13, 2011) +--------------------------------- New features ~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.7.0.rst b/doc/source/whatsnew/v0.7.0.rst index a63cd37e47dc2..a193b8049e951 100644 --- a/doc/source/whatsnew/v0.7.0.rst +++ b/doc/source/whatsnew/v0.7.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0700: -v.0.7.0 (February 9, 2012) --------------------------- +Version 0.7.0 (February 9, 2012) +-------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.7.1.rst b/doc/source/whatsnew/v0.7.1.rst index 04b548a93c338..7082ef8ed2882 100644 --- a/doc/source/whatsnew/v0.7.1.rst +++ b/doc/source/whatsnew/v0.7.1.rst @@ -1,7 +1,7 @@ .. 
_whatsnew_0701: -v.0.7.1 (February 29, 2012) ---------------------------- +Version 0.7.1 (February 29, 2012) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.7.2.rst b/doc/source/whatsnew/v0.7.2.rst index ad72b081e590c..e10a7b499549b 100644 --- a/doc/source/whatsnew/v0.7.2.rst +++ b/doc/source/whatsnew/v0.7.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_0702: -v.0.7.2 (March 16, 2012) ---------------------------- +Version 0.7.2 (March 16, 2012) +------------------------------ {{ header }} diff --git a/doc/source/whatsnew/v0.7.3.rst b/doc/source/whatsnew/v0.7.3.rst index 020cf3bdc2d59..5ed48c0d8d6d9 100644 --- a/doc/source/whatsnew/v0.7.3.rst +++ b/doc/source/whatsnew/v0.7.3.rst @@ -1,7 +1,7 @@ .. _whatsnew_0703: -v.0.7.3 (April 12, 2012) ------------------------- +Version 0.7.3 (April 12, 2012) +------------------------------ {{ header }} @@ -44,7 +44,7 @@ New features - Add ``kurt`` methods to Series and DataFrame for computing kurtosis -NA Boolean comparison API change +NA boolean comparison API change ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Reverted some changes to how NA values (represented typically as ``NaN`` or diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 072d1bae2a2b9..2a49315cc3b12 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_080: -v0.8.0 (June 29, 2012) ------------------------- +Version 0.8.0 (June 29, 2012) +----------------------------- {{ header }} @@ -42,7 +42,7 @@ Bug fixes to the 0.7.x series for legacy NumPy < 1.6 users will be provided as they arise. There will be no more further development in 0.7.x beyond bug fixes. -Time series changes and improvements +Time Series changes and improvements ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. note:: diff --git a/doc/source/whatsnew/v0.8.1.rst b/doc/source/whatsnew/v0.8.1.rst index 1e6b9746c85a5..a00a57a0a1cdb 100644 --- a/doc/source/whatsnew/v0.8.1.rst +++ b/doc/source/whatsnew/v0.8.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0801: -v0.8.1 (July 22, 2012) ----------------------- +Version 0.8.1 (July 22, 2012) +----------------------------- {{ header }} diff --git a/doc/source/whatsnew/v0.9.0.rst b/doc/source/whatsnew/v0.9.0.rst index 3d9ff3c7a89fd..565b965c116db 100644 --- a/doc/source/whatsnew/v0.9.0.rst +++ b/doc/source/whatsnew/v0.9.0.rst @@ -3,8 +3,8 @@ {{ header }} -v0.9.0 (October 7, 2012) ------------------------- +Version 0.9.0 (October 7, 2012) +------------------------------- This is a major release from 0.8.1 and includes several new features and enhancements along with a large number of bug fixes. New features include diff --git a/doc/source/whatsnew/v0.9.1.rst b/doc/source/whatsnew/v0.9.1.rst index b8932ae2ae522..3b2924d175cdf 100644 --- a/doc/source/whatsnew/v0.9.1.rst +++ b/doc/source/whatsnew/v0.9.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0901: -v0.9.1 (November 14, 2012) --------------------------- +Version 0.9.1 (November 14, 2012) +--------------------------------- {{ header }} diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6597b764581a4..4f0ca97310d85 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -15,7 +15,7 @@ including other versions of pandas. 1.0. -New Deprecation Policy +New deprecation policy ~~~~~~~~~~~~~~~~~~~~~~ Starting with Pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to @@ -61,7 +61,7 @@ the :ref:`custom window rolling documentation ` .. 
_whatsnew_100.to_markdown: -Converting to Markdown +Converting to markdown ^^^^^^^^^^^^^^^^^^^^^^ We've added :meth:`~DataFrame.to_markdown` for creating a markdown table (:issue:`11052`) @@ -746,7 +746,7 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. -Build Changes +Build changes ^^^^^^^^^^^^^ Pandas has added a `pyproject.toml `_ file and will no longer include @@ -778,7 +778,7 @@ Other API changes .. _whatsnew_100.api.documentation: -Documentation Improvements +Documentation improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added new section on :ref:`scale` (:issue:`28315`). diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5c39377899a20..d0e3e5c96dc3a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -88,10 +88,32 @@ Other enhancements - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- +- :class:`Series.dt` and :class:`DatetimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`). +- The :meth:`DataFrame.to_feather` method now supports additional keyword + arguments (e.g. to set the compression) that are added in pyarrow 0.17 + (:issue:`33422`). +- :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_pickle`, + and :meth:`DataFrame.to_json` now support passing a dict of + compression arguments when using the ``gzip`` and ``bz2`` protocols. + This can be used to set a custom compression level, e.g., + ``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})`` + (:issue:`33196`) .. --------------------------------------------------------------------------- +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some minimum supported versions of dependencies were updated (:issue:`29766`, :issue:`29723`). +If installed, we now require: + ++-----------------+-----------------+----------+---------+ +| Package | Minimum Version | Required | Changed | ++=================+=================+==========+=========+ +| python-dateutil | 2.7.3 | X | | ++-----------------+-----------------+----------+---------+ + + Development Changes ^^^^^^^^^^^^^^^^^^^ @@ -356,7 +378,7 @@ Performance improvements sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). -- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`). +- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). ..
--------------------------------------------------------------------------- @@ -375,6 +397,7 @@ Categorical - Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`) - :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`) - Bug where :meth:`Categorical.replace` would replace with ``NaN`` whenever the new value and replacement value were equal (:issue:`33288`) +- Bug where an ordered :class:`Categorical` containing only ``NaN`` values would raise rather than returning ``NaN`` when taking the minimum or maximum (:issue:`33450`) Datetimelike ^^^^^^^^^^^^ @@ -386,7 +409,10 @@ Datetimelike - :class:`Timestamp` raising confusing error message when year, month or day is missing (:issue:`31200`) - Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`) - Bug in :meth:`DatetimeIndex.searchsorted` not accepting a ``list`` or :class:`Series` as its argument (:issue:`32762`) +- Bug where :class:`PeriodIndex` raised when passed a :class:`Series` of strings (:issue:`26109`) - Bug in :class:`Timestamp` arithmetic when adding or subtracting a ``np.ndarray`` with ``timedelta64`` dtype (:issue:`33296`) +- Bug in :meth:`DatetimeIndex.to_period` not inferring the frequency when called with no arguments (:issue:`33358`) + Timedelta ^^^^^^^^^ @@ -395,6 +421,7 @@ Timedelta - Bug in dividing ``np.nan`` or ``None`` by :class:`Timedelta` incorrectly returning ``NaT`` (:issue:`31869`) - Timedeltas now understand ``µs`` as identifier for microsecond (:issue:`32899`) - :class:`Timedelta` string representation now includes nanoseconds, when nanoseconds are non-zero (:issue:`9309`) +- Bug in comparing a :class:`Timedelta` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`) Timezones ^^^^^^^^^ @@ -410,6 +437,8 @@ Numeric - Bug in :meth:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`) - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`) - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`) +- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) +- Conversion ^^^^^^^^^^ @@ -421,12 +450,12 @@ Strings ^^^^^^^ - Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`).
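The Strings entry just above is easier to read with a concrete case; a sketch of the behaviour the fix enables, assuming the nullable ``string`` and ``Int64`` dtypes (pandas >= 1.0):

.. code-block:: python

    import pandas as pd

    s = pd.Series(["1", "2", None], dtype="string")

    # Casting "string" data to a nullable integer dtype keeps the
    # missing entry as <NA> instead of raising.
    print(s.astype("Int64"))
    # 0       1
    # 1       2
    # 2    <NA>
    # dtype: Int64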
-- +- Bug in :meth:`Series.str.cat` returning ``NaN`` output when other had :class:`Index` type (:issue:`33425`) Interval ^^^^^^^^ -- +- Bug in :class:`IntervalArray` incorrectly allowing the underlying data to be changed when setting values (:issue:`32782`) - Indexing @@ -447,12 +476,14 @@ Indexing - Bug in :meth:`DataFrame.lookup` incorrectly raising an ``AttributeError`` when ``frame.index`` or ``frame.columns`` is not unique; this will now raise a ``ValueError`` with a helpful error message (:issue:`33041`) - Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`32831`) - Bug in :meth:`DataFrame.copy` _item_cache not invalidated after copy causes post-copy value updates to not be reflected (:issue:`31784`) +- Bug in ``Series.__getitem__`` with an integer key and a :class:`MultiIndex` with leading integer level failing to raise ``KeyError`` if the key is not present in the first level (:issue:`33355`) +- Bug in :meth:`DataFrame.iloc` when slicing a single-column :class:`DataFrame` with ``ExtensionDtype`` (e.g. ``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`) Missing ^^^^^^^ - - Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). - +- Bug in :meth:`replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ``<NA>`` was raising a ``TypeError``. The method now handles this by ignoring ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`) +- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ``<NA>`` for all ``False`` or all ``True`` values using the nullable boolean dtype and with ``skipna=False`` (:issue:`33253`) MultiIndex ^^^^^^^^^^ @@ -502,8 +533,9 @@ Plotting ^^^^^^^^ - :func:`.plot` for line/bar now accepts color by dictionary (:issue:`8193`). -- +- Bug in :meth:`DataFrame.plot.hist` where weights were not working for multiple columns (:issue:`33173`) - Bug in :meth:`DataFrame.boxplot` and :meth:`DataFrame.plot.boxplot` lost color attributes of ``medianprops``, ``whiskerprops``, ``capprops`` and ``medianprops`` (:issue:`30346`) +- Bug in :meth:`DataFrame.plot.scatter` where, when adding multiple plots with different ``cmap``, colorbars always used the first ``cmap`` (:issue:`33389`) Groupby/resample/rolling @@ -517,6 +549,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.first`, :meth:`SeriesGroupBy.last`, :meth:`SeriesGroupBy.min`, and :meth:`SeriesGroupBy.max` returning floats when applied to nullable Booleans (:issue:`33071`) - Bug in :meth:`DataFrameGroupBy.agg` with dictionary input losing ``ExtensionArray`` dtypes (:issue:`32194`) - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) +- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) Reshaping ^^^^^^^^^ @@ -563,6 +596,7 @@ Other - Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) - Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`).
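To make the :meth:`replace` entry in the Missing section above concrete, a minimal sketch assuming a nullable ``string`` Series (the values are made up):

.. code-block:: python

    import pandas as pd

    s = pd.Series(["a", "b", pd.NA], dtype="string")

    # A dict-valued to_replace used to raise TypeError when the Series
    # contained <NA>; the comparison now simply skips <NA> entries.
    print(s.replace({"a": "x"}))
    # 0       x
    # 1       b
    # 2    <NA>
    # dtype: string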
- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`) +- Getting a missing attribute in a query/eval string raises the correct ``AttributeError`` (:issue:`32408`) - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) - Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) - Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) diff --git a/environment.yml b/environment.yml index c874c5a8f68da..67b2df4dc5a0e 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ dependencies: # required - numpy>=1.15 - python=3 - - python-dateutil>=2.6.1 + - python-dateutil>=2.7.3 - pytz # benchmarks diff --git a/pandas/_config/config.py b/pandas/_config/config.py index df706bf25097e..8955a06187109 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -51,20 +51,11 @@ from collections import namedtuple from contextlib import contextmanager import re -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Optional, - Tuple, - Type, - TypeVar, - cast, -) +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type, cast import warnings +from pandas._typing import F + DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver") RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb") @@ -704,9 +695,6 @@ def pp(name: str, ks: Iterable[str]) -> List[str]: # # helpers -FuncType = Callable[..., Any] -F = TypeVar("F", bound=FuncType) - @contextmanager def config_prefix(prefix): diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index f8f3858b803a5..6e5509a5570e8 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -206,7 +206,7 @@ def duplicated_{{dtype}}({{c_type}}[:] values, object keep='first'): {{if dtype == 'object'}} def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} -def ismember_{{dtype}}({{c_type}}[:] arr, {{c_type}}[:] values): +def ismember_{{dtype}}(const {{c_type}}[:] arr, {{c_type}}[:] values): {{endif}} """ Return boolean of values in arr on an diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 276c2d5198831..bbb4d562b8971 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1,9 +1,6 @@ from collections import abc from decimal import Decimal -from fractions import Fraction -from numbers import Number -import sys import warnings import cython @@ -1001,34 +998,34 @@ cdef inline bint c_is_list_like(object obj, bint allow_sets): _TYPE_MAP = { - 'categorical': 'categorical', - 'category': 'categorical', - 'int8': 'integer', - 'int16': 'integer', - 'int32': 'integer', - 'int64': 'integer', - 'i': 'integer', - 'uint8': 'integer', - 'uint16': 'integer', - 'uint32': 'integer', - 'uint64': 'integer', - 'u': 'integer', - 'float32': 'floating', - 'float64': 'floating', - 'f': 'floating', - 'complex64': 'complex', - 'complex128': 'complex', - 'c': 'complex', - 'string': 'string', - 'S': 'bytes', - 'U': 'string', - 'bool': 'boolean', - 'b': 'boolean', - 'datetime64[ns]': 'datetime64', - 'M': 'datetime64', - 'timedelta64[ns]': 'timedelta64', - 'm': 'timedelta64', - 'interval': 'interval', + "categorical": "categorical", + "category": "categorical", + "int8": "integer", + "int16": "integer", + "int32": "integer", + 
"int64": "integer", + "i": "integer", + "uint8": "integer", + "uint16": "integer", + "uint32": "integer", + "uint64": "integer", + "u": "integer", + "float32": "floating", + "float64": "floating", + "f": "floating", + "complex64": "complex", + "complex128": "complex", + "c": "complex", + "string": "string", + "S": "bytes", + "U": "string", + "bool": "boolean", + "b": "boolean", + "datetime64[ns]": "datetime64", + "M": "datetime64", + "timedelta64[ns]": "timedelta64", + "m": "timedelta64", + "interval": "interval", } # types only exist on certain platform @@ -1173,12 +1170,13 @@ cdef class Seen: or self.nat_) -cdef _try_infer_map(v): +cdef object _try_infer_map(object v): """ If its in our map, just return the dtype. """ cdef: - object attr, val + object val + str attr for attr in ['name', 'kind', 'base']: val = getattr(v.dtype, attr) if val in _TYPE_MAP: diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index e74b5919a4590..aed5e1d612088 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -36,7 +36,7 @@ ctypedef fused reshape_t: @cython.wraparound(False) @cython.boundscheck(False) -def unstack(reshape_t[:, :] values, uint8_t[:] mask, +def unstack(reshape_t[:, :] values, const uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, reshape_t[:, :] new_values, uint8_t[:, :] new_mask): """ diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 8d3b00e4a44b9..4a4e53eaa45fa 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,4 +1,21 @@ -# flake8: noqa +__all__ = [ + "localize_pydatetime", + "normalize_date", + "NaT", + "NaTType", + "iNaT", + "is_null_datetimelike", + "OutOfBoundsDatetime", + "IncompatibleFrequency", + "Period", + "Timedelta", + "delta_to_nanoseconds", + "ints_to_pytimedelta", + "Timestamp", + "tz_convert_single", + "NullFrequencyError", +] + from .conversion import localize_pydatetime, normalize_date from .nattype import NaT, NaTType, iNaT, is_null_datetimelike diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 04fadf220388f..68987030e8b4e 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -114,6 +114,18 @@ cdef class _Timestamp(datetime): return NotImplemented elif is_array(other): # avoid recursion error GH#15183 + if other.dtype.kind == "M": + if self.tz is None: + return PyObject_RichCompare(self.asm8, other, op) + raise TypeError( + "Cannot compare tz-naive and tz-aware timestamps" + ) + if other.dtype.kind == "O": + # Operate element-wise + return np.array( + [PyObject_RichCompare(self, x, op) for x in other], + dtype=bool, + ) return PyObject_RichCompare(np.array([self]), other, op) return PyObject_RichCompare(other, self, reverse_ops[op]) else: diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index 59ecaaaf2266e..68ad1d1e68133 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -2,9 +2,11 @@ from cython cimport Py_ssize_t from numpy cimport int64_t, int32_t +ctypedef (int32_t, int32_t, int32_t) iso_calendar_t cdef int dayofweek(int y, int m, int d) nogil cdef bint is_leapyear(int64_t year) nogil cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil cpdef int32_t get_week_of_year(int year, int month, int day) nogil +cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil cpdef int32_t get_day_of_year(int year, int month, int day) nogil diff --git 
a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 0588dfe20e2e2..0873084d29555 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -150,33 +150,65 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: ------- week_of_year : int32_t + Notes + ----- + Assumes the inputs describe a valid date. + """ + return get_iso_calendar(year, month, day)[1] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil: + """ + Return the year, week, and day of year corresponding to ISO 8601 + + Parameters + ---------- + year : int + month : int + day : int + + Returns + ------- + year : int32_t + week : int32_t + day : int32_t + Notes ----- Assumes the inputs describe a valid date. """ cdef: int32_t doy, dow - int woy + int32_t iso_year, iso_week doy = get_day_of_year(year, month, day) dow = dayofweek(year, month, day) # estimate - woy = (doy - 1) - dow + 3 - if woy >= 0: - woy = woy // 7 + 1 + iso_week = (doy - 1) - dow + 3 + if iso_week >= 0: + iso_week = iso_week // 7 + 1 # verify - if woy < 0: - if (woy > -2) or (woy == -2 and is_leapyear(year - 1)): - woy = 53 + if iso_week < 0: + if (iso_week > -2) or (iso_week == -2 and is_leapyear(year - 1)): + iso_week = 53 else: - woy = 52 - elif woy == 53: + iso_week = 52 + elif iso_week == 53: if 31 - day + dow < 3: - woy = 1 + iso_week = 1 + + iso_year = year + if iso_week == 1 and doy > 7: + iso_year += 1 + + elif iso_week >= 52 and doy < 7: + iso_year -= 1 - return woy + return iso_year, iso_week, dow + 1 @cython.wraparound(False) diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 50b7fba67e78f..184d368659714 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -8,14 +8,14 @@ from cython import Py_ssize_t import numpy as np cimport numpy as cnp -from numpy cimport ndarray, int64_t, int32_t, int8_t +from numpy cimport ndarray, int64_t, int32_t, int8_t, uint32_t cnp.import_array() from pandas._libs.tslibs.ccalendar import ( get_locale_names, MONTHS_FULL, DAYS_FULL, DAY_SECONDS) from pandas._libs.tslibs.ccalendar cimport ( get_days_in_month, is_leapyear, dayofweek, get_week_of_year, - get_day_of_year) + get_day_of_year, get_iso_calendar, iso_calendar_t) from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, td64_to_tdstruct) @@ -670,3 +670,42 @@ cpdef isleapyear_arr(ndarray years): np.logical_and(years % 4 == 0, years % 100 > 0))] = 1 return out.view(bool) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def build_isocalendar_sarray(const int64_t[:] dtindex): + """ + Given a int64-based datetime array, return the ISO 8601 year, week, and day + as a structured array. 
+ """ + cdef: + Py_ssize_t i, count = len(dtindex) + npy_datetimestruct dts + ndarray[uint32_t] iso_years, iso_weeks, days + iso_calendar_t ret_val + + sa_dtype = [ + ("year", "u4"), + ("week", "u4"), + ("day", "u4"), + ] + + out = np.empty(count, dtype=sa_dtype) + + iso_years = out["year"] + iso_weeks = out["week"] + days = out["day"] + + with nogil: + for i in range(count): + if dtindex[i] == NPY_NAT: + ret_val = 0, 0, 0 + else: + dt64_to_dtstruct(dtindex[i], &dts) + ret_val = get_iso_calendar(dts.year, dts.month, dts.day) + + iso_years[i] = ret_val[0] + iso_weeks[i] = ret_val[1] + days[i] = ret_val[2] + return out diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 3af2279e2440f..c5092c8630f06 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -778,36 +778,32 @@ cdef class _Timedelta(timedelta): if isinstance(other, _Timedelta): ots = other - elif PyDelta_Check(other) or isinstance(other, Tick): + elif (is_timedelta64_object(other) or PyDelta_Check(other) + or isinstance(other, Tick)): ots = Timedelta(other) - else: - ndim = getattr(other, "ndim", -1) + # TODO: watch out for overflows - if ndim != -1: - if ndim == 0: - if is_timedelta64_object(other): - other = Timedelta(other) - else: - if op == Py_EQ: - return False - elif op == Py_NE: - return True - # only allow ==, != ops - raise TypeError(f'Cannot compare type ' - f'{type(self).__name__} with ' - f'type {type(other).__name__}') - if util.is_array(other): - return PyObject_RichCompare(np.array([self]), other, op) - return PyObject_RichCompare(other, self, reverse_ops[op]) - else: - if other is NaT: - return PyObject_RichCompare(other, self, reverse_ops[op]) - elif op == Py_EQ: - return False - elif op == Py_NE: - return True - raise TypeError(f'Cannot compare type {type(self).__name__} with ' - f'type {type(other).__name__}') + elif other is NaT: + return op == Py_NE + + elif util.is_array(other): + # TODO: watch out for zero-dim + if other.dtype.kind == "m": + return PyObject_RichCompare(self.asm8, other, op) + elif other.dtype.kind == "O": + # operate element-wise + return np.array( + [PyObject_RichCompare(self, x, op) for x in other], + dtype=bool, + ) + if op == Py_EQ: + return np.zeros(other.shape, dtype=bool) + elif op == Py_NE: + return np.ones(other.shape, dtype=bool) + return NotImplemented # let other raise TypeError + + else: + return NotImplemented return cmp_scalar(self.value, ots.value, op) diff --git a/pandas/_typing.py b/pandas/_typing.py index e1b6a5e2e6876..850f10bd7f811 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -75,3 +75,7 @@ # to maintain type information across generic functions and parametrization T = TypeVar("T") +# used in decorators to preserve the signature of the function it decorates +# see https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators +FuncType = Callable[..., Any] +F = TypeVar("F", bound=FuncType) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 3547a33ea357b..6570e0782a69a 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -12,6 +12,8 @@ import sys import warnings +from pandas._typing import F + PY37 = sys.version_info >= (3, 7) PY38 = sys.version_info >= (3, 8) PYPY = platform.python_implementation() == "PyPy" @@ -25,7 +27,7 @@ # found at https://bitbucket.org/gutworth/six -def set_function_name(f, name, cls): +def set_function_name(f: F, name: str, cls) -> F: """ Bind the name/qualname attributes of the function. 
""" diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index b3723340cefd6..1b9ed014f27b7 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -3,6 +3,8 @@ for missing values. """ +from typing import Callable + import numpy as np from pandas._libs import missing as libmissing @@ -11,14 +13,19 @@ from pandas.core.nanops import check_below_min_count -def sum( - values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0, +def _sumprod( + func: Callable, + values: np.ndarray, + mask: np.ndarray, + skipna: bool = True, + min_count: int = 0, ): """ - Sum for 1D masked array. + Sum or product for 1D masked array. Parameters ---------- + func : np.sum or np.prod values : np.ndarray Numpy array with the values (can be of any dtype that support the operation). @@ -31,23 +38,33 @@ def sum( ``min_count`` non-NA values are present the result will be NA. """ if not skipna: - if mask.any(): + if mask.any() or check_below_min_count(values.shape, None, min_count): return libmissing.NA else: - if check_below_min_count(values.shape, None, min_count): - return libmissing.NA - return np.sum(values) + return func(values) else: if check_below_min_count(values.shape, mask, min_count): return libmissing.NA if _np_version_under1p17: - return np.sum(values[~mask]) + return func(values[~mask]) else: - return np.sum(values, where=~mask) + return func(values, where=~mask) + + +def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count + ) -def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True): +def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return _sumprod( + np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count + ) + + +def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True): """ Reduction for 1D masked array. @@ -63,18 +80,15 @@ def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True): Whether to skip NA. 
""" if not skipna: - if mask.any(): + if mask.any() or not values.size: + # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA else: - if values.size: - return func(values) - else: - # min/max with empty array raise in numpy, pandas returns NA - return libmissing.NA + return func(values) else: subset = values[~mask] if subset.size: - return func(values[~mask]) + return func(subset) else: # min/max with empty array raise in numpy, pandas returns NA return libmissing.NA diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 6cb597ba75852..7447d593a7ff0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1206,7 +1206,7 @@ def _maybe_convert(arr): return _maybe_convert(res) - op_name = ops._get_op_name(op, True) + op_name = f"__{op.__name__}__" return set_function_name(_binop, op_name, cls) @classmethod diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index e85534def6b97..685a9ec48228f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -24,7 +24,7 @@ ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops from pandas.core.array_algos import masked_reductions @@ -271,18 +271,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): raise TypeError( "values should be boolean numpy array. Use " - "the 'array' function instead" + "the 'pd.array' function instead" ) - if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): - raise TypeError( - "mask should be boolean numpy array. 
Use " - "the 'array' function instead" - ) - if not values.ndim == 1: - raise ValueError("values must be a 1D array") - if not mask.ndim == 1: - raise ValueError("mask must be a 1D array") - self._dtype = BooleanDtype() super().__init__(values, mask, copy=copy) @@ -520,7 +510,7 @@ def any(self, skipna: bool = True, **kwargs): if skipna: return result else: - if result or len(self) == 0: + if result or len(self) == 0 or not self._mask.any(): return result else: return self.dtype.na_value @@ -587,7 +577,7 @@ def all(self, skipna: bool = True, **kwargs): if skipna: return result else: - if not result or len(self) == 0: + if not result or len(self) == 0 or not self._mask.any(): return result else: return self.dtype.na_value @@ -696,7 +686,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name in {"sum", "min", "max"}: + if name in {"sum", "prod", "min", "max"}: op = getattr(masked_reductions, name) return op(data, mask, skipna=skipna, **kwargs) @@ -710,12 +700,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if np.isnan(result): return libmissing.NA - # if we have numeric op that would result in an int, coerce to int if possible - if name == "prod" and notna(result): - int_result = np.int64(result) - if int_result == result: - result = int_result - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c9b8db28e0cf6..b3fb3459891e0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2143,7 +2143,7 @@ def min(self, skipna=True): good = self._codes != -1 if not good.all(): - if skipna: + if skipna and good.any(): pointer = self._codes[good].min() else: return np.nan @@ -2178,7 +2178,7 @@ def max(self, skipna=True): good = self._codes != -1 if not good.all(): - if skipna: + if skipna and good.any(): pointer = self._codes[good].max() else: return np.nan diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4fabd8f558fee..30a34282889f8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -574,6 +574,8 @@ def __getitem__(self, key): freq = self.freq result = getitem(key) + if lib.is_scalar(result): + return self._box_func(result) return self._simple_new(result, dtype=self.dtype, freq=freq) def __setitem__( @@ -784,7 +786,7 @@ def shift(self, periods=1, fill_value=None, axis=0): "will raise in a future version, pass " f"{self._scalar_type.__name__} instead.", FutureWarning, - stacklevel=7, + stacklevel=9, ) fill_value = new_fill diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b9f9edcebad5b..f5cc0817e8bd7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -18,6 +18,7 @@ timezones, tzconversion, ) +import pandas._libs.tslibs.frequencies as libfrequencies from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import ( @@ -1097,7 +1098,14 @@ def to_period(self, freq=None): "You must pass a freq argument as current index has none." 
) - freq = get_period_alias(freq) + res = get_period_alias(freq) + + # https://github.com/pandas-dev/pandas/issues/33358 + if res is None: + base, stride = libfrequencies._base_and_stride(freq) + res = f"{stride}{base}" + + freq = res return PeriodArray._from_datetime64(self._data, freq, tz=self.tz) @@ -1234,6 +1242,49 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day calculated according to + the ISO 8601 standard. + + .. versionadded:: 1.1.0 + + Returns + ------- + DataFrame + with columns year, week and day + + See Also + -------- + Timestamp.isocalendar + datetime.date.isocalendar + + Examples + -------- + >>> idx = pd.date_range(start='2019-12-29', freq='D', periods=4) + >>> idx.isocalendar() + year week day + 0 2019 52 7 + 1 2020 1 1 + 2 2020 1 2 + 3 2020 1 3 + >>> idx.isocalendar().week + 0 52 + 1 1 + 2 1 + 3 1 + Name: week, dtype: UInt32 + """ + from pandas import DataFrame + + sarray = fields.build_isocalendar_sarray(self.asi8) + iso_calendar_df = DataFrame( + sarray, columns=["year", "week", "day"], dtype="UInt32" + ) + if self._hasnans: + iso_calendar_df.iloc[self._isnan] = None + return iso_calendar_df + year = _field_accessor( "year", "Y", diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d47a396bbb14e..5605b3fbc5dfa 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -28,7 +28,6 @@ from pandas.core import nanops, ops from pandas.core.array_algos import masked_reductions -import pandas.core.common as com from pandas.core.indexers import check_array_indexer from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer @@ -343,15 +342,10 @@ def dtype(self) -> _IntegerDtype: return _dtypes[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)): + if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): raise TypeError( "values should be integer numpy array. Use " - "the 'integer_array' function instead" - ) - if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): - raise TypeError( - "mask should be boolean numpy array. 
Use " - "the 'integer_array' function instead" + "the 'pd.array' function instead" ) super().__init__(values, mask, copy=copy) @@ -562,7 +556,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name in {"sum", "min", "max"}: + if name in {"sum", "prod", "min", "max"}: op = getattr(masked_reductions, name) return op(data, mask, skipna=skipna, **kwargs) @@ -577,16 +571,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): if np.isnan(result): return libmissing.NA - # if we have a boolean op, don't coerce - if name in ["any", "all"]: - pass - - # if we have a preservable numeric op, - # provide coercion back to an integer type if possible - elif name == "prod": - # GH#31409 more performant than casting-then-checking - result = com.cast_scalar_indexer(result) - return result def _maybe_mask_result(self, result, mask, other, op_name: str): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 22ce5a6f87a43..220b70ff71b28 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -542,19 +542,19 @@ def __setitem__(self, key, value): msg = f"'value' should be an interval type, got {type(value)} instead." raise TypeError(msg) from err + if needs_float_conversion: + raise ValueError("Cannot set float NaN to integer-backed IntervalArray") + key = check_array_indexer(self, key) + # Need to ensure that left and right are updated atomically, so we're # forced to copy, update the copy, and swap in the new values. left = self.left.copy(deep=True) - if needs_float_conversion: - left = left.astype("float") - left.values[key] = value_left + left._values[key] = value_left self._left = left right = self.right.copy(deep=True) - if needs_float_conversion: - right = right.astype("float") - right.values[key] = value_right + right._values[key] = value_right self._right = right def __eq__(self, other): diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d23d26d870f75..fc5b307bd5754 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -30,6 +30,17 @@ class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): _internal_fill_value: Scalar def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + # values is supposed to already be validated in the subclass + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be boolean numpy array. 
Use " + "the 'pd.array' function instead" + ) + if not values.ndim == 1: + raise ValueError("values must be a 1D array") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D array") + if copy: values = values.copy() mask = mask.copy() diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 39a3b553b3cf4..99d9d69d66ec2 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -831,11 +831,11 @@ def period_array( """ if is_datetime64_dtype(data): return PeriodArray._from_datetime64(data, freq) - if isinstance(data, (ABCPeriodIndex, ABCSeries, PeriodArray)): + if is_period_dtype(data): return PeriodArray(data, freq) # other iterable of some kind - if not isinstance(data, (np.ndarray, list, tuple)): + if not isinstance(data, (np.ndarray, list, tuple, ABCSeries)): data = list(data) data = np.asarray(data) diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index c59952bea8dc0..6cd9a15b70d39 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -635,8 +635,9 @@ def visit_Attribute(self, node, **kwargs): # something like datetime.datetime where scope is overridden if isinstance(value, ast.Name) and value.id == attr: return resolved + raise - raise ValueError(f"Invalid Attribute context {ctx.__name__}") + raise ValueError(f"Invalid Attribute context {type(ctx).__name__}") def visit_Call(self, node, side=None, **kwargs): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c3018861bce57..d19f1a263f71a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,6 +23,7 @@ FrozenSet, Hashable, Iterable, + Iterator, List, Optional, Sequence, @@ -40,7 +41,16 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib, properties -from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer +from pandas._typing import ( + ArrayLike, + Axes, + Axis, + Dtype, + FilePathOrBuffer, + Label, + Level, + Renamer, +) from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -600,6 +610,16 @@ def _is_homogeneous_type(self) -> bool: else: return not self._mgr.is_mixed_type + @property + def _can_fast_transpose(self) -> bool: + """ + Can we transpose this DataFrame without creating any new array objects. + """ + if self._data.any_extension_types: + # TODO(EA2D) special case would be unnecessary with 2D EAs + return False + return len(self._data.blocks) == 1 + # ---------------------------------------------------------------------- # Rendering Methods @@ -2063,18 +2083,24 @@ def to_stata( writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path) -> None: + def to_feather(self, path, **kwargs) -> None: """ - Write out the binary feather-format for DataFrames. + Write a DataFrame to the binary Feather format. Parameters ---------- path : str String file path. + **kwargs : + Additional keywords passed to :func:`pyarrow.feather.write_feather`. + Starting with pyarrow 0.17, this includes the `compression`, + `compression_level`, `chunksize` and `version` keywords. + + .. 
versionadded:: 1.1.0
         """
         from pandas.io.feather_format import to_feather
 
-        to_feather(self, path)
+        to_feather(self, path, **kwargs)
 
     @Appender(
         """
@@ -2567,6 +2593,21 @@ def _ixs(self, i: int, axis: int = 0):
 
         return result
 
+    def _get_column_array(self, i: int) -> ArrayLike:
+        """
+        Get the values of the i'th column (ndarray or ExtensionArray, as stored
+        in the Block)
+        """
+        return self._data.iget_values(i)
+
+    def _iter_column_arrays(self) -> Iterator[ArrayLike]:
+        """
+        Iterate over the arrays of all columns in order.
+        This returns the values as stored in the Block (ndarray or ExtensionArray).
+        """
+        for i in range(len(self.columns)):
+            yield self._get_column_array(i)
+
     def __getitem__(self, key):
         key = lib.item_from_zerodim(key)
         key = com.apply_if_callable(key, self)
@@ -3630,7 +3671,7 @@ def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame":
             fill_value=fill_value,
         )
 
-    @Appender(_shared_docs["align"] % _shared_doc_kwargs)
+    @doc(NDFrame.align, **_shared_doc_kwargs)
     def align(
         self,
         other,
@@ -4011,7 +4052,7 @@ def fillna(
             downcast=downcast,
         )
 
-    @Appender(_shared_docs["replace"] % _shared_doc_kwargs)
+    @doc(NDFrame.replace, **_shared_doc_kwargs)
     def replace(
         self,
         to_replace=None,
@@ -4065,7 +4106,7 @@ def _replace_columnwise(
                 return
         return res.__finalize__(self)
 
-    @Appender(_shared_docs["shift"] % _shared_doc_kwargs)
+    @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
     def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame":
         return super().shift(
             periods=periods, freq=freq, axis=axis, fill_value=fill_value
@@ -4703,6 +4744,47 @@ def drop_duplicates(
         See Also
         --------
         DataFrame.value_counts: Count unique combinations of columns.
+
+        Examples
+        --------
+        Consider a dataset containing ramen ratings.
+
+        >>> df = pd.DataFrame({
+        ...     'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
+        ...     'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
+        ...     'rating': [4, 4, 3.5, 15, 5]
+        ... })
+        >>> df
+            brand style  rating
+        0  Yum Yum   cup     4.0
+        1  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+        3  Indomie  pack    15.0
+        4  Indomie  pack     5.0
+
+        By default, it removes duplicate rows based on all columns.
+
+        >>> df.drop_duplicates()
+            brand style  rating
+        0  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+        3  Indomie  pack    15.0
+        4  Indomie  pack     5.0
+
+        To remove duplicates on specific column(s), use ``subset``.
+
+        >>> df.drop_duplicates(subset=['brand'])
+            brand style  rating
+        0  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+
+        To remove duplicates and keep the last occurrences, use ``keep``.
+ + >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') + brand style rating + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 4 Indomie pack 5.0 """ if self.empty: return self.copy() @@ -6768,6 +6850,11 @@ def diff(self, periods: int = 1, axis: Axis = 0) -> "DataFrame": 5 NaN NaN NaN """ bm_axis = self._get_block_manager_axis(axis) + self._consolidate_inplace() + + if bm_axis == 0 and periods != 0: + return self.T.diff(periods, axis=0).T + new_data = self._mgr.diff(n=periods, axis=bm_axis) return self._constructor(new_data) @@ -8025,8 +8112,12 @@ def _reduce( assert filter_type is None or filter_type == "bool", filter_type - dtype_is_dt = self.dtypes.apply( - lambda x: is_datetime64_any_dtype(x) or is_period_dtype(x) + dtype_is_dt = np.array( + [ + is_datetime64_any_dtype(values.dtype) or is_period_dtype(values.dtype) + for values in self._iter_column_arrays() + ], + dtype=bool, ) if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): warnings.warn( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 47a2b22abe103..6a4f83427310e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3096,7 +3096,8 @@ def to_csv( compression mode is 'infer' and `path_or_buf` is path-like, then detect compression mode from the following extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given - and mode is 'zip' or inferred as 'zip', other entries passed as + and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as + one of the above, other entries passed as additional compression options. .. versionchanged:: 1.0.0 @@ -3105,6 +3106,12 @@ def to_csv( and other entries as additional compression options if compression mode is 'zip'. + .. versionchanged:: 1.1.0 + + Passing compression options as keys in dict is + supported for compression modes 'gzip' and 'bz2' + as well as 'zip'. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC @@ -5538,6 +5545,24 @@ def astype( 0 10 1 2 dtype: int64 + + Create a series of dates: + + >>> ser_date = pd.Series(pd.date_range('20200101', periods=3)) + >>> ser_date + 0 2020-01-01 + 1 2020-01-02 + 2 2020-01-03 + dtype: datetime64[ns] + + Datetimes are localized to UTC first before + converting to the specified timezone: + + >>> ser_date.astype('datetime64[ns, US/Eastern]') + 0 2019-12-31 19:00:00-05:00 + 1 2020-01-01 19:00:00-05:00 + 2 2020-01-02 19:00:00-05:00 + dtype: datetime64[ns, US/Eastern] """ if is_dict_like(dtype): if self.ndim == 1: # i.e. Series @@ -5709,7 +5734,6 @@ def _convert( numeric: bool_t = False, timedelta: bool_t = False, coerce: bool_t = False, - copy: bool_t = True, ) -> FrameOrSeries: """ Attempt to infer better dtype for object columns @@ -5726,10 +5750,6 @@ def _convert( coerce : bool, default False If True, force conversion with unconvertible values converted to nulls (NaN or NaT). - copy : bool, default True - If True, return a copy even if no copy is necessary (e.g. no - conversion was done). Note: This is meant for internal use, and - should not be confused with inplace. 
 Returns
 -------
@@ -5739,14 +5759,13 @@
         validate_bool_kwarg(numeric, "numeric")
         validate_bool_kwarg(timedelta, "timedelta")
         validate_bool_kwarg(coerce, "coerce")
-        validate_bool_kwarg(copy, "copy")
         return self._constructor(
             self._mgr.convert(
                 datetime=datetime,
                 numeric=numeric,
                 timedelta=timedelta,
                 coerce=coerce,
-                copy=copy,
+                copy=True,
             )
         ).__finalize__(self)
@@ -6162,12 +6181,20 @@ def bfill(
             method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
         )
 
-    _shared_docs[
-        "replace"
-    ] = """
+    @doc(klass=_shared_doc_kwargs["klass"])
+    def replace(
+        self,
+        to_replace=None,
+        value=None,
+        inplace=False,
+        limit=None,
+        regex=False,
+        method="pad",
+    ):
+        """
         Replace values given in `to_replace` with `value`.
 
-        Values of the %(klass)s are replaced with other values dynamically.
+        Values of the {klass} are replaced with other values dynamically.
         This differs from updating with ``.loc`` or ``.iloc``, which require
         you to specify a location to update with some value.
@@ -6199,19 +6226,19 @@
             - Dicts can be used to specify different replacement values
               for different existing values. For example,
-              ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and
+              ``{{'a': 'b', 'y': 'z'}}`` replaces the value 'a' with 'b' and
               'y' with 'z'. To use a dict in this way the `value`
               parameter should be `None`.
             - For a DataFrame a dict can specify that different values
               should be replaced in different columns. For example,
-              ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a'
+              ``{{'a': 1, 'b': 'z'}}`` looks for the value 1 in column 'a'
               and the value 'z' in column 'b' and replaces these values
               with whatever is specified in `value`. The `value` parameter
               should not be ``None`` in this case. You can treat this as a
               special case of passing two lists except that you are
               specifying the column to search in.
            - For a DataFrame nested dictionaries, e.g.,
-              ``{'a': {'b': np.nan}}``, are read as follows: look in column
+              ``{{'a': {{'b': np.nan}}}}``, are read as follows: look in column
              'a' for the value 'b' and replace it with NaN. The `value`
              parameter should be ``None`` to use a nested dict in this
              way. You can nest regular expressions as well. Note that
@@ -6244,7 +6271,7 @@
              string. Alternatively, this could be a regular expression or a
              list, dict, or array of regular expressions in which case
              `to_replace` must be ``None``.
-        method : {'pad', 'ffill', 'bfill', `None`}
+        method : {{'pad', 'ffill', 'bfill', `None`}}
            The method to use for replacement when `to_replace` is a
            scalar, list or tuple and `value` is ``None``.
@@ -6253,7 +6280,7 @@
 
         Returns
         -------
-        %(klass)s
+        {klass}
            Object after replacement.
 
         Raises
@@ -6279,8 +6306,8 @@
 
         See Also
         --------
-        %(klass)s.fillna : Fill NA values.
-        %(klass)s.where : Replace values based on boolean condition.
+        {klass}.fillna : Fill NA values.
+        {klass}.where : Replace values based on boolean condition.
         Series.str.replace : Simple string replacement.
 
         Notes
@@ -6312,9 +6339,9 @@
             4    4
             dtype: int64
 
-        >>> df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
+        >>> df = pd.DataFrame({{'A': [0, 1, 2, 3, 4],
         ...                    'B': [5, 6, 7, 8, 9],
-        ...                    'C': ['a', 'b', 'c', 'd', 'e']})
+        ...                    
'C': ['a', 'b', 'c', 'd', 'e']}}) >>> df.replace(0, 5) A B C 0 5 5 a @@ -6351,7 +6378,7 @@ def bfill( **dict-like `to_replace`** - >>> df.replace({0: 10, 1: 100}) + >>> df.replace({{0: 10, 1: 100}}) A B C 0 10 5 a 1 100 6 b @@ -6359,7 +6386,7 @@ def bfill( 3 3 8 d 4 4 9 e - >>> df.replace({'A': 0, 'B': 5}, 100) + >>> df.replace({{'A': 0, 'B': 5}}, 100) A B C 0 100 100 a 1 1 6 b @@ -6367,7 +6394,7 @@ def bfill( 3 3 8 d 4 4 9 e - >>> df.replace({'A': {0: 100, 4: 400}}) + >>> df.replace({{'A': {{0: 100, 4: 400}}}}) A B C 0 100 5 a 1 1 6 b @@ -6377,15 +6404,15 @@ def bfill( **Regular expression `to_replace`** - >>> df = pd.DataFrame({'A': ['bat', 'foo', 'bait'], - ... 'B': ['abc', 'bar', 'xyz']}) + >>> df = pd.DataFrame({{'A': ['bat', 'foo', 'bait'], + ... 'B': ['abc', 'bar', 'xyz']}}) >>> df.replace(to_replace=r'^ba.$', value='new', regex=True) A B 0 new abc 1 foo new 2 bait xyz - >>> df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True) + >>> df.replace({{'A': r'^ba.$'}}, {{'A': 'new'}}, regex=True) A B 0 new abc 1 foo bar @@ -6397,7 +6424,7 @@ def bfill( 1 foo new 2 bait xyz - >>> df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}) + >>> df.replace(regex={{r'^ba.$': 'new', 'foo': 'xyz'}}) A B 0 new abc 1 xyz new @@ -6413,9 +6440,9 @@ def bfill( the data types in the `to_replace` parameter must match the data type of the value being replaced: - >>> df = pd.DataFrame({'A': [True, False, True], - ... 'B': [False, True, False]}) - >>> df.replace({'a string': 'new value', True: False}) # raises + >>> df = pd.DataFrame({{'A': [True, False, True], + ... 'B': [False, True, False]}}) + >>> df.replace({{'a string': 'new value', True: False}}) # raises Traceback (most recent call last): ... TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' @@ -6423,7 +6450,7 @@ def bfill( This raises a ``TypeError`` because one of the ``dict`` keys is not of the correct type for replacement. - Compare the behavior of ``s.replace({'a': None})`` and + Compare the behavior of ``s.replace({{'a': None}})`` and ``s.replace('a', None)`` to understand the peculiarities of the `to_replace` parameter: @@ -6431,10 +6458,10 @@ def bfill( When one uses a dict as the `to_replace` value, it is like the value(s) in the dict are equal to the `value` parameter. - ``s.replace({'a': None})`` is equivalent to - ``s.replace(to_replace={'a': None}, value=None, method=None)``: + ``s.replace({{'a': None}})`` is equivalent to + ``s.replace(to_replace={{'a': None}}, value=None, method=None)``: - >>> s.replace({'a': None}) + >>> s.replace({{'a': None}}) 0 10 1 None 2 None @@ -6457,17 +6484,6 @@ def bfill( 4 b dtype: object """ - - @Appender(_shared_docs["replace"] % _shared_doc_kwargs) - def replace( - self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method="pad", - ): if not ( is_scalar(to_replace) or is_re_compilable(to_replace) @@ -8246,9 +8262,21 @@ def ranker(data): return ranker(data) - _shared_docs[ - "align" - ] = """ + @doc(**_shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): + """ Align two objects on their axes with the specified join method. Join method is specified for each axis Index. 
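The replace/align/shift hunks in this region swap the old `_shared_docs` + `Appender` pipeline, which interpolated `%(klass)s` placeholders, for the `@doc` decorator, which runs the docstring through `str.format`. That is why every literal brace in these shared docstrings is now doubled: ``{{'a': 'b'}}`` renders as ``{'a': 'b'}``. A minimal sketch of the substitution mechanics only, not the actual `pandas.util._decorators.doc` implementation (which also composes docstrings from other functions):

    def doc(**params):
        # Format the wrapped function's docstring with str.format:
        # "{klass}" becomes params["klass"], while "{{" and "}}"
        # collapse to literal braces.
        def decorator(func):
            if func.__doc__:
                func.__doc__ = func.__doc__.format(**params)
            return func
        return decorator

    @doc(klass="DataFrame")
    def replace(self, to_replace=None, value=None):
        """
        Replace values given in `to_replace` with `value`.

        Values of the {klass} are replaced dynamically, e.g.
        ``{{'a': 'b'}}`` replaces 'a' with 'b'.
        """

    print(replace.__doc__)  # "{klass}" has been rendered as "DataFrame"
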
@@ -8256,7 +8284,7 @@ def ranker(data): Parameters ---------- other : DataFrame or Series - join : {'outer', 'inner', 'left', 'right'}, default 'outer' + join : {{'outer', 'inner', 'left', 'right'}}, default 'outer' axis : allowed axis of the other object, default None Align on index (0), columns (1), or both (None). level : int or level name, default None @@ -8268,7 +8296,7 @@ def ranker(data): fill_value : scalar, default np.NaN Value to use for missing values. Defaults to NaN, but can be any "compatible" value. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None Method to use for filling holes in reindexed Series: - pad / ffill: propagate last valid observation forward to next valid. @@ -8281,32 +8309,18 @@ def ranker(data): be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. - fill_axis : %(axes_single_arg)s, default 0 + fill_axis : {axes_single_arg}, default 0 Filling axis, method and limit. - broadcast_axis : %(axes_single_arg)s, default None + broadcast_axis : {axes_single_arg}, default None Broadcast values along this axis, if aligning two objects of different dimensions. Returns ------- - (left, right) : (%(klass)s, type of other) + (left, right) : ({klass}, type of other) Aligned objects. """ - @Appender(_shared_docs["align"] % _shared_doc_kwargs) - def align( - self, - other, - join="outer", - axis=None, - level=None, - copy=True, - fill_value=None, - method=None, - limit=None, - fill_axis=0, - broadcast_axis=None, - ): method = missing.clean_fill_method(method) if broadcast_axis == 1 and self.ndim != other.ndim: @@ -8850,9 +8864,11 @@ def mask( errors=errors, ) - _shared_docs[ - "shift" - ] = """ + @doc(klass=_shared_doc_kwargs["klass"]) + def shift( + self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None + ) -> FrameOrSeries: + """ Shift index by desired number of periods with an optional time `freq`. When `freq` is not passed, shift the index without realigning the data. @@ -8869,7 +8885,7 @@ def mask( If `freq` is specified then the index values are shifted but the data is not realigned. That is, use `freq` if you would like to extend the index when shifting and preserve the original data. - axis : {0 or 'index', 1 or 'columns', None}, default None + axis : {{0 or 'index', 1 or 'columns', None}}, default None Shift direction. fill_value : object, optional The scalar value to use for newly introduced missing values. @@ -8882,7 +8898,7 @@ def mask( Returns ------- - %(klass)s + {klass} Copy of input object, shifted. See Also @@ -8895,9 +8911,9 @@ def mask( Examples -------- - >>> df = pd.DataFrame({'Col1': [10, 20, 15, 30, 45], + >>> df = pd.DataFrame({{'Col1': [10, 20, 15, 30, 45], ... 'Col2': [13, 23, 18, 33, 48], - ... 'Col3': [17, 27, 22, 37, 52]}) + ... 
'Col3': [17, 27, 22, 37, 52]}}) >>> df.shift(periods=3) Col1 Col2 Col3 @@ -8922,12 +8938,7 @@ def mask( 2 0 0 0 3 10 13 17 4 20 23 27 - """ - - @Appender(_shared_docs["shift"] % _shared_doc_kwargs) - def shift( - self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None - ) -> FrameOrSeries: + """ if periods == 0: return self.copy() diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 208cbfc5b06d6..13938c41a0f6b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -29,7 +29,7 @@ import numpy as np -from pandas._libs import Timestamp, lib +from pandas._libs import lib from pandas._typing import FrameOrSeries from pandas.util._decorators import Appender, Substitution, doc @@ -388,7 +388,7 @@ def _wrap_aggregated_output( result = self._wrap_series_output( output=output, index=self.grouper.result_index ) - return self._reindex_output(result)._convert(datetime=True) + return self._reindex_output(result) def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] @@ -833,10 +833,13 @@ class DataFrameGroupBy(GroupBy[DataFrame]): """ Examples -------- - - >>> df = pd.DataFrame({'A': [1, 1, 2, 2], - ... 'B': [1, 2, 3, 4], - ... 'C': np.random.randn(4)}) + >>> df = pd.DataFrame( + ... { + ... "A": [1, 1, 2, 2], + ... "B": [1, 2, 3, 4], + ... "C": [0.362838, 0.227877, 1.267767, -0.562860], + ... } + ... ) >>> df A B C @@ -876,7 +879,7 @@ class DataFrameGroupBy(GroupBy[DataFrame]): B C min max sum A - 1 1 2 0.590716 + 1 1 2 0.590715 2 3 4 0.704907 To control the output names with different aggregations per column, @@ -887,8 +890,9 @@ class DataFrameGroupBy(GroupBy[DataFrame]): ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) b_min c_sum A - 1 1 -1.956929 - 2 3 -0.322183 + 1 1 0.590715 + 2 3 0.704907 + - The keywords are the *output* column names - The values are tuples whose first element is the column to select @@ -1193,20 +1197,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): key_names = self.grouper.names - # GH12824. - def first_not_none(values): - try: - return next(com.not_none(*values)) - except StopIteration: - return None + # GH12824 + first_not_none = next(com.not_none(*values), None) - v = first_not_none(values) - - if v is None: + if first_not_none is None: # GH9684. If all values are None, then this will throw an error. # We'd prefer it return an empty dataframe. 
return DataFrame() - elif isinstance(v, DataFrame): + elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: if len(self.grouper.groupings) > 1: @@ -1223,6 +1221,9 @@ def first_not_none(values): # reorder the values values = [values[i] for i in indexer] + + # update due to the potential reorder + first_not_none = next(com.not_none(*values), None) else: key_index = Index(keys, name=key_names[0]) @@ -1232,20 +1233,19 @@ def first_not_none(values): key_index = None # make Nones an empty object - v = first_not_none(values) - if v is None: + if first_not_none is None: return DataFrame() - elif isinstance(v, NDFrame): + elif isinstance(first_not_none, NDFrame): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object - kwargs = v._construct_axes_dict() - if v._constructor is Series: + kwargs = first_not_none._construct_axes_dict() + if first_not_none._constructor is Series: backup = create_series_with_explicit_dtype( **kwargs, dtype_if_empty=object ) else: - backup = v._constructor(**kwargs) + backup = first_not_none._constructor(**kwargs) values = [x if (x is not None) else backup for x in values] @@ -1346,14 +1346,10 @@ def first_not_none(values): # values are not series or array-like but scalars else: - # only coerce dates if we find at least 1 datetime - should_coerce = any(isinstance(x, Timestamp) for x in values) # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns - return Series(values, index=key_index)._convert( - datetime=True, coerce=should_coerce - ) + return Series(values, index=key_index) else: # Handle cases like BinGrouper @@ -1703,7 +1699,7 @@ def _wrap_aggregated_output( if self.axis == 1: result = result.T - return self._reindex_output(result)._convert(datetime=True) + return self._reindex_output(result) def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 7a7ac58b9d11b..873f24b9685e3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -202,14 +202,14 @@ class providing the base-class of operations. functions that expect Series, DataFrames, GroupBy or Resampler objects. Instead of writing ->>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) +>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP You can write >>> (df.groupby('group') ... .pipe(f) ... .pipe(g, arg1=a) -... .pipe(h, arg2=b, arg3=c)) +... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP which is much more readable. @@ -2017,7 +2017,9 @@ def cumcount(self, ascending: bool = True): Essentially this is equivalent to - >>> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) + .. 
code-block:: python + + self.apply(lambda x: pd.Series(np.arange(len(x)), x.index)) Parameters ---------- diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2f50845fda4dc..9bd098d1d49a3 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -79,16 +79,51 @@ class Grouper: -------- Syntactic sugar for ``df.groupby('A')`` - >>> df.groupby(Grouper(key='A')) - - Specify a resample operation on the column 'date' - - >>> df.groupby(Grouper(key='date', freq='60s')) - - Specify a resample operation on the level 'date' on the columns axis - with a frequency of 60s - - >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) + >>> df = pd.DataFrame( + ... { + ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"], + ... "Speed": [100, 5, 200, 300, 15], + ... } + ... ) + >>> df + Animal Speed + 0 Falcon 100 + 1 Parrot 5 + 2 Falcon 200 + 3 Falcon 300 + 4 Parrot 15 + >>> df.groupby(pd.Grouper(key="Animal")).mean() + Speed + Animal + Falcon 200 + Parrot 10 + + Specify a resample operation on the column 'Publish date' + + >>> df = pd.DataFrame( + ... { + ... "Publish date": [ + ... pd.Timestamp("2000-01-02"), + ... pd.Timestamp("2000-01-02"), + ... pd.Timestamp("2000-01-09"), + ... pd.Timestamp("2000-01-16") + ... ], + ... "ID": [0, 1, 2, 3], + ... "Price": [10, 20, 30, 40] + ... } + ... ) + >>> df + Publish date ID Price + 0 2000-01-02 0 10 + 1 2000-01-02 1 20 + 2 2000-01-09 2 30 + 3 2000-01-16 3 40 + >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean() + ID Price + Publish date + 2000-01-02 0.5 15.0 + 2000-01-09 2.0 30.0 + 2000-01-16 3.0 40.0 """ _attributes: Tuple[str, ...] = ("key", "level", "freq", "axis", "sort") diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index d2cee5d94422c..e56de83d4dae4 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -219,6 +219,37 @@ def to_pydatetime(self) -> np.ndarray: def freq(self): return self._get_values().inferred_freq + def isocalendar(self): + """ + Returns a DataFrame with the year, week, and day calculated according to + the ISO 8601 standard. + + .. 
versionadded:: 1.1.0 + + Returns + ------- + DataFrame + with columns year, week and day + + See Also + -------- + Timestamp.isocalendar + datetime.date.isocalendar + + Examples + -------- + >>> ser = pd.to_datetime(pd.Series(["2010-01-01", pd.NaT])) + >>> ser.dt.isocalendar() + year week day + 0 2009 53 5 + 1 + >>> ser.dt.isocalendar().week + 0 53 + 1 + Name: week, dtype: UInt32 + """ + return self._get_values().isocalendar().set_index(self._parent.index) + @delegate_names( delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property" diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index df58593bc930c..530aaee24c7fb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -48,6 +48,7 @@ is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, + pandas_dtype, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( @@ -68,6 +69,7 @@ from pandas.core.accessor import CachedAccessor import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray +from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com from pandas.core.indexers import deprecate_ndim_indexing @@ -292,10 +294,19 @@ def __new__( name = maybe_extract_name(name, data, cls) + if dtype is not None: + dtype = pandas_dtype(dtype) + if "tz" in kwargs: + tz = kwargs.pop("tz") + validate_tz_from_dtype(dtype, tz) + dtype = tz_to_dtype(tz) + if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. data = data.to_numpy() + data_dtype = getattr(data, "dtype", None) + # range if isinstance(data, RangeIndex): return RangeIndex(start=data, copy=copy, dtype=dtype, name=name) @@ -303,43 +314,39 @@ def __new__( return RangeIndex.from_range(data, dtype=dtype, name=name) # categorical - elif is_categorical_dtype(data) or is_categorical_dtype(dtype): + elif is_categorical_dtype(data_dtype) or is_categorical_dtype(dtype): # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 from pandas.core.indexes.category import CategoricalIndex return _maybe_asobject(dtype, CategoricalIndex, data, copy, name, **kwargs) # interval - elif is_interval_dtype(data) or is_interval_dtype(dtype): + elif is_interval_dtype(data_dtype) or is_interval_dtype(dtype): # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 from pandas.core.indexes.interval import IntervalIndex return _maybe_asobject(dtype, IntervalIndex, data, copy, name, **kwargs) - elif ( - is_datetime64_any_dtype(data) - or is_datetime64_any_dtype(dtype) - or "tz" in kwargs - ): + elif is_datetime64_any_dtype(data_dtype) or is_datetime64_any_dtype(dtype): # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 from pandas import DatetimeIndex return _maybe_asobject(dtype, DatetimeIndex, data, copy, name, **kwargs) - elif is_timedelta64_dtype(data) or is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(data_dtype) or is_timedelta64_dtype(dtype): # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 from pandas import TimedeltaIndex return _maybe_asobject(dtype, TimedeltaIndex, data, copy, name, **kwargs) - elif is_period_dtype(data) or is_period_dtype(dtype): + elif is_period_dtype(data_dtype) or is_period_dtype(dtype): # Delay import for perf. 
https://github.com/pandas-dev/pandas/pull/31423 from pandas import PeriodIndex return _maybe_asobject(dtype, PeriodIndex, data, copy, name, **kwargs) # extension dtype - elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): + elif is_extension_array_dtype(data_dtype) or is_extension_array_dtype(dtype): if not (dtype is None or is_object_dtype(dtype)): # coerce to the provided dtype ea_cls = dtype.construct_array_type() @@ -3286,7 +3293,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): preserve_names = not hasattr(target, "name") # GH7774: preserve dtype/tz if target is empty and not an Index. - target = _ensure_has_len(target) # target may be an iterator + target = ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: if isinstance(self, ABCRangeIndex): @@ -3838,7 +3845,7 @@ def values(self) -> np.ndarray: return self._data.view(np.ndarray) @cache_readonly - @doc(IndexOpsMixin.array) # type: ignore + @doc(IndexOpsMixin.array) def array(self) -> ExtensionArray: array = self._data if isinstance(array, np.ndarray): @@ -4568,10 +4575,7 @@ def get_value(self, series: "Series", key): ------- scalar or Series """ - if not is_scalar(key): - # if key is not a scalar, directly raise an error (the code below - # would convert to numpy arrays and raise later any way) - GH29926 - raise InvalidIndexError(key) + self._check_indexing_error(key) try: # GH 20882, 21257 @@ -4592,6 +4596,12 @@ def get_value(self, series: "Series", key): return self._get_values_for_loc(series, loc, key) + def _check_indexing_error(self, key): + if not is_scalar(key): + # if key is not a scalar, directly raise an error (the code below + # would convert to numpy arrays and raise later any way) - GH29926 + raise InvalidIndexError(key) + def _should_fallback_to_positional(self) -> bool: """ If an integer key is not found, should we fall back to positional indexing? @@ -5573,7 +5583,7 @@ def ensure_index(index_like, copy: bool = False): return Index(index_like) -def _ensure_has_len(seq): +def ensure_has_len(seq): """ If seq is an iterator, put its values into a list. """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 68d6229e798f5..1ec6cf8fd7b4e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -89,6 +89,7 @@ def _new_DatetimeIndex(cls, d): "date", "time", "timetz", + "isocalendar", ] + DatetimeArray._bool_ops, DatetimeArray, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7aa1456846612..42e0d228dab09 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2288,7 +2288,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): # GH7774: preserve dtype/tz if target is empty and not an Index. # target may be an iterator - target = ibase._ensure_has_len(target) + target = ibase.ensure_has_len(target) if len(target) == 0 and not isinstance(target, Index): idx = self.levels[level] attrs = idx._get_attributes_dict() @@ -2333,23 +2333,21 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): # -------------------------------------------------------------------- # Indexing Methods - def get_value(self, series, key): - # Label-based + def _check_indexing_error(self, key): if not is_hashable(key) or is_iterator(key): # We allow tuples if they are hashable, whereas other Index # subclasses require scalar. 
# We have to explicitly exclude generators, as these are hashable. raise InvalidIndexError(key) - try: - loc = self.get_loc(key) - except KeyError: - if is_integer(key): - loc = key - else: - raise - - return self._get_values_for_loc(series, loc, key) + def _should_fallback_to_positional(self) -> bool: + """ + If an integer key is not found, should we fall back to positional indexing? + """ + if not self.nlevels: + return False + # GH#33355 + return self.levels[0]._should_fallback_to_positional() def _get_values_for_loc(self, series: "Series", loc, key): """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 22a44d65a947a..b74399ed86fbd 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -827,7 +827,7 @@ def _getitem_nested_tuple(self, tup: Tuple): # this is iterative obj = self.obj axis = 0 - for i, key in enumerate(tup): + for key in tup: if com.is_null_slice(key): axis += 1 @@ -1420,7 +1420,7 @@ def _is_scalar_access(self, key: Tuple) -> bool: if len(key) != self.ndim: return False - for i, k in enumerate(key): + for k in key: if not is_integer(k): return False @@ -2234,8 +2234,7 @@ def is_nested_tuple(tup, labels) -> bool: if not isinstance(tup, tuple): return False - for i, k in enumerate(tup): - + for k in tup: if is_list_like(k) or isinstance(k, slice): return isinstance(labels, ABCMultiIndex) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 1090f862acb8a..7f06fb3a7788c 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -10,7 +10,6 @@ IntBlock, ObjectBlock, TimeDeltaBlock, - _block_shape, _safe_reshape, make_block, ) @@ -31,7 +30,6 @@ "TimeDeltaBlock", "_safe_reshape", "make_block", - "_block_shape", "BlockManager", "SingleBlockManager", "concatenate_block_managers", diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d12b78f8d046f..185b0f4da2627 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -6,9 +6,9 @@ import numpy as np -from pandas._libs import NaT, Timestamp, algos as libalgos, lib, writers +from pandas._libs import NaT, algos as libalgos, lib, writers import pandas._libs.internals as libinternals -from pandas._libs.tslibs import Timedelta, conversion +from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import ArrayLike from pandas.util._validators import validate_bool_kwarg @@ -243,6 +243,8 @@ def make_block(self, values, placement=None) -> "Block": """ if placement is None: placement = self.mgr_locs + if self.is_extension: + values = _block_shape(values, ndim=self.ndim) return make_block(values, placement=placement, ndim=self.ndim) @@ -279,6 +281,7 @@ def __setstate__(self, state): def _slice(self, slicer): """ return a slice of my values """ + return self.values[slicer] def getitem_block(self, slicer, new_mgr_locs=None): @@ -354,13 +357,12 @@ def _split_op_result(self, result) -> List["Block"]: nbs = [] for i, loc in enumerate(self.mgr_locs): vals = result[i] - nv = _block_shape(vals, ndim=self.ndim) - block = self.make_block(values=nv, placement=[loc]) + block = self.make_block(values=vals, placement=[loc]) nbs.append(block) return nbs if not isinstance(result, Block): - result = self.make_block(values=_block_shape(result, ndim=self.ndim)) + result = self.make_block(result) return [result] @@ -1264,9 +1266,6 @@ def take_nd(self, indexer, axis: int, new_mgr_locs=None, fill_value=lib.no_defau def diff(self, n: int, axis: int = 
1) -> List["Block"]: """ return block for the diff of the values """ new_values = algos.diff(self.values, n, axis=axis, stacklevel=7) - # We use block_shape for ExtensionBlock subclasses, which may call here - # via a super. - new_values = _block_shape(new_values, ndim=self.ndim) return [self.make_block(values=new_values)] def shift(self, periods: int, axis: int = 0, fill_value=None): @@ -1385,15 +1384,13 @@ def equals(self, other) -> bool: return False return array_equivalent(self.values, other.values) - def _unstack(self, unstacker, new_columns, fill_value, value_columns): + def _unstack(self, unstacker, fill_value, new_placement): """ Return a list of unstacked blocks of self Parameters ---------- unstacker : reshape._Unstacker - new_columns : Index - All columns of the unstacked BlockManager. fill_value : int Only used in ExtensionBlock._unstack @@ -1404,17 +1401,17 @@ def _unstack(self, unstacker, new_columns, fill_value, value_columns): mask : array_like of bool The mask of columns of `blocks` we should keep. """ - new_items = unstacker.get_new_columns(value_columns) - new_placement = new_columns.get_indexer(new_items) new_values, mask = unstacker.get_new_values( self.values.T, fill_value=fill_value ) mask = mask.any(0) + # TODO: in all tests we have mask.all(); can we rely on that? + new_values = new_values.T[mask] new_placement = new_placement[mask] - blocks = [make_block(new_values, placement=new_placement)] + blocks = [self.make_block_same_class(new_values, placement=new_placement)] return blocks, mask def quantile(self, qs, interpolation="linear", axis: int = 0): @@ -1736,14 +1733,40 @@ def _can_hold_element(self, element: Any) -> bool: return True def _slice(self, slicer): - """ return a slice of my values """ - # slice the category + """ + Return a slice of my values. + + Parameters + ---------- + slicer : slice, ndarray[int], or a tuple of these + Valid (non-reducing) indexer for self.values. + + Returns + ------- + np.ndarray or ExtensionArray + """ # return same dims as we currently have + if not isinstance(slicer, tuple) and self.ndim == 2: + # reached via getitem_block via _slice_take_blocks_ax0 + # TODO(EA2D): wont be necessary with 2D EAs + slicer = (slicer, slice(None)) if isinstance(slicer, tuple) and len(slicer) == 2: - if not com.is_null_slice(slicer[0]): - raise AssertionError("invalid slicing for a 1-ndim categorical") - slicer = slicer[1] + first = slicer[0] + if not isinstance(first, slice): + raise AssertionError( + "invalid slicing for a 1-ndim ExtensionArray", first + ) + # GH#32959 only full-slicers along fake-dim0 are valid + # TODO(EA2D): wont be necessary with 2D EAs + new_locs = self.mgr_locs[first] + if len(new_locs): + # effectively slice(None) + slicer = slicer[1] + else: + raise AssertionError( + "invalid slicing for a 1-ndim ExtensionArray", slicer + ) return self.values[slicer] @@ -1775,7 +1798,14 @@ def interpolate( ) def diff(self, n: int, axis: int = 1) -> List["Block"]: + if axis == 0 and n != 0: + # n==0 case will be a no-op so let is fall through + # Since we only have one column, the result will be all-NA. + # Create this result by shifting along axis=0 past the length of + # our values. + return super().diff(len(self.values), axis=0) if axis == 1: + # TODO(EA2D): unnecessary with 2D EAs # we are by definition 1D. 
axis = 0 return super().diff(n, axis) @@ -1846,7 +1876,7 @@ def where( return [self.make_block_same_class(result, placement=self.mgr_locs)] - def _unstack(self, unstacker, new_columns, fill_value, value_columns): + def _unstack(self, unstacker, fill_value, new_placement): # ExtensionArray-safe unstack. # We override ObjectBlock._unstack, which unstacks directly on the # values of the array. For EA-backed blocks, this would require @@ -1856,10 +1886,9 @@ def _unstack(self, unstacker, new_columns, fill_value, value_columns): n_rows = self.shape[-1] dummy_arr = np.arange(n_rows) - new_items = unstacker.get_new_columns(value_columns) - new_placement = new_columns.get_indexer(new_items) new_values, mask = unstacker.get_new_values(dummy_arr, fill_value=-1) mask = mask.any(0) + # TODO: in all tests we have mask.all(); can we rely on that? blocks = [ self.make_block_same_class( @@ -2011,12 +2040,7 @@ def array_values(self): def iget(self, key): # GH#31649 we need to wrap scalars in Timestamp/Timedelta # TODO(EA2D): this can be removed if we ever have 2D EA - result = super().iget(key) - if isinstance(result, np.datetime64): - result = Timestamp(result) - elif isinstance(result, np.timedelta64): - result = Timedelta(result) - return result + return self.array_values().reshape(self.shape)[key] def shift(self, periods, axis=0, fill_value=None): # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs @@ -2203,15 +2227,6 @@ def external_values(self): # return an object-dtype ndarray of Timestamps. return np.asarray(self.values.astype("datetime64[ns]", copy=False)) - def _slice(self, slicer): - """ return a slice of my values """ - if isinstance(slicer, tuple): - col, loc = slicer - if not com.is_null_slice(col) and col != 0: - raise IndexError(f"{self} only contains one item") - return self.values[loc] - return self.values[slicer] - def diff(self, n: int, axis: int = 0) -> List["Block"]: """ 1st discrete difference. 
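The `diff` changes in the hunks above (the frame-level transpose for ``axis=1`` together with the new all-NA branch in the extension block) are what let first differences work uniformly on extension dtypes. A small sketch of the intended user-visible behaviour; this assumes a build that includes these patches (pandas 1.1-era semantics) and illustrative data:

    import pandas as pd

    # A single nullable-integer column (stored in an ExtensionBlock).
    df = pd.DataFrame({"a": pd.array([1, 2, 4], dtype="Int64")})

    # Row-wise first difference: each value minus the one above it;
    # the first row has no predecessor and comes back missing.
    df.diff()

    # Column-wise, a lone column has nothing to its left, so every
    # entry comes back missing rather than raising.
    df.diff(axis=1)
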
@@ -2254,7 +2269,7 @@ def concat_same_type(self, to_concat): values = values.astype(object, copy=False) placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) - return self.make_block(_block_shape(values, self.ndim), placement=placement) + return self.make_block(values, placement=placement) return super().concat_same_type(to_concat) def fillna(self, value, limit=None, inplace=False, downcast=None): @@ -2420,7 +2435,6 @@ def f(mask, val, idx): # TODO: allow EA once reshape is supported values = values.reshape(shape) - values = _block_shape(values, ndim=self.ndim) return values if self.ndim == 2: @@ -2660,9 +2674,7 @@ def concat_same_type(self, to_concat): ) placement = self.mgr_locs if self.ndim == 2 else slice(len(values)) # not using self.make_block_same_class as values can be object dtype - return self.make_block( - _block_shape(values, ndim=self.ndim), placement=placement - ) + return self.make_block(values, placement=placement) def replace( self, @@ -2771,16 +2783,15 @@ def _extend_blocks(result, blocks=None): return blocks -def _block_shape(values, ndim=1, shape=None): +def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: """ guarantee the shape of the values to be at least 1 d """ if values.ndim < ndim: - if shape is None: - shape = values.shape - if not is_extension_array_dtype(values): - # TODO: https://github.com/pandas-dev/pandas/issues/23023 + shape = values.shape + if not is_extension_array_dtype(values.dtype): + # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. - values = values.reshape(tuple((1,) + shape)) + values = values.reshape(tuple((1,) + shape)) # type: ignore return values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bfb16b48d832c..e693341d10a55 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -7,14 +7,13 @@ import numpy as np -from pandas._libs import Timedelta, Timestamp, internals as libinternals, lib -from pandas._typing import ArrayLike, DtypeObj, Label +from pandas._libs import internals as libinternals, lib +from pandas._typing import ArrayLike, DtypeObj, Label, Scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( find_common_type, infer_dtype_from_scalar, - maybe_convert_objects, maybe_promote, ) from pandas.core.dtypes.common import ( @@ -33,6 +32,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject +import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index @@ -48,8 +48,6 @@ make_block, ) -from pandas.io.formats.printing import pprint_thing - # TODO: flexible with index=None and/or items=None T = TypeVar("T", bound="BlockManager") @@ -325,7 +323,7 @@ def __repr__(self) -> str: output += f"\nAxis {i}: {ax}" for block in self.blocks: - output += f"\n{pprint_thing(block)}" + output += f"\n{block}" return output def _verify_integrity(self) -> None: @@ -626,11 +624,8 @@ def comp(s, regex=False): """ if isna(s): return isna(values) - if isinstance(s, (Timedelta, Timestamp)) and getattr(s, "tz", None) is None: - return _compare_or_regex_search( - maybe_convert_objects(values), s.asm8, regex - ) + s = com.maybe_box_datetimelike(s) return _compare_or_regex_search(values, s, regex) masks = 
[comp(s, regex) for s in src_list] @@ -643,11 +638,10 @@ def comp(s, regex=False): # replace ALWAYS will return a list rb = [blk if inplace else blk.copy()] for i, (s, d) in enumerate(zip(src_list, dest_list)): - # TODO: assert/validate that `d` is always a scalar? new_rb: List[Block] = [] for b in rb: m = masks[i][b.mgr_locs.indexer] - convert = i == src_len + convert = i == src_len # only convert once at the end result = b._replace_coerce( mask=m, to_replace=s, @@ -880,6 +874,7 @@ def to_dict(self, copy: bool = True): for b in self.blocks: bd.setdefault(str(b.dtype), []).append(b) + # TODO(EA2D): the combine will be unnecessary with 2D EAs return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()} def fast_xs(self, loc: int) -> ArrayLike: @@ -984,6 +979,14 @@ def iget(self, i: int) -> "SingleBlockManager": self.axes[1], ) + def iget_values(self, i: int) -> ArrayLike: + """ + Return the data for column i as the values (ndarray or ExtensionArray). + """ + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) + return values + def idelete(self, indexer): """ Delete selected locations in-place (new block and array, same BlockManager) @@ -1310,6 +1313,10 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_value=lib.no_default): blk = self.blocks[0] if sl_type in ("slice", "mask"): + # GH#32959 EABlock would fail since we cant make 0-width + # TODO(EA2D): special casing unnecessary with 2D EAs + if sllen == 0: + return [] return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: @@ -1452,8 +1459,11 @@ def unstack(self, unstacker, fill_value) -> "BlockManager": for blk in self.blocks: blk_cols = self.items[blk.mgr_locs.indexer] + new_items = unstacker.get_new_columns(blk_cols) + new_placement = new_columns.get_indexer(new_items) + blocks, mask = blk._unstack( - unstacker, new_columns, fill_value, value_columns=blk_cols, + unstacker, fill_value, new_placement=new_placement ) new_blocks.extend(blocks) @@ -1590,33 +1600,6 @@ def fast_xs(self, loc): """ raise NotImplementedError("Use series._values[loc] instead") - def concat( - self, to_concat: List["SingleBlockManager"], new_axis: Index - ) -> "SingleBlockManager": - """ - Concatenate a list of SingleBlockManagers into a single - SingleBlockManager. - - Used for pd.concat of Series objects with axis=0. 
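
The zero-length special case in _slice_take_blocks_ax0 (GH#32959) is what makes stepped and empty column slices work over extension-dtype columns: a block that slices down to zero width is now simply dropped instead of failing. A small sketch, assuming this patch is applied, mirroring the getitem tests added further down:

    import pandas as pd

    df = pd.DataFrame({"A": pd.array([1, 2], dtype="Int64"),
                       "B": pd.array([3, 4], dtype="Int64")})
    print(df.iloc[:, ::2])   # keeps only "A"
    print(df.iloc[:, 1:1])   # zero-width selection no longer raises
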
- - Parameters - ---------- - to_concat : list of SingleBlockManagers - new_axis : Index of the result - - Returns - ------- - SingleBlockManager - """ - - blocks = [obj.blocks[0] for obj in to_concat] - values = concat_compat([x.values for x in blocks]) - - new_block = make_block(values, placement=slice(0, len(values), 1)) - - mgr = SingleBlockManager(new_block, new_axis) - return mgr - # -------------------------------------------------------------------- # Constructor Helpers @@ -1905,7 +1888,9 @@ def _merge_blocks( return blocks -def _compare_or_regex_search(a, b, regex=False): +def _compare_or_regex_search( + a: Union[ArrayLike, Scalar], b: Union[ArrayLike, Scalar], regex: bool = False +) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -1922,35 +1907,67 @@ def _compare_or_regex_search(a, b, regex=False): ------- mask : array_like of bool """ + + def _check_comparison_types( + result: Union[ArrayLike, bool], + a: Union[ArrayLike, Scalar], + b: Union[ArrayLike, Scalar], + ) -> Union[ArrayLike, bool]: + """ + Raises an error if the two arrays (a,b) cannot be compared. + Otherwise, returns the comparison result as expected. + """ + if is_scalar(result) and ( + isinstance(a, np.ndarray) or isinstance(b, np.ndarray) + ): + type_names = [type(a).__name__, type(b).__name__] + + if isinstance(a, np.ndarray): + type_names[0] = f"ndarray(dtype={a.dtype})" + + if isinstance(b, np.ndarray): + type_names[1] = f"ndarray(dtype={b.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + return result + if not regex: op = lambda x: operator.eq(x, b) else: op = np.vectorize( - lambda x: bool(re.search(b, x)) if isinstance(x, str) else False + lambda x: bool(re.search(b, x)) + if isinstance(x, str) and isinstance(b, str) + else False ) - is_a_array = isinstance(a, np.ndarray) - is_b_array = isinstance(b, np.ndarray) + # GH#32621 use mask to avoid comparing to NAs + if isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): + mask = np.reshape(~(isna(a)), a.shape) + elif isinstance(b, np.ndarray) and not isinstance(a, np.ndarray): + mask = np.reshape(~(isna(b)), b.shape) + elif isinstance(a, np.ndarray) and isinstance(b, np.ndarray): + mask = ~(isna(a) | isna(b)) + if isinstance(a, np.ndarray): + a = a[mask] + if isinstance(b, np.ndarray): + b = b[mask] if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): # GH#29553 avoid deprecation warnings from numpy - result = False - else: - result = op(a) + return _check_comparison_types(False, a, b) - if is_scalar(result) and (is_a_array or is_b_array): - type_names = [type(a).__name__, type(b).__name__] + result = op(a) - if is_a_array: - type_names[0] = f"ndarray(dtype={a.dtype})" + if isinstance(result, np.ndarray): + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool) + tmp[mask] = result + result = tmp - if is_b_array: - type_names[1] = f"ndarray(dtype={b.dtype})" - - raise TypeError( - f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" - ) - return result + return _check_comparison_types(result, a, b) def _fast_count_smallints(arr: np.ndarray) -> np.ndarray: diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index d6ba9d763366b..c14c4a311d66c 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -10,7 +10,7 @@ from pandas._libs import lib from 
pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 -from pandas._typing import ArrayLike, Level +from pandas._typing import Level from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_list_like @@ -224,7 +224,7 @@ def _get_opstr(op): }[op] -def _get_op_name(op, special): +def _get_op_name(op, special: bool) -> str: """ Find the name to attach to this method according to conventions for special and non-special methods. @@ -385,42 +385,6 @@ def _align_method_SERIES(left, right, align_asobject=False): return left, right -def _construct_result( - left: ABCSeries, result: ArrayLike, index: ABCIndexClass, name, -): - """ - Construct an appropriately-labelled Series from the result of an op. - - Parameters - ---------- - left : Series - result : ndarray or ExtensionArray - index : Index - name : object - - Returns - ------- - Series - In the case of __divmod__ or __rdivmod__, a 2-tuple of Series. - """ - if isinstance(result, tuple): - # produced by divmod or rdivmod - return ( - _construct_result(left, result[0], index=index, name=name), - _construct_result(left, result[1], index=index, name=name), - ) - - # We do not pass dtype to ensure that the Series constructor - # does inference in the case where `result` has object-dtype. - out = left._constructor(result, index=index) - out = out.__finalize__(left) - - # Set the result's name after __finalize__ is called because __finalize__ - # would set it back to self.name - out.name = name - return out - - def _arith_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid @@ -439,7 +403,7 @@ def wrapper(left, right): rvalues = extract_array(right, extract_numpy=True) result = arithmetic_op(lvalues, rvalues, op, str_rep) - return _construct_result(left, result, index=left.index, name=res_name) + return left._construct_result(result, name=res_name) wrapper.__name__ = op_name return wrapper @@ -466,7 +430,7 @@ def wrapper(self, other): res_values = comparison_op(lvalues, rvalues, op, str_rep) - return _construct_result(self, res_values, index=self.index, name=res_name) + return self._construct_result(res_values, name=res_name) wrapper.__name__ = op_name return wrapper @@ -488,7 +452,7 @@ def wrapper(self, other): rvalues = extract_array(other, extract_numpy=True) res_values = logical_op(lvalues, rvalues, op) - return _construct_result(self, res_values, index=self.index, name=res_name) + return self._construct_result(res_values, name=res_name) wrapper.__name__ = op_name return wrapper diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 7b03b4b449ea5..449a477646c02 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -29,11 +29,14 @@ def _make_flex_doc(op_name, typ): if typ == "series": base_doc = _flex_doc_SERIES + if op_desc["reverse"]: + base_doc += _see_also_reverse_SERIES.format( + reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"], + ) doc_no_examples = base_doc.format( desc=op_desc["desc"], op_name=op_name, equiv=equiv, - reverse=op_desc["reverse"], series_returns=op_desc["series_returns"], ) if op_desc["series_examples"]: @@ -375,12 +378,22 @@ def _make_flex_doc(op_name, typ): }, } +_py_num_ref = """see + `Python documentation + `_ + for more details""" _op_names = list(_op_descriptions.keys()) for key in _op_names: reverse_op = _op_descriptions[key]["reverse"] if reverse_op is not None: _op_descriptions[reverse_op] = _op_descriptions[key].copy() 
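
With the module-level _construct_result removed, the arithmetic, comparison, and logical wrappers all funnel through the same Series._construct_result method (added later in this patch in pandas/core/series.py), so name propagation behaves uniformly across op families. A quick check of the shared behavior:

    import pandas as pd

    a = pd.Series([1, 2], name="x")
    b = pd.Series([2, 2], name="x")
    print((a + b).name, (a == b).name, ((a > 0) & (b > 0)).name)  # x x x
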
_op_descriptions[reverse_op]["reverse"] = key + _op_descriptions[key][ + "see_also_desc" + ] = f"Reverse of the {_op_descriptions[key]['desc']} operator, {_py_num_ref}" + _op_descriptions[reverse_op][ + "see_also_desc" + ] = f"Element-wise {_op_descriptions[key]['desc']}, {_py_num_ref}" _flex_doc_SERIES = """ Return {desc} of series and other, element-wise (binary operator `{op_name}`). @@ -403,10 +416,12 @@ def _make_flex_doc(op_name, typ): Returns ------- {series_returns} +""" +_see_also_reverse_SERIES = """ See Also -------- -Series.{reverse} +Series.{reverse} : {see_also_desc}. """ _arith_doc_FRAME = """ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 281586879a158..1e93597d92a5d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -17,7 +17,7 @@ import pandas.core.algorithms as algos from pandas.core.base import DataError, ShallowMixin -from pandas.core.generic import _shared_docs +from pandas.core.generic import NDFrame, _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby @@ -773,7 +773,7 @@ def fillna(self, method, limit=None): """ return self._upsample(method, limit=limit) - @Appender(_shared_docs["interpolate"] % _shared_docs_kwargs) + @doc(NDFrame.interpolate, **_shared_docs_kwargs) def interpolate( self, method="linear", diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index bd90592e57485..a868e663b06a5 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -9,6 +9,7 @@ from pandas._typing import FrameOrSeriesUnion, Label +from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas import DataFrame, Index, MultiIndex, Series @@ -457,12 +458,13 @@ def get_result(self): # stack blocks if self.bm_axis == 0: name = com.consensus_name_attr(self.objs) - - mgr = self.objs[0]._mgr.concat( - [x._mgr for x in self.objs], self.new_axes[0] - ) cons = self.objs[0]._constructor - return cons(mgr, name=name).__finalize__(self, method="concat") + + arrs = [ser._values for ser in self.objs] + + res = concat_compat(arrs, axis=0) + result = cons(res, index=self.new_axes[0], name=name, dtype=res.dtype) + return result.__finalize__(self, method="concat") # combine as columns in a frame else: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d9e0206d73b95..882e3e0a649cc 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -24,7 +24,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable -from pandas.core.construction import extract_array from pandas.core.frame import DataFrame from pandas.core.indexes.api import Index, MultiIndex from pandas.core.series import Series @@ -143,7 +142,7 @@ def sorted_labels(self): indexer, to_sort = self._indexer_and_to_sort return [l.take(indexer) for l in to_sort] - def _make_sorted_values(self, values): + def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: indexer, _ = self._indexer_and_to_sort sorted_values = algos.take_nd(values, indexer, axis=0) @@ -206,6 +205,9 @@ def get_new_values(self, values, fill_value=None): # we can simply reshape if we don't have a mask if mask_all and len(values): + # TODO: Under what circumstances can we rely on sorted_values + # matching values? 
When that holds, we can slice instead + # of take (in particular for EAs) new_values = ( sorted_values.reshape(length, width, stride) .swapaxes(1, 2) @@ -413,7 +415,7 @@ def unstack(obj, level, fill_value=None): level = obj.index._get_level_number(level) if isinstance(obj, DataFrame): - if isinstance(obj.index, MultiIndex): + if isinstance(obj.index, MultiIndex) or not obj._can_fast_transpose: return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) @@ -429,14 +431,14 @@ def unstack(obj, level, fill_value=None): def _unstack_frame(obj, level, fill_value=None): - if obj._is_mixed_type: + if not obj._can_fast_transpose: unstacker = _Unstacker(obj.index, level=level) - blocks = obj._mgr.unstack(unstacker, fill_value=fill_value) - return obj._constructor(blocks) + mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) + return obj._constructor(mgr) else: return _Unstacker( obj.index, level=level, constructor=obj._constructor, - ).get_result(obj.values, value_columns=obj.columns, fill_value=fill_value) + ).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value) def _unstack_extension_series(series, level, fill_value): @@ -462,31 +464,10 @@ def _unstack_extension_series(series, level, fill_value): Each column of the DataFrame will have the same dtype as the input Series. """ - # Implementation note: the basic idea is to - # 1. Do a regular unstack on a dummy array of integers - # 2. Followup with a columnwise take. - # We use the dummy take to discover newly-created missing values - # introduced by the reshape. - from pandas.core.reshape.concat import concat - - dummy_arr = np.arange(len(series)) - # fill_value=-1, since we will do a series.values.take later - result = _Unstacker(series.index, level=level).get_result( - dummy_arr, value_columns=None, fill_value=-1 - ) - - out = [] - values = extract_array(series, extract_numpy=False) - - for col, indices in result.items(): - out.append( - Series( - values.take(indices.values, allow_fill=True, fill_value=fill_value), - name=col, - index=result.index, - ) - ) - return concat(out, axis="columns", copy=False, keys=result.columns) + # Defer to the logic in ExtensionBlock._unstack + df = series.to_frame() + result = df.unstack(level=level, fill_value=fill_value) + return result.droplevel(level=0, axis=1) def stack(frame, level=-1, dropna=True): diff --git a/pandas/core/series.py b/pandas/core/series.py index 66caa4623f9ad..3f5927828e541 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -14,6 +14,7 @@ Optional, Tuple, Type, + Union, ) import warnings @@ -22,7 +23,7 @@ from pandas._config import get_option from pandas._libs import lib, properties, reshape, tslibs -from pandas._typing import Axis, DtypeObj, Label +from pandas._typing import ArrayLike, Axis, DtypeObj, Label from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, doc from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -880,32 +881,35 @@ def __getitem__(self, key): if isinstance(key, (list, tuple)): key = unpack_1tuple(key) - if key_is_scalar or isinstance(self.index, MultiIndex): + if is_integer(key) and self.index._should_fallback_to_positional(): + return self._values[key] + + elif key_is_scalar: + return self._get_value(key) + + if ( + isinstance(key, tuple) + and is_hashable(key) + and isinstance(self.index, MultiIndex) + ): # Otherwise index.get_value will raise InvalidIndexError try: - result = self.index.get_value(self, key) + 
result = self._get_value(key) return result - except InvalidIndexError: - if not isinstance(self.index, MultiIndex): - raise - except (KeyError, ValueError): - if isinstance(key, tuple) and isinstance(self.index, MultiIndex): - # kludge - pass - else: - raise + except KeyError: + # We still have the corner case where this tuple is a key + # in the first level of our MultiIndex + return self._get_values_tuple(key) - if not key_is_scalar: - # avoid expensive checks if we know we have a scalar - if is_iterator(key): - key = list(key) + if is_iterator(key): + key = list(key) - if com.is_bool_indexer(key): - key = check_bool_indexer(self.index, key) - key = np.asarray(key, dtype=bool) - return self._get_values(key) + if com.is_bool_indexer(key): + key = check_bool_indexer(self.index, key) + key = np.asarray(key, dtype=bool) + return self._get_values(key) return self._get_with(key) @@ -2608,12 +2612,10 @@ def _binop(self, other, func, level=None, fill_value=None): if not isinstance(other, Series): raise AssertionError("Other operand must be Series") - new_index = self.index this = self if not self.index.equals(other.index): this, other = self.align(other, level=level, join="outer", copy=False) - new_index = this.index this_vals, other_vals = ops.fill_binop(this.values, other.values, fill_value) @@ -2621,9 +2623,46 @@ def _binop(self, other, func, level=None, fill_value=None): result = func(this_vals, other_vals) name = ops.get_op_result_name(self, other) - ret = ops._construct_result(self, result, new_index, name) + ret = this._construct_result(result, name) return ret + def _construct_result( + self, result: Union[ArrayLike, Tuple[ArrayLike, ArrayLike]], name: Label + ) -> Union["Series", Tuple["Series", "Series"]]: + """ + Construct an appropriately-labelled Series from the result of an op. + + Parameters + ---------- + result : ndarray or ExtensionArray + name : Label + + Returns + ------- + Series + In the case of __divmod__ or __rdivmod__, a 2-tuple of Series. + """ + if isinstance(result, tuple): + # produced by divmod or rdivmod + + res1 = self._construct_result(result[0], name=name) + res2 = self._construct_result(result[1], name=name) + + # GH#33427 assertions to keep mypy happy + assert isinstance(res1, Series) + assert isinstance(res2, Series) + return (res1, res2) + + # We do not pass dtype to ensure that the Series constructor + # does inference in the case where `result` has object-dtype. + out = self._constructor(result, index=self.index) + out = out.__finalize__(self) + + # Set the result's name after __finalize__ is called because __finalize__ + # would set it back to self.name + out.name = name + return out + def combine(self, other, func, fill_value=None) -> "Series": """ Combine the Series with a Series or scalar according to `func`. 
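
The tuple branch of _construct_result above exists for divmod and rdivmod, which produce two arrays that must each be wrapped and re-labelled after __finalize__ runs. For example:

    import pandas as pd

    s = pd.Series([7, 8, 9], name="x")
    q, r = divmod(s, 3)
    print(q.tolist(), r.tolist())  # [2, 2, 3] [1, 2, 0]
    print(q.name, r.name)          # x x
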
@@ -3912,7 +3951,7 @@ def _needs_reindex_multi(self, axes, method, level): """ return False - @Appender(generic._shared_docs["align"] % _shared_doc_kwargs) + @doc(NDFrame.align, **_shared_doc_kwargs) def align( self, other, @@ -4173,7 +4212,7 @@ def fillna( downcast=downcast, ) - @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) + @doc(NDFrame.replace, **_shared_doc_kwargs) def replace( self, to_replace=None, @@ -4192,7 +4231,7 @@ def replace( method=method, ) - @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) + @doc(NDFrame.shift, **_shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 52d9a81489db4..76b851d8ac923 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2297,7 +2297,7 @@ def _get_series_list(self, others): if isinstance(others, ABCSeries): return [others] elif isinstance(others, ABCIndexClass): - return [Series(others._values, index=others)] + return [Series(others._values, index=idx)] elif isinstance(others, ABCDataFrame): return [others[x] for x in others] elif isinstance(others, np.ndarray) and others.ndim == 2: diff --git a/pandas/io/common.py b/pandas/io/common.py index 0fce8f5382686..ff527de79c387 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -351,8 +351,9 @@ def get_handle( 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise - no compression). If dict and compression mode is 'zip' or inferred as - 'zip', other entries passed as additional compression options. + no compression). If dict and compression mode is one of + {'zip', 'gzip', 'bz2'}, or inferred as one of the above, + other entries passed as additional compression options. .. versionchanged:: 1.0.0 @@ -360,6 +361,11 @@ def get_handle( and other keys as compression options if compression mode is 'zip'. + .. versionchanged:: 1.1.0 + + Passing compression options as keys in dict is now + supported for compression modes 'gzip' and 'bz2' as well as 'zip'. + memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True @@ -394,19 +400,28 @@ def get_handle( if compression: + # GH33398 the type ignores here seem related to mypy issue #5382; + # it may be possible to remove them once that is resolved. 
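
The get_handle docstring change above is the user-facing part of GH33398: from 1.1.0 a dict compression argument can carry extra options for gzip and bz2, not just zip. For instance, writing a gzip file with an explicit compression level:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    df.to_csv("out.csv.gz",
              compression={"method": "gzip", "compresslevel": 1})
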
+ # GZ Compression if compression == "gzip": if is_path: - f = gzip.open(path_or_buf, mode) + f = gzip.open( + path_or_buf, mode, **compression_args # type: ignore + ) else: - f = gzip.GzipFile(fileobj=path_or_buf) + f = gzip.GzipFile( + fileobj=path_or_buf, **compression_args # type: ignore + ) # BZ Compression elif compression == "bz2": if is_path: - f = bz2.BZ2File(path_or_buf, mode) + f = bz2.BZ2File( + path_or_buf, mode, **compression_args # type: ignore + ) else: - f = bz2.BZ2File(path_or_buf) + f = bz2.BZ2File(path_or_buf, **compression_args) # type: ignore # ZIP Compression elif compression == "zip": diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 5d4925620e75f..cd7045e7f2d2e 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -7,15 +7,18 @@ from pandas.io.common import stringify_path -def to_feather(df: DataFrame, path): +def to_feather(df: DataFrame, path, **kwargs): """ - Write a DataFrame to the feather-format + Write a DataFrame to the binary Feather format. Parameters ---------- df : DataFrame path : string file path, or file-like object + **kwargs : + Additional keywords passed to `pyarrow.feather.write_feather`. + .. versionadded:: 1.1.0 """ import_optional_dependency("pyarrow") from pyarrow import feather @@ -58,7 +61,7 @@ def to_feather(df: DataFrame, path): if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_feather(df, path) + feather.write_feather(df, path, **kwargs) def read_feather(path, columns=None, use_threads: bool = True): diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 6e68c1cf5e27e..69e9b111a6c20 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -231,7 +231,7 @@ def _pull_field( js: Dict[str, Any], spec: Union[List, str] ) -> Union[Scalar, Iterable]: """Internal function to pull field""" - result = js # type: ignore + result = js if isinstance(spec, list): for field in spec: result = result[field] @@ -251,7 +251,7 @@ def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: # null, otherwise return an empty list if not isinstance(result, Iterable): if pd.isnull(result): - result = [] # type: ignore + result = [] else: raise TypeError( f"{js} has non iterable value {result} for path {spec}. " diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a22251b29da54..3dd87ae6ed758 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1916,9 +1916,7 @@ def is_indexed(self) -> bool: if not hasattr(self.table, "cols"): # e.g. if infer hasn't been called yet, self.table will be None. 
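
The **kwargs pass-through added to to_feather forwards options directly to pyarrow.feather.write_feather. A sketch of one use, assuming a pyarrow version whose write_feather accepts a compression option:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    df.to_feather("out.feather", compression="zstd")  # forwarded to pyarrow
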
return False - # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute - # 'error: "None" has no attribute "cols"' - return getattr(self.table.cols, self.cname).is_indexed # type: ignore + return getattr(self.table.cols, self.cname).is_indexed def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): """ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 8f3aa60b7a9cc..b9b43685415d1 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -356,10 +356,9 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): time_delta = dates - stata_epoch d["delta"] = time_delta._values.astype(np.int64) // 1000 # microseconds if days or year: - # ignore since mypy reports that DatetimeIndex has no year/month date_index = DatetimeIndex(dates) - d["year"] = date_index.year # type: ignore - d["month"] = date_index.month # type: ignore + d["year"] = date_index.year + d["month"] = date_index.month if days: days_in_ns = dates.astype(np.int64) - to_datetime( d["year"], format="%Y" diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index bc8346fd48433..46941e437a4ce 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -902,7 +902,11 @@ def _plot_colorbar(self, ax, **kwds): # For a more detailed description of the issue # see the following link: # https://github.com/ipython/ipython/issues/11215 - img = ax.collections[0] + + # GH33389, if ax is used multiple times, we should always + # use the last one which contains the latest information + # about the ax + img = ax.collections[-1] cbar = self.fig.colorbar(img, ax=ax, **kwds) if _mpl_ge_3_0_0(): diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index d54fc73b495ba..3a0cdc90dfd5c 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -28,10 +28,7 @@ def _args_adjust(self): values = values[~isna(values)] _, self.bins = np.histogram( - values, - bins=self.bins, - range=self.kwds.get("range", None), - weights=self.kwds.get("weights", None), + values, bins=self.bins, range=self.kwds.get("range", None) ) if is_list_like(self.bottom): @@ -77,6 +74,14 @@ def _make_plot(self): kwds["style"] = style kwds = self._make_plot_keywords(kwds, y) + + # We allow weights to be a multi-dimensional array, e.g. a (10, 2) array, + # and each sub-array (10,) will be called in each iteration. 
If users only + # provide 1D array, we assume the same weights is used for all iterations + weights = kwds.get("weights", None) + if weights is not None and np.ndim(weights) != 1: + kwds["weights"] = weights[:, i] + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) self._add_legend_handle(artists[0], label, index=i) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 9a6ae76658949..56c5647d865d3 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -734,7 +734,7 @@ def test_dti_cmp_object_dtype(self): result = dti == other expected = np.array([True] * 5 + [False] * 5) tm.assert_numpy_array_equal(result, expected) - msg = "Cannot compare type" + msg = ">=' not supported between instances of 'Timestamp' and 'Timedelta'" with pytest.raises(TypeError, match=msg): dti >= other diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index ce50266c756a8..a5c18a25f8e16 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -19,6 +19,9 @@ def data(): ([False, pd.NA], False, False, pd.NA, False), ([pd.NA], False, True, pd.NA, pd.NA), ([], False, True, False, True), + # GH-33253: all True / all False values buggy with skipna=False + ([True, True], True, True, True, True), + ([False, False], False, False, False, False), ], ) def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): @@ -49,7 +52,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): if op == "sum": assert isinstance(getattr(s, op)(), np.int_) elif op == "prod": - assert isinstance(getattr(s, op)(), np.int64) + assert isinstance(getattr(s, op)(), np.int_) elif op in ("min", "max"): assert isinstance(getattr(s, op)(), np.bool_) else: diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index c470f677b5386..67b491165b8cc 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -88,6 +88,14 @@ def test_min_max_with_nan(self, skipna): assert _min == 2 assert _max == 1 + @pytest.mark.parametrize("function", ["min", "max"]) + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_only_nan(self, function, skipna): + # https://github.com/pandas-dev/pandas/issues/33450 + cat = Categorical([np.nan], categories=[1, 2], ordered=True) + result = getattr(cat, function)(skipna=skipna) + assert result is np.nan + @pytest.mark.parametrize("method", ["min", "max"]) def test_deprecate_numeric_only_min_max(self, method): # GH 25303 diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 4a62a35e23d93..43936d8b95bd6 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -70,7 +70,7 @@ def test_integer_array_constructor(): expected = integer_array([1, 2, 3, np.nan], dtype="int64") tm.assert_extension_array_equal(result, expected) - msg = r".* should be .* numpy array. Use the 'integer_array' function instead" + msg = r".* should be .* numpy array. 
Use the 'pd.array' function instead" with pytest.raises(TypeError, match=msg): IntegerArray(values.tolist(), mask) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index 515013e95c717..a02501e2dcbf2 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -34,7 +34,7 @@ def test_preserve_dtypes(op): # op result = getattr(df.C, op)() - if op in {"sum", "min", "max"}: + if op in {"sum", "prod", "min", "max"}: assert isinstance(result, np.int64) else: assert isinstance(result, int) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 7e7762d8973a0..fef11f0ff3bb2 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -104,6 +104,13 @@ class TestSetitem: def test_set_na(self, left_right_dtypes): left, right = left_right_dtypes result = IntervalArray.from_arrays(left, right) + + if result.dtype.subtype.kind in ["i", "u"]: + msg = "Cannot set float NaN to integer-backed IntervalArray" + with pytest.raises(ValueError, match=msg): + result[0] = np.NaN + return + result[0] = np.nan expected_left = Index([left._na_value] + list(left[1:])) @@ -182,7 +189,7 @@ def test_arrow_array_missing(): import pyarrow as pa from pandas.core.arrays._arrow_utils import ArrowIntervalType - arr = IntervalArray.from_breaks([0, 1, 2, 3]) + arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) arr[1] = None result = pa.array(arr) @@ -209,8 +216,8 @@ def test_arrow_array_missing(): @pyarrow_skip @pytest.mark.parametrize( "breaks", - [[0, 1, 2, 3], pd.date_range("2017", periods=4, freq="D")], - ids=["int", "datetime64[ns]"], + [[0.0, 1.0, 2.0, 3.0], pd.date_range("2017", periods=4, freq="D")], + ids=["float", "datetime64[ns]"], ) def test_arrow_table_roundtrip(breaks): import pyarrow as pa diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index fe35344f46688..5b703cfe8fae5 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -10,7 +10,7 @@ import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexes.period import Period, PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex @@ -222,6 +222,11 @@ def test_getitem_2d(self, arr1d): result = arr2d[:3, 0] tm.assert_equal(result, expected) + # Scalar lookup + result = arr2d[-1, 0] + expected = arr1d[-1] + assert result == expected + def test_setitem(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = self.array_cls(data, freq="D") @@ -897,3 +902,13 @@ def test_searchsorted_datetimelike_with_listlike_invalid_dtype(values, arg): msg = "[Unexpected type|Cannot compare]" with pytest.raises(TypeError, match=msg): values.searchsorted(arg) + + +@pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) +def test_period_index_construction_from_strings(klass): + # https://github.com/pandas-dev/pandas/issues/26109 + strings = ["2020Q1", "2020Q2"] * 2 + data = klass(strings) + result = PeriodIndex(data, freq="Q") + expected = PeriodIndex([Period(s) for s in strings]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 0b95d3aa19366..d3ced2f1b1f07 100644 --- 
a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -37,6 +37,7 @@ def test_registered(): ([pd.Period("2017", "D"), None], None, [17167, iNaT]), (pd.Series(pd.date_range("2017", periods=3)), None, [17167, 17168, 17169]), (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]), + (pd.period_range("2017", periods=4, freq="Q"), None, [188, 189, 190, 191]), ], ) def test_period_array_ok(data, freq, expected): diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 3b1e896857117..f33f960e8e341 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -10,10 +10,22 @@ class BaseCastingTests(BaseExtensionTests): """Casting to and from ExtensionDtypes""" def test_astype_object_series(self, all_data): - ser = pd.Series({"A": all_data}) + ser = pd.Series(all_data, name="A") result = ser.astype(object) assert isinstance(result._mgr.blocks[0], ObjectBlock) + def test_astype_object_frame(self, all_data): + df = pd.DataFrame({"A": all_data}) + + result = df.astype(object) + blk = result._data.blocks[0] + assert isinstance(blk, ObjectBlock), type(blk) + + # FIXME: these currently fail; dont leave commented-out + # check that we can compare the dtypes + # cmp = result.dtypes.equals(df.dtypes) + # assert not cmp.any() + def test_tolist(self, data): result = pd.Series(data).tolist() expected = list(data) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 71c7198e32a8b..dc94bffd320b1 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -40,6 +40,34 @@ def test_iloc_frame(self, data): result = df.iloc[:4, 0] self.assert_series_equal(result, expected) + # GH#32959 slice columns with step + result = df.iloc[:, ::2] + self.assert_frame_equal(result, df[["A"]]) + result = df[["B", "A"]].iloc[:, ::2] + self.assert_frame_equal(result, df[["B"]]) + + def test_iloc_frame_single_block(self, data): + # GH#32959 null slice along index, slice along columns with single-block + df = pd.DataFrame({"A": data}) + + result = df.iloc[:, :] + self.assert_frame_equal(result, df) + + result = df.iloc[:, :1] + self.assert_frame_equal(result, df) + + result = df.iloc[:, :2] + self.assert_frame_equal(result, df) + + result = df.iloc[:, ::2] + self.assert_frame_equal(result, df) + + result = df.iloc[:, 1:2] + self.assert_frame_equal(result, df.iloc[:, :0]) + + result = df.iloc[:, -1:] + self.assert_frame_equal(result, df) + def test_loc_series(self, data): ser = pd.Series(data) result = ser.loc[:3] diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index d2c3877de99ed..c9445ceec2c77 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -295,6 +295,14 @@ def test_unstack(self, data, index, obj): assert all( isinstance(result[col].array, type(data)) for col in result.columns ) + + if obj == "series": + # We should get the same result with to_frame+unstack+droplevel + df = ser.to_frame() + + alt = df.unstack(level=level).droplevel(0, axis=1) + self.assert_frame_equal(result, alt) + expected = ser.astype(object).unstack(level=level) result = result.astype(object) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index f55ec75b47dfa..725533765ca2c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -238,9 +238,10 @@ def check_reduce(self, s, 
op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 result = getattr(s, op_name)(skipna=skipna) - expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - if np.isnan(expected): + if not skipna and s.isna().any(): expected = pd.NA + else: + expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 61c5925383f88..aa5a99282131a 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -170,8 +170,11 @@ def test_take_series(self, data): # ValueError: PandasArray must be 1-dimensional. super().test_take_series(data) - @pytest.mark.xfail(reason="astype doesn't recognize data.dtype") def test_loc_iloc_frame_single_dtype(self, data): + npdtype = data.dtype.numpy_dtype + if npdtype == object or npdtype == np.float64: + # GH#33125 + pytest.xfail(reason="GH#33125 astype doesn't recognize data.dtype") super().test_loc_iloc_frame_single_dtype(data) @@ -179,6 +182,8 @@ class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): @skip_nested def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): # ValueError: Names should be list-like for a MultiIndex + if data_for_grouping.dtype.numpy_dtype == np.float64: + pytest.xfail(reason="GH#33125 astype doesn't recognize data.dtype") super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) @@ -276,7 +281,11 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): class TestPrinting(BaseNumPyTests, base.BasePrintingTests): - pass + @pytest.mark.xfail( + reason="GH#33125 PandasArray.astype does not recognize PandasDtype" + ) + def test_series_repr(self, data): + super().test_series_repr(data) @skip_nested @@ -321,6 +330,18 @@ class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): def test_concat_mixed_dtypes(self, data): super().test_concat_mixed_dtypes(data) + @pytest.mark.xfail( + reason="GH#33125 PandasArray.astype does not recognize PandasDtype" + ) + def test_concat(self, data, in_frame): + super().test_concat(data, in_frame) + + @pytest.mark.xfail( + reason="GH#33125 PandasArray.astype does not recognize PandasDtype" + ) + def test_concat_all_na_block(self, data_missing, in_frame): + super().test_concat_all_na_block(data_missing, in_frame) + @skip_nested def test_merge(self, data, na_value): # Fails creating expected diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 198a228b621b4..694bbee59606f 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -3,6 +3,8 @@ from pandas.errors import PerformanceWarning +from pandas.core.dtypes.common import is_object_dtype + import pandas as pd from pandas import SparseDtype import pandas._testing as tm @@ -309,7 +311,25 @@ def test_searchsorted(self, data_for_sorting, as_series): class TestCasting(BaseSparseTests, base.BaseCastingTests): - pass + def test_astype_object_series(self, all_data): + # Unlike the base class, we do not expect the resulting Block + # to be ObjectBlock + ser = pd.Series(all_data, name="A") + result = ser.astype(object) + assert is_object_dtype(result._data.blocks[0].dtype) + + def test_astype_object_frame(self, all_data): + # Unlike the base class, we do not expect the resulting Block + # to be ObjectBlock + df = pd.DataFrame({"A": all_data}) + + result = df.astype(object) + assert 
is_object_dtype(result._data.blocks[0].dtype) + + # FIXME: these currently fail; dont leave commented-out + # check that we can compare the dtypes + # comp = result.dtypes.equals(df.dtypes) + # assert not comp.any() class TestArithmeticOps(BaseSparseTests, base.BaseArithmeticOpsTests): diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index ffdb6d41ebda5..6a9248e1cba1e 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -64,18 +64,15 @@ def test_diff_datetime_axis1(self, tz): 1: date_range("2010", freq="D", periods=2, tz=tz), } ) - if tz is None: - result = df.diff(axis=1) - expected = DataFrame( - { - 0: pd.TimedeltaIndex(["NaT", "NaT"]), - 1: pd.TimedeltaIndex(["0 days", "0 days"]), - } - ) - tm.assert_frame_equal(result, expected) - else: - with pytest.raises(NotImplementedError): - result = df.diff(axis=1) + + result = df.diff(axis=1) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "NaT"]), + 1: pd.TimedeltaIndex(["0 days", "0 days"]), + } + ) + tm.assert_frame_equal(result, expected) def test_diff_timedelta(self): # GH#4533 @@ -118,3 +115,46 @@ def test_diff_axis(self): tm.assert_frame_equal( df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]) ) + + @pytest.mark.xfail( + reason="GH#32995 needs to operate column-wise or do inference", + raises=AssertionError, + ) + def test_diff_period(self): + # GH#32995 Don't pass an incorrect axis + # TODO(EA2D): this bug wouldn't have happened with 2D EA + pi = pd.date_range("2016-01-01", periods=3).to_period("D") + df = pd.DataFrame({"A": pi}) + + result = df.diff(1, axis=1) + + # TODO: should we make Block.diff do type inference? or maybe algos.diff? + expected = (df - pd.NaT).astype(object) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = pd.DataFrame({"A": [np.nan, np.nan, np.nan], "B": df["B"] / 2}) + + result = df.diff(axis=1) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes_large_periods(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = df * np.nan + + result = df.diff(axis=1, periods=3) + tm.assert_frame_equal(result, expected) + + def test_diff_axis1_mixed_dtypes_negative_periods(self): + # GH#32995 operate column-wise when we have mixed dtypes and axis=1 + df = pd.DataFrame({"A": range(3), "B": 2 * np.arange(3, dtype=np.float64)}) + + expected = pd.DataFrame({"A": -1.0 * df["A"], "B": df["B"] * np.nan}) + + result = df.diff(axis=1, periods=-1) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index e1fc7e9d7c5b8..0255759513e28 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1147,59 +1147,6 @@ def test_any_all_level_axis_none_raises(self, method): # --------------------------------------------------------------------- # Matrix-like - def test_dot(self): - a = DataFrame( - np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] - ) - b = DataFrame( - np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"] - ) - - result = a.dot(b) - expected = DataFrame( - np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", 
"two"] - ) - # Check alignment - b1 = b.reindex(index=reversed(b.index)) - result = a.dot(b) - tm.assert_frame_equal(result, expected) - - # Check series argument - result = a.dot(b["one"]) - tm.assert_series_equal(result, expected["one"], check_names=False) - assert result.name is None - - result = a.dot(b1["one"]) - tm.assert_series_equal(result, expected["one"], check_names=False) - assert result.name is None - - # can pass correct-length arrays - row = a.iloc[0].values - - result = a.dot(row) - expected = a.dot(a.iloc[0]) - tm.assert_series_equal(result, expected) - - with pytest.raises(ValueError, match="Dot product shape mismatch"): - a.dot(row[:-1]) - - a = np.random.rand(1, 5) - b = np.random.rand(5, 1) - A = DataFrame(a) - - # TODO(wesm): unused - B = DataFrame(b) # noqa - - # it works - result = A.dot(b) - - # unaligned - df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4)) - df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3]) - - with pytest.raises(ValueError, match="aligned"): - df.dot(df2) - def test_matmul(self): # matmul test is for GH 10259 a = DataFrame( diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index bf9eeb532b43b..1a07780462ea3 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1165,6 +1165,11 @@ def test_lots_of_operators_string(self, df): expect = df[df[" &^ :!€$?(} > <++*'' "] > 4] tm.assert_frame_equal(res, expect) + def test_missing_attribute(self, df): + message = "module 'pandas' has no attribute 'thing'" + with pytest.raises(AttributeError, match=message): + df.eval("@pd.thing") + def test_failing_quote(self, df): with pytest.raises(SyntaxError): df.query("`it's` > `that's`") diff --git a/pandas/tests/generic/methods/test_dot.py b/pandas/tests/generic/methods/test_dot.py new file mode 100644 index 0000000000000..ecbec6b06e923 --- /dev/null +++ b/pandas/tests/generic/methods/test_dot.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class DotSharedTests: + @pytest.fixture + def obj(self): + raise NotImplementedError + + @pytest.fixture + def other(self) -> DataFrame: + """ + other is a DataFrame that is indexed so that obj.dot(other) is valid + """ + raise NotImplementedError + + @pytest.fixture + def expected(self, obj, other) -> DataFrame: + """ + The expected result of obj.dot(other) + """ + raise NotImplementedError + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + raise NotImplementedError + + def test_dot_equiv_values_dot(self, obj, other, expected): + # `expected` is constructed from obj.values.dot(other.values) + result = obj.dot(other) + tm.assert_equal(result, expected) + + def test_dot_2d_ndarray(self, obj, other, expected): + # Check ndarray argument; in this case we get matching values, + # but index/columns may not match + result = obj.dot(other.values) + assert np.all(result == expected.values) + + def test_dot_1d_ndarray(self, obj, expected): + # can pass correct-length array + row = obj.iloc[0] if obj.ndim == 2 else obj + + result = obj.dot(row.values) + expected = obj.dot(row) + self.reduced_dim_assert(result, expected) + + def test_dot_series(self, obj, other, expected): + # Check series argument + result = obj.dot(other["1"]) + self.reduced_dim_assert(result, expected["1"]) + + def test_dot_series_alignment(self, obj, other, expected): + 
result = obj.dot(other.iloc[::-1]["1"]) + self.reduced_dim_assert(result, expected["1"]) + + def test_dot_aligns(self, obj, other, expected): + # Check index alignment + other2 = other.iloc[::-1] + result = obj.dot(other2) + tm.assert_equal(result, expected) + + def test_dot_shape_mismatch(self, obj): + msg = "Dot product shape mismatch" + # exception raised is of type Exception + with pytest.raises(Exception, match=msg): + obj.dot(obj.values[:3]) + + def test_dot_misaligned(self, obj, other): + msg = "matrices are not aligned" + with pytest.raises(ValueError, match=msg): + obj.dot(other.T) + + +class TestSeriesDot(DotSharedTests): + @pytest.fixture + def obj(self): + return Series(np.random.randn(4), index=["p", "q", "r", "s"]) + + @pytest.fixture + def other(self): + return DataFrame( + np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] + ).T + + @pytest.fixture + def expected(self, obj, other): + return Series(np.dot(obj.values, other.values), index=other.columns) + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + tm.assert_almost_equal(result, expected) + + +class TestDataFrameDot(DotSharedTests): + @pytest.fixture + def obj(self): + return DataFrame( + np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] + ) + + @pytest.fixture + def other(self): + return DataFrame( + np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["1", "2"] + ) + + @pytest.fixture + def expected(self, obj, other): + return DataFrame( + np.dot(obj.values, other.values), index=obj.index, columns=other.columns + ) + + @classmethod + def reduced_dim_assert(cls, result, expected): + """ + Assertion about results with 1 fewer dimension that self.obj + """ + tm.assert_series_equal(result, expected, check_names=False) + assert result.name is None diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py new file mode 100644 index 0000000000000..d307eef8beb62 --- /dev/null +++ b/pandas/tests/generic/test_finalize.py @@ -0,0 +1,782 @@ +""" +An exhaustive list of pandas methods exercising NDFrame.__finalize__. +""" +import operator +import re + +import numpy as np +import pytest + +import pandas as pd + +# TODO: +# * Binary methods (mul, div, etc.) +# * Binary outputs (align, etc.) +# * top-level methods (concat, merge, get_dummies, etc.) +# * window +# * cumulative reductions + +not_implemented_mark = pytest.mark.xfail(reason="not implemented") + +mi = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=["A", "B"]) + +frame_data = ({"A": [1]},) +frame_mi_data = ({"A": [1, 2, 3, 4]}, mi) + + +# Tuple of +# - Callable: Constructor (Series, DataFrame) +# - Tuple: Constructor args +# - Callable: pass the constructed value with attrs set to this. 
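
The fixtures above pin down the shared contract of Series.dot and DataFrame.dot: an index-aligned product that matches np.dot on the underlying values and drops a dimension for Series or 1-D arguments. In plain user code:

    import numpy as np
    import pandas as pd

    a = pd.DataFrame(np.arange(12).reshape(3, 4),
                     index=["a", "b", "c"], columns=["p", "q", "r", "s"])
    b = pd.DataFrame(np.ones((4, 2)),
                     index=["p", "q", "r", "s"], columns=["1", "2"])
    assert np.array_equal(a.dot(b).values, np.dot(a.values, b.values))
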
+ +_all_methods = [ + ( + pd.Series, + (np.array([0], dtype="float64")), + operator.methodcaller("view", "int64"), + ), + (pd.Series, ([0],), operator.methodcaller("take", [])), + (pd.Series, ([0],), operator.methodcaller("__getitem__", [True])), + (pd.Series, ([0],), operator.methodcaller("repeat", 2)), + pytest.param( + (pd.Series, ([0],), operator.methodcaller("reset_index")), + marks=pytest.mark.xfail, + ), + (pd.Series, ([0],), operator.methodcaller("reset_index", drop=True)), + pytest.param( + (pd.Series, ([0],), operator.methodcaller("to_frame")), marks=pytest.mark.xfail + ), + (pd.Series, (0, mi), operator.methodcaller("count", level="A")), + (pd.Series, ([0, 0],), operator.methodcaller("drop_duplicates")), + (pd.Series, ([0, 0],), operator.methodcaller("duplicated")), + (pd.Series, ([0, 0],), operator.methodcaller("round")), + (pd.Series, ([0, 0],), operator.methodcaller("rename", lambda x: x + 1)), + (pd.Series, ([0, 0],), operator.methodcaller("rename", "name")), + (pd.Series, ([0, 0],), operator.methodcaller("set_axis", ["a", "b"])), + (pd.Series, ([0, 0],), operator.methodcaller("reindex", [1, 0])), + (pd.Series, ([0, 0],), operator.methodcaller("drop", [0])), + (pd.Series, (pd.array([0, pd.NA]),), operator.methodcaller("fillna", 0)), + (pd.Series, ([0, 0],), operator.methodcaller("replace", {0: 1})), + (pd.Series, ([0, 0],), operator.methodcaller("shift")), + (pd.Series, ([0, 0],), operator.methodcaller("isin", [0, 1])), + (pd.Series, ([0, 0],), operator.methodcaller("between", 0, 2)), + (pd.Series, ([0, 0],), operator.methodcaller("isna")), + (pd.Series, ([0, 0],), operator.methodcaller("isnull")), + (pd.Series, ([0, 0],), operator.methodcaller("notna")), + (pd.Series, ([0, 0],), operator.methodcaller("notnull")), + (pd.Series, ([1],), operator.methodcaller("add", pd.Series([1]))), + # TODO: mul, div, etc. 
+ ( + pd.Series, + ([0], pd.period_range("2000", periods=1)), + operator.methodcaller("to_timestamp"), + ), + ( + pd.Series, + ([0], pd.date_range("2000", periods=1)), + operator.methodcaller("to_period"), + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("dot", pd.DataFrame(index=["A"])), + ), + marks=pytest.mark.xfail(reason="Implement binary finalize"), + ), + (pd.DataFrame, frame_data, operator.methodcaller("transpose")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", "A")), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", ["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", np.array([True]))), + (pd.DataFrame, ({("A", "a"): [1]},), operator.methodcaller("__getitem__", ["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("query", "A == 1")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("eval", "A + 1")), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("select_dtypes", include="int")), + (pd.DataFrame, frame_data, operator.methodcaller("assign", b=1)), + (pd.DataFrame, frame_data, operator.methodcaller("set_axis", ["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("reindex", [0, 1])), + (pd.DataFrame, frame_data, operator.methodcaller("drop", columns=["A"])), + (pd.DataFrame, frame_data, operator.methodcaller("drop", index=[0])), + (pd.DataFrame, frame_data, operator.methodcaller("rename", columns={"A": "a"})), + (pd.DataFrame, frame_data, operator.methodcaller("rename", index=lambda x: x)), + (pd.DataFrame, frame_data, operator.methodcaller("fillna", "A")), + (pd.DataFrame, frame_data, operator.methodcaller("fillna", method="ffill")), + (pd.DataFrame, frame_data, operator.methodcaller("set_index", "A")), + (pd.DataFrame, frame_data, operator.methodcaller("reset_index")), + (pd.DataFrame, frame_data, operator.methodcaller("isna")), + (pd.DataFrame, frame_data, operator.methodcaller("isnull")), + (pd.DataFrame, frame_data, operator.methodcaller("notna")), + (pd.DataFrame, frame_data, operator.methodcaller("notnull")), + (pd.DataFrame, frame_data, operator.methodcaller("dropna")), + (pd.DataFrame, frame_data, operator.methodcaller("drop_duplicates")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("duplicated")), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("sort_values", by="A")), + (pd.DataFrame, frame_data, operator.methodcaller("sort_index")), + (pd.DataFrame, frame_data, operator.methodcaller("nlargest", 1, "A")), + (pd.DataFrame, frame_data, operator.methodcaller("nsmallest", 1, "A")), + (pd.DataFrame, frame_mi_data, operator.methodcaller("swaplevel"),), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("add", pd.DataFrame(*frame_data)), + ), + marks=not_implemented_mark, + ), + # TODO: div, mul, etc. 
+ pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("combine", pd.DataFrame(*frame_data), operator.add), + ), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("combine_first", pd.DataFrame(*frame_data)), + ), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("update", pd.DataFrame(*frame_data)), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("pivot", columns="A")), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + {"A": [1], "B": [1]}, + operator.methodcaller("pivot_table", columns="A"), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("stack")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("explode", "A")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack"),), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + ({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]},), + operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("diff")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("applymap", lambda x: x)), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("append", pd.DataFrame({"A": [1]})), + ), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("append", pd.DataFrame({"B": [1]})), + ), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("merge", pd.DataFrame({"A": [1]})), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("round", 2)), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("corr")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("cov")), + marks=[ + not_implemented_mark, + pytest.mark.filterwarnings("ignore::RuntimeWarning"), + ], + ), + pytest.param( + ( + pd.DataFrame, + frame_data, + operator.methodcaller("corrwith", pd.DataFrame(*frame_data)), + ), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("count")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("count", level="A")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("nunique")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("idxmin")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("idxmax")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("mode")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("quantile")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("quantile", q=[0.25, 0.75])), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, 
operator.methodcaller("quantile")), + marks=not_implemented_mark, + ), + ( + pd.DataFrame, + ({"A": [1]}, [pd.Period("2000", "D")]), + operator.methodcaller("to_timestamp"), + ), + ( + pd.DataFrame, + ({"A": [1]}, [pd.Timestamp("2000")]), + operator.methodcaller("to_period", freq="D"), + ), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("isin", [1])), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("isin", pd.Series([1]))), + marks=not_implemented_mark, + ), + pytest.param( + ( + pd.DataFrame, + frame_mi_data, + operator.methodcaller("isin", pd.DataFrame({"A": [1]})), + ), + marks=not_implemented_mark, + ), + (pd.DataFrame, frame_data, operator.methodcaller("swapaxes", 0, 1)), + (pd.DataFrame, frame_mi_data, operator.methodcaller("droplevel", "A")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("pop", "A")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("squeeze")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.Series, ([1, 2],), operator.methodcaller("squeeze")), + # marks=not_implemented_mark, + ), + (pd.Series, ([1, 2],), operator.methodcaller("rename_axis", index="a")), + (pd.DataFrame, frame_data, operator.methodcaller("rename_axis", columns="a")), + # Unary ops + (pd.DataFrame, frame_data, operator.neg), + (pd.Series, [1], operator.neg), + (pd.DataFrame, frame_data, operator.pos), + (pd.Series, [1], operator.pos), + (pd.DataFrame, frame_data, operator.inv), + (pd.Series, [1], operator.inv), + (pd.DataFrame, frame_data, abs), + pytest.param((pd.Series, [1], abs), marks=not_implemented_mark), + pytest.param((pd.DataFrame, frame_data, round), marks=not_implemented_mark), + (pd.Series, [1], round), + (pd.DataFrame, frame_data, operator.methodcaller("take", [0, 0])), + (pd.DataFrame, frame_mi_data, operator.methodcaller("xs", "a")), + (pd.Series, (1, mi), operator.methodcaller("xs", "a")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("get", "A")), + marks=not_implemented_mark, + ), + ( + pd.DataFrame, + frame_data, + operator.methodcaller("reindex_like", pd.DataFrame({"A": [1, 2, 3]})), + ), + ( + pd.Series, + frame_data, + operator.methodcaller("reindex_like", pd.Series([0, 1, 2])), + ), + (pd.DataFrame, frame_data, operator.methodcaller("add_prefix", "_")), + (pd.DataFrame, frame_data, operator.methodcaller("add_suffix", "_")), + (pd.Series, (1, ["a", "b"]), operator.methodcaller("add_prefix", "_")), + (pd.Series, (1, ["a", "b"]), operator.methodcaller("add_suffix", "_")), + (pd.Series, ([3, 2],), operator.methodcaller("sort_values")), + (pd.Series, ([1] * 10,), operator.methodcaller("head")), + (pd.DataFrame, ({"A": [1] * 10},), operator.methodcaller("head")), + (pd.Series, ([1] * 10,), operator.methodcaller("tail")), + (pd.DataFrame, ({"A": [1] * 10},), operator.methodcaller("tail")), + (pd.Series, ([1, 2],), operator.methodcaller("sample", n=2, replace=True)), + (pd.DataFrame, (frame_data,), operator.methodcaller("sample", n=2, replace=True)), + (pd.Series, ([1, 2],), operator.methodcaller("astype", float)), + (pd.DataFrame, frame_data, operator.methodcaller("astype", float)), + (pd.Series, ([1, 2],), operator.methodcaller("copy")), + (pd.DataFrame, frame_data, operator.methodcaller("copy")), + (pd.Series, ([1, 2], None, object), operator.methodcaller("infer_objects")), + ( + pd.DataFrame, + ({"A": np.array([1, 2], dtype=object)},), + operator.methodcaller("infer_objects"), + ), + (pd.Series, ([1, 
2],), operator.methodcaller("convert_dtypes")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("convert_dtypes")), + marks=not_implemented_mark, + ), + (pd.Series, ([1, None, 3],), operator.methodcaller("interpolate")), + (pd.DataFrame, ({"A": [1, None, 3]},), operator.methodcaller("interpolate")), + (pd.Series, ([1, 2],), operator.methodcaller("clip", lower=1)), + (pd.DataFrame, frame_data, operator.methodcaller("clip", lower=1)), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("asfreq", "H"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("asfreq", "H"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("at_time", "12:00"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("at_time", "12:00"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("between_time", "12:00", "13:00"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("between_time", "12:00", "13:00"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("first", "3D"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("first", "3D"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("last", "3D"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("last", "3D"), + ), + (pd.Series, ([1, 2],), operator.methodcaller("rank")), + (pd.DataFrame, frame_data, operator.methodcaller("rank")), + (pd.Series, ([1, 2],), operator.methodcaller("where", np.array([True, False]))), + (pd.DataFrame, frame_data, operator.methodcaller("where", np.array([[True]]))), + (pd.Series, ([1, 2],), operator.methodcaller("mask", np.array([True, False]))), + (pd.DataFrame, frame_data, operator.methodcaller("mask", np.array([[True]]))), + (pd.Series, ([1, 2],), operator.methodcaller("slice_shift")), + (pd.DataFrame, frame_data, operator.methodcaller("slice_shift")), + (pd.Series, (1, pd.date_range("2000", periods=4)), operator.methodcaller("tshift")), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("tshift"), + ), + (pd.Series, ([1, 2],), operator.methodcaller("truncate", before=0)), + (pd.DataFrame, frame_data, operator.methodcaller("truncate", before=0)), + ( + pd.Series, + (1, pd.date_range("2000", periods=4, tz="UTC")), + operator.methodcaller("tz_convert", "CET"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4, tz="UTC")), + operator.methodcaller("tz_convert", "CET"), + ), + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("tz_localize", "CET"), + ), + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("tz_localize", "CET"), + ), + pytest.param( + (pd.Series, ([1, 2],), operator.methodcaller("describe")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("describe")), + marks=not_implemented_mark, + ), + (pd.Series, ([1, 2],), operator.methodcaller("pct_change")), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("pct_change")), + marks=not_implemented_mark, + ), + (pd.Series, ([1],), operator.methodcaller("transform", lambda x: x - 
x.min())), + pytest.param( + ( + pd.DataFrame, + frame_mi_data, + operator.methodcaller("transform", lambda x: x - x.min()), + ), + marks=not_implemented_mark, + ), + (pd.Series, ([1],), operator.methodcaller("apply", lambda x: x)), + pytest.param( + (pd.DataFrame, frame_mi_data, operator.methodcaller("apply", lambda x: x)), + marks=not_implemented_mark, + ), + # Cumulative reductions + (pd.Series, ([1],), operator.methodcaller("cumsum")), + (pd.DataFrame, frame_data, operator.methodcaller("cumsum")), + # Reductions + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("any")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("sum")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("std")), + marks=not_implemented_mark, + ), + pytest.param( + (pd.DataFrame, frame_data, operator.methodcaller("mean")), + marks=not_implemented_mark, + ), +] + + +def idfn(x): + xpr = re.compile(r"'(.*)?'") + m = xpr.search(str(x)) + if m: + return m.group(1) + else: + return str(x) + + +@pytest.fixture(params=_all_methods, ids=lambda x: idfn(x[-1])) +def ndframe_method(request): + """ + An NDFrame method returning an NDFrame. + """ + return request.param + + +def test_finalize_called(ndframe_method): + cls, init_args, method = ndframe_method + ndframe = cls(*init_args) + + ndframe.attrs = {"a": 1} + result = method(ndframe) + + assert result.attrs == {"a": 1} + + +# ---------------------------------------------------------------------------- +# Binary operations + + +@pytest.mark.parametrize("annotate", ["left", "right", "both"]) +@pytest.mark.parametrize( + "args", + [ + (1, pd.Series([1])), + (1, pd.DataFrame({"A": [1]})), + (pd.Series([1]), 1), + (pd.DataFrame({"A": [1]}), 1), + (pd.Series([1]), pd.Series([1])), + (pd.DataFrame({"A": [1]}), pd.DataFrame({"A": [1]})), + (pd.Series([1]), pd.DataFrame({"A": [1]})), + (pd.DataFrame({"A": [1]}), pd.Series([1])), + ], +) +def test_binops(args, annotate, all_arithmetic_functions): + # This generates 326 tests... Is that needed? 
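+    # ``all_arithmetic_functions`` is the pandas conftest fixture that yields
+    # each arithmetic operator (add, sub, mul, ..., including the reflected
+    # variants), which is what multiplies the parametrized cases above.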
+    left, right = args
+    if annotate in {"left", "both"} and isinstance(left, int):
+        # a plain int operand cannot carry .attrs, so there is nothing to annotate
+        return
+    if annotate in {"right", "both"} and isinstance(right, int):
+        return
+
+    if isinstance(left, pd.DataFrame) or isinstance(right, pd.DataFrame):
+        pytest.xfail(reason="not implemented")
+
+    if annotate in {"left", "both"} and not isinstance(left, int):
+        left.attrs = {"a": 1}
+    if annotate in {"right", "both"} and not isinstance(right, int):
+        right.attrs = {"a": 1}
+
+    result = all_arithmetic_functions(left, right)
+    assert result.attrs == {"a": 1}
+
+
+# ----------------------------------------------------------------------------
+# Accessors
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        operator.methodcaller("capitalize"),
+        operator.methodcaller("casefold"),
+        pytest.param(
+            operator.methodcaller("cat", ["a"]),
+            marks=pytest.mark.xfail(reason="finalize not called."),
+        ),
+        operator.methodcaller("contains", "a"),
+        operator.methodcaller("count", "a"),
+        operator.methodcaller("encode", "utf-8"),
+        operator.methodcaller("endswith", "a"),
+        pytest.param(
+            operator.methodcaller("extract", r"(\w)(\d)"),
+            marks=pytest.mark.xfail(reason="finalize not called."),
+        ),
+        pytest.param(
+            operator.methodcaller("extractall", r"(\w)(\d)"),
+            marks=pytest.mark.xfail(reason="finalize not called."),
+        ),
+        operator.methodcaller("find", "a"),
+        operator.methodcaller("findall", "a"),
+        operator.methodcaller("get", 0),
+        operator.methodcaller("index", "a"),
+        operator.methodcaller("len"),
+        operator.methodcaller("ljust", 4),
+        operator.methodcaller("lower"),
+        operator.methodcaller("lstrip"),
+        operator.methodcaller("match", r"\w"),
+        operator.methodcaller("normalize", "NFC"),
+        operator.methodcaller("pad", 4),
+        operator.methodcaller("partition", "a"),
+        operator.methodcaller("repeat", 2),
+        operator.methodcaller("replace", "a", "b"),
+        operator.methodcaller("rfind", "a"),
+        operator.methodcaller("rindex", "a"),
+        operator.methodcaller("rjust", 4),
+        operator.methodcaller("rpartition", "a"),
+        operator.methodcaller("rstrip"),
+        operator.methodcaller("slice", 4),
+        operator.methodcaller("slice_replace", 1, repl="a"),
+        operator.methodcaller("startswith", "a"),
+        operator.methodcaller("strip"),
+        operator.methodcaller("swapcase"),
+        operator.methodcaller("translate", {"a": "b"}),
+        operator.methodcaller("upper"),
+        operator.methodcaller("wrap", 4),
+        operator.methodcaller("zfill", 4),
+        operator.methodcaller("isalnum"),
+        operator.methodcaller("isalpha"),
+        operator.methodcaller("isdigit"),
+        operator.methodcaller("isspace"),
+        operator.methodcaller("islower"),
+        operator.methodcaller("isupper"),
+        operator.methodcaller("istitle"),
+        operator.methodcaller("isnumeric"),
+        operator.methodcaller("isdecimal"),
+        operator.methodcaller("get_dummies"),
+    ],
+    ids=idfn,
+)
+@not_implemented_mark
+def test_string_method(method):
+    s = pd.Series(["a1"])
+    s.attrs = {"a": 1}
+    result = method(s.str)
+    assert result.attrs == {"a": 1}
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        operator.methodcaller("to_period"),
+        operator.methodcaller("tz_localize", "CET"),
+        operator.methodcaller("normalize"),
+        operator.methodcaller("strftime", "%Y"),
+        operator.methodcaller("round", "H"),
+        operator.methodcaller("floor", "H"),
+        operator.methodcaller("ceil", "H"),
+        operator.methodcaller("month_name"),
+        operator.methodcaller("day_name"),
+    ],
+    ids=idfn,
+)
+@not_implemented_mark
+def test_datetime_method(method):
+    s = pd.Series(pd.date_range("2000", periods=4))
+    s.attrs = {"a": 1}
+    result = method(s.dt)
+    assert result.attrs == {"a": 1}
+
+
+@pytest.mark.parametrize(
+    "attr",
+    [
+        
"date", + "time", + "timetz", + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + "week", + "weekofyear", + "dayofweek", + "dayofyear", + "quarter", + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + "is_leap_year", + "daysinmonth", + "days_in_month", + ], +) +@not_implemented_mark +def test_datetime_property(attr): + s = pd.Series(pd.date_range("2000", periods=4)) + s.attrs = {"a": 1} + result = getattr(s.dt, attr) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"] +) +@not_implemented_mark +def test_timedelta_property(attr): + s = pd.Series(pd.timedelta_range("2000", periods=4)) + s.attrs = {"a": 1} + result = getattr(s.dt, attr) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "method", [operator.methodcaller("total_seconds")], +) +@not_implemented_mark +def test_timedelta_methods(method): + s = pd.Series(pd.timedelta_range("2000", periods=4)) + s.attrs = {"a": 1} + result = method(s.dt) + assert result.attrs == {"a": 1} + + +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("add_categories", ["c"]), + operator.methodcaller("as_ordered"), + operator.methodcaller("as_unordered"), + lambda x: getattr(x, "codes"), + operator.methodcaller("remove_categories", "a"), + operator.methodcaller("remove_unused_categories"), + operator.methodcaller("rename_categories", {"a": "A", "b": "B"}), + operator.methodcaller("reorder_categories", ["b", "a"]), + operator.methodcaller("set_categories", ["A", "B"]), + ], +) +@not_implemented_mark +def test_categorical_accessor(method): + s = pd.Series(["a", "b"], dtype="category") + s.attrs = {"a": 1} + result = method(s.cat) + assert result.attrs == {"a": 1} + + +# ---------------------------------------------------------------------------- +# Groupby + + +@pytest.mark.parametrize( + "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] +) +@pytest.mark.parametrize( + "method", + [ + operator.methodcaller("sum"), + lambda x: x.agg("sum"), + lambda x: x.agg(["sum", "count"]), + lambda x: x.transform(lambda y: y), + lambda x: x.apply(lambda y: y), + ], +) +@not_implemented_mark +def test_groupby(obj, method): + obj.attrs = {"a": 1} + result = method(obj.groupby([0, 0])) + assert result.attrs == {"a": 1} diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 9fbcced75c327..9fcbabb07857e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -880,3 +880,24 @@ def test_apply_function_index_return(function): index=pd.Index([1, 2, 3], name="id"), ) tm.assert_series_equal(result, expected) + + +def test_apply_function_with_indexing(): + # GH: 33058 + df = pd.DataFrame( + {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} + ) + + def fn(x): + x.col2[x.index[-1]] = 0 + return x.col2 + + result = df.groupby(["col1"], as_index=False).apply(fn) + expected = pd.Series( + [1, 2, 0, 4, 5, 0], + index=pd.MultiIndex.from_tuples( + [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)] + ), + name="col2", + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py new file mode 100644 index 0000000000000..529f76bf692ce --- /dev/null +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -0,0 +1,70 @@ +import numpy as np + +import pandas as pd +import pandas._testing 
as tm
+
+
+def test_mutate_groups():
+
+    # GH3380
+
+    df = pd.DataFrame(
+        {
+            "cat1": ["a"] * 8 + ["b"] * 6,
+            "cat2": ["c"] * 2
+            + ["d"] * 2
+            + ["e"] * 2
+            + ["f"] * 2
+            + ["c"] * 2
+            + ["d"] * 2
+            + ["e"] * 2,
+            "cat3": [f"g{x}" for x in range(1, 15)],
+            "val": np.random.randint(100, size=14),
+        }
+    )
+
+    def f_copy(x):
+        x = x.copy()
+        x["rank"] = x.val.rank(method="min")
+        return x.groupby("cat2")["rank"].min()
+
+    def f_no_copy(x):
+        x["rank"] = x.val.rank(method="min")
+        return x.groupby("cat2")["rank"].min()
+
+    grpby_copy = df.groupby("cat1").apply(f_copy)
+    grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
+    tm.assert_series_equal(grpby_copy, grpby_no_copy)
+
+
+def test_no_mutate_but_looks_like():
+
+    # GH 8467
+    # first shows a mutation indicator
+    # second does not, but should yield the same results
+    df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
+
+    result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
+    result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
+    tm.assert_series_equal(result1, result2)
+
+
+def test_apply_function_with_indexing():
+    # GH: 33058
+    df = pd.DataFrame(
+        {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
+    )
+
+    def fn(x):
+        x.col2[x.index[-1]] = 0
+        return x.col2
+
+    result = df.groupby(["col1"], as_index=False).apply(fn)
+    expected = pd.Series(
+        [1, 2, 0, 4, 5, 0],
+        index=pd.MultiIndex.from_tuples(
+            [(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
+        ),
+        name="col2",
+    )
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index e570ea201cc3a..da8327f64e26f 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -1380,3 +1380,15 @@ def test_groupby_agg_non_numeric():
 
     result = df.groupby([1, 2, 1]).nunique()
     tm.assert_frame_equal(result, expected)
+
+
+def test_read_only_category_no_sort():
+    # GH33410
+    cats = np.array([1, 2])
+    cats.flags.writeable = False
+    df = DataFrame(
+        {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))}
+    )
+    expected = DataFrame(data={"a": [2, 6]}, index=CategoricalIndex([1, 2], name="b"))
+    result = df.groupby("b", sort=False).mean()
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index b8d8f56512a69..c88d16e34eab8 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -921,51 +921,6 @@ def test_groupby_complex():
     tm.assert_series_equal(result, expected)
 
 
-def test_mutate_groups():
-
-    # GH3380
-
-    df = DataFrame(
-        {
-            "cat1": ["a"] * 8 + ["b"] * 6,
-            "cat2": ["c"] * 2
-            + ["d"] * 2
-            + ["e"] * 2
-            + ["f"] * 2
-            + ["c"] * 2
-            + ["d"] * 2
-            + ["e"] * 2,
-            "cat3": [f"g{x}" for x in range(1, 15)],
-            "val": np.random.randint(100, size=14),
-        }
-    )
-
-    def f_copy(x):
-        x = x.copy()
-        x["rank"] = x.val.rank(method="min")
-        return x.groupby("cat2")["rank"].min()
-
-    def f_no_copy(x):
-        x["rank"] = x.val.rank(method="min")
-        return x.groupby("cat2")["rank"].min()
-
-    grpby_copy = df.groupby("cat1").apply(f_copy)
-    grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
-    tm.assert_series_equal(grpby_copy, grpby_no_copy)
-
-
-def test_no_mutate_but_looks_like():
-
-    # GH 8467
-    # first show's mutation indicator
-    # second does not, but should yield the same results
-    df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
-
-    result1 = df.groupby("key", 
group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) - tm.assert_series_equal(result1, result2) - - def test_groupby_series_indexed_differently(): s1 = Series( [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7], diff --git a/pandas/tests/indexes/datetimes/test_to_period.py b/pandas/tests/indexes/datetimes/test_to_period.py index 7b75e676a2c12..d82fc1ef6743b 100644 --- a/pandas/tests/indexes/datetimes/test_to_period.py +++ b/pandas/tests/indexes/datetimes/test_to_period.py @@ -1,3 +1,5 @@ +import warnings + import dateutil.tz from dateutil.tz import tzlocal import pytest @@ -75,6 +77,28 @@ def test_to_period_monthish(self): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): date_range("01-Jan-2012", periods=8, freq="EOM") + def test_to_period_infer(self): + # https://github.com/pandas-dev/pandas/issues/33358 + rng = date_range( + start="2019-12-22 06:40:00+00:00", + end="2019-12-22 08:45:00+00:00", + freq="5min", + ) + + with tm.assert_produces_warning(None): + # Using simple filter because we are not checking for the warning here + warnings.simplefilter("ignore", UserWarning) + + pi1 = rng.to_period("5min") + + with tm.assert_produces_warning(None): + # Using simple filter because we are not checking for the warning here + warnings.simplefilter("ignore", UserWarning) + + pi2 = rng.to_period() + + tm.assert_index_equal(pi1, pi2) + def test_period_dt64_round_trip(self): dti = date_range("1/1/2000", "1/7/2002", freq="B") pi = dti.to_period() diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 8c0dae433c8f4..a10bf6b6aa11a 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -605,10 +605,6 @@ def test_get_loc_nan(self, level, nulls_fixture): key = ["b", "d"] levels[level] = np.array([0, nulls_fixture], dtype=type(nulls_fixture)) key[level] = nulls_fixture - - if nulls_fixture is pd.NA: - pytest.xfail("MultiIndex from pd.NA in np.array broken; see GH 31883") - idx = MultiIndex.from_product(levels) assert idx.get_loc(tuple(key)) == 3 diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py index 426341a53a5d1..b7f673428ae38 100644 --- a/pandas/tests/indexes/ranges/test_constructors.py +++ b/pandas/tests/indexes/ranges/test_constructors.py @@ -43,7 +43,7 @@ def test_constructor_invalid_args(self): r"kind, 0 was passed" ) with pytest.raises(TypeError, match=msg): - Index(0, 1000) + Index(0) @pytest.mark.parametrize( "args", diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 7e75b5324445e..54b22dbc53466 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -87,8 +87,8 @@ def test_series_getitem_returns_scalar( (lambda s: s[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^\(2000, 3, 4\)$"), (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"), - (lambda s: s.__getitem__(len(s)), IndexError, "is out of bounds"), - (lambda s: s[len(s)], IndexError, "is out of bounds"), + (lambda s: s.__getitem__(len(s)), KeyError, ""), # match should include len(s) + (lambda s: s[len(s)], KeyError, ""), # match should include len(s) ( lambda s: s.iloc[len(s)], IndexError, diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 
9d181bdcb9491..ed11af8ef68ad 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex +from pandas import DataFrame, Float64Index, Int64Index, MultiIndex import pandas._testing as tm @@ -126,7 +126,32 @@ def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): # this works...for now df["A"].iloc[14] = 5 - assert df["A"][14] == 5 + assert df["A"].iloc[14] == 5 + + @pytest.mark.parametrize("dtype", [int, float]) + def test_getitem_intkey_leading_level( + self, multiindex_year_month_day_dataframe_random_data, dtype + ): + # GH#33355 dont fall-back to positional when leading level is int + ymd = multiindex_year_month_day_dataframe_random_data + levels = ymd.index.levels + ymd.index = ymd.index.set_levels([levels[0].astype(dtype)] + levels[1:]) + ser = ymd["A"] + mi = ser.index + assert isinstance(mi, MultiIndex) + if dtype is int: + assert isinstance(mi.levels[0], Int64Index) + else: + assert isinstance(mi.levels[0], Float64Index) + + assert 14 not in mi.levels[0] + assert not mi.levels[0]._should_fallback_to_positional() + assert not mi._should_fallback_to_positional() + + with pytest.raises(KeyError, match="14"): + ser[14] + with pytest.raises(KeyError, match="14"): + mi.get_value(ser, 14) # --------------------------------------------------------------------- # AMBIGUOUS CASES! @@ -140,7 +165,7 @@ def test_partial_loc_missing(self, multiindex_year_month_day_dataframe_random_da tm.assert_series_equal(result, expected) # need to put in some work here - + # FIXME: dont leave commented-out # self.ymd.loc[2000, 0] = 0 # assert (self.ymd.loc[2000]['A'] == 0).all() diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 1f19244cf76d3..853b92ea91274 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -236,6 +236,7 @@ def f(name, df2): f_index ) + # FIXME: dont leave commented-out # TODO(wesm): unused? 
# new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T @@ -255,7 +256,11 @@ def test_series_setitem(self, multiindex_year_month_day_dataframe_random_data): assert notna(s.values[65:]).all() s[2000, 3, 10] = np.nan - assert isna(s[49]) + assert isna(s.iloc[49]) + + with pytest.raises(KeyError, match="49"): + # GH#33355 dont fall-back to positional when leading level is int + s[49] def test_frame_getitem_setitem_boolean(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 31905b223b91d..0437052e2740d 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -241,6 +241,24 @@ def test_build_series(self): assert result == expected + def test_read_json_from_to_json_results(self): + # GH32383 + df = pd.DataFrame( + { + "_id": {"row_0": 0}, + "category": {"row_0": "Goods"}, + "recommender_id": {"row_0": 3}, + "recommender_name_jp": {"row_0": "浦田"}, + "recommender_name_en": {"row_0": "Urata"}, + "name_jp": {"row_0": "博多人形(松尾吉将まつお よしまさ)"}, + "name_en": {"row_0": "Hakata Dolls Matsuo"}, + } + ) + result1 = pd.read_json(df.to_json()) + result2 = pd.DataFrame.from_dict(json.loads(df.to_json())) + tm.assert_frame_equal(result1, df) + tm.assert_frame_equal(result2, df) + def test_to_json(self): df = self.df.copy() df.index.name = "idx" diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 841241d5124e0..59c9bd0a36d3d 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -143,3 +143,44 @@ def test_with_missing_lzma_runtime(): """ ) subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) + + +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) +def test_gzip_compression_level(obj, method): + # GH33196 + with tm.ensure_clean() as path: + getattr(obj, method)(path, compression="gzip") + compressed_size_default = os.path.getsize(path) + getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1}) + compressed_size_fast = os.path.getsize(path) + assert compressed_size_default < compressed_size_fast + + +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) +def test_bzip_compression_level(obj, method): + """GH33196 bzip needs file size > 100k to show a size difference between + compression levels, so here we just check if the call works when + compression is passed as a dict. 
+ """ + with tm.ensure_clean() as path: + getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1}) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 0038df78dd866..0755501ee6285 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -27,15 +29,15 @@ def check_error_on_write(self, df, exc): with tm.ensure_clean() as path: to_feather(df, path) - def check_round_trip(self, df, expected=None, **kwargs): + def check_round_trip(self, df, expected=None, write_kwargs={}, **read_kwargs): if expected is None: expected = df with tm.ensure_clean() as path: - to_feather(df, path) + to_feather(df, path, **write_kwargs) - result = read_feather(path, **kwargs) + result = read_feather(path, **read_kwargs) tm.assert_frame_equal(result, expected) def test_error(self): @@ -71,6 +73,10 @@ def test_basic(self): "dtns": pd.date_range("20130101", periods=3, freq="ns"), } ) + if pyarrow_version >= LooseVersion("0.16.1.dev"): + df["periods"] = pd.period_range("2013", freq="M", periods=3) + df["timedeltas"] = pd.timedelta_range("1 day", periods=3) + df["intervals"] = pd.interval_range(0, 3, 3) assert df.dttz.dtype.tz.zone == "US/Eastern" self.check_round_trip(df) @@ -102,8 +108,8 @@ def test_read_columns(self): def test_unsupported_other(self): - # period - df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # mixed python objects + df = pd.DataFrame({"a": ["a", 1, 2.0]}) # Some versions raise ValueError, others raise ArrowInvalid. self.check_error_on_write(df, Exception) @@ -148,3 +154,8 @@ def test_path_localpath(self): df = tm.makeDataFrame().reset_index() result = tm.round_trip_localpath(df.to_feather, pd.read_feather) tm.assert_frame_equal(df, result) + + @td.skip_if_no("pyarrow", min_version="0.16.1.dev") + def test_passthrough_keywords(self): + df = tm.makeDataFrame().reset_index() + self.check_round_trip(df, write_kwargs=dict(version=1)) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 3d73e983402a7..2c93dbb5b6b83 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -321,7 +321,7 @@ def test_invalid_table_attrs(self): url = self.banklist_data with pytest.raises(ValueError, match="No tables found"): self.read_html( - url, "First Federal Bank of Florida", attrs={"id": "tasdfable"} + url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"} ) def _bank_data(self, *args, **kwargs): @@ -573,7 +573,9 @@ def try_remove_ws(x): except AttributeError: return x - df = self.read_html(self.banklist_data, "Metcalf", attrs={"id": "table"})[0] + df = self.read_html(self.banklist_data, match="Metcalf", attrs={"id": "table"})[ + 0 + ] ground_truth = read_csv( datapath("io", "data", "csv", "banklist.csv"), converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, @@ -883,7 +885,7 @@ def test_wikipedia_states_table(self, datapath): def test_wikipedia_states_multiindex(self, datapath): data = datapath("io", "data", "html", "wikipedia_states.html") - result = self.read_html(data, "Arizona", index_col=0)[0] + result = self.read_html(data, match="Arizona", index_col=0)[0] assert result.shape == (60, 11) assert "Unnamed" in result.columns[-1][1] assert result.columns.nlevels == 2 diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 4a9efe9554c6e..c84a09f21f46b 100644 --- 
a/pandas/tests/plotting/test_frame.py
+++ b/pandas/tests/plotting/test_frame.py
@@ -1332,6 +1332,20 @@ def test_scatter_colors(self):
             np.array([1, 1, 1, 1], dtype=np.float64),
         )
 
+    def test_scatter_colorbar_different_cmap(self):
+        # GH 33389
+        import matplotlib.pyplot as plt
+
+        df = pd.DataFrame({"x": [1, 2, 3], "y": [1, 3, 2], "c": [1, 2, 3]})
+        df["x2"] = df["x"] + 1
+
+        fig, ax = plt.subplots()
+        df.plot("x", "y", c="c", kind="scatter", cmap="cividis", ax=ax)
+        df.plot("x2", "y", c="c", kind="scatter", cmap="magma", ax=ax)
+
+        assert ax.collections[0].cmap.name == "cividis"
+        assert ax.collections[1].cmap.name == "magma"
+
     @pytest.mark.slow
     def test_plot_bar(self):
         df = DataFrame(
@@ -1682,6 +1696,25 @@ def test_hist_df(self):
         axes = df.plot.hist(rot=50, fontsize=8, orientation="horizontal")
         self._check_ticks_props(axes, xrot=0, yrot=50, ylabelsize=8)
 
+    @pytest.mark.parametrize(
+        "weights", [0.1 * np.ones(shape=(100,)), 0.1 * np.ones(shape=(100, 2))]
+    )
+    def test_hist_weights(self, weights):
+        # GH 33173
+        np.random.seed(0)
+        df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100))))
+
+        ax1 = _check_plot_works(df.plot, kind="hist", weights=weights)
+        ax2 = _check_plot_works(df.plot, kind="hist")
+
+        patch_height_with_weights = [patch.get_height() for patch in ax1.patches]
+
+        # original heights with no weights; we manually multiply them by the
+        # example weights, so after multiplication they should be almost the same
+        expected_patch_height = [0.1 * patch.get_height() for patch in ax2.patches]
+
+        tm.assert_almost_equal(patch_height_with_weights, expected_patch_height)
+
     def _check_box_coord(
         self,
         patches,
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index 8fb035e085d40..fa62d5d8c4983 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -891,6 +891,30 @@ def test_all_any_params(self):
         with pytest.raises(NotImplementedError):
             s.all(bool_only=True)
 
+    def test_all_any_boolean(self):
+        # Check skipna, with boolean type
+        s1 = Series([pd.NA, True], dtype="boolean")
+        s2 = Series([pd.NA, False], dtype="boolean")
+        assert s1.all(skipna=False) is pd.NA  # NA && True => NA
+        assert s1.all(skipna=True)
+        assert s2.any(skipna=False) is pd.NA  # NA || False => NA
+        assert not s2.any(skipna=True)
+
+        # GH-33253: all True / all False values buggy with skipna=False
+        s3 = Series([True, True], dtype="boolean")
+        s4 = Series([False, False], dtype="boolean")
+        assert s3.all(skipna=False)
+        assert not s4.any(skipna=False)
+
+        # Check level TODO(GH-33449) result should also be boolean
+        s = pd.Series(
+            [False, False, True, True, False, True],
+            index=[0, 0, 1, 1, 2, 2],
+            dtype="boolean",
+        )
+        tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
+        tm.assert_series_equal(s.any(level=0), Series([False, True, True]))
+
     def test_timedelta64_analytics(self):
 
         # index min/max
diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py
index 12572648fca9e..7baeb8f5673bc 100644
--- a/pandas/tests/scalar/timedelta/test_arithmetic.py
+++ b/pandas/tests/scalar/timedelta/test_arithmetic.py
@@ -904,6 +904,25 @@ def test_compare_timedelta_ndarray(self):
         expected = np.array([False, False])
         tm.assert_numpy_array_equal(result, expected)
 
+    def test_compare_td64_ndarray(self):
+        # GH#33441
+        arr = np.arange(5).astype("timedelta64[ns]")
+        td = pd.Timedelta(arr[1])
+
+        expected = np.array([False, True, False, False, False], dtype=bool)
+
+        
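+        # The checks below exercise elementwise comparison in both operand
+        # orders, with ``!=`` expected to be the exact complement of ``==``.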
result = td == arr + tm.assert_numpy_array_equal(result, expected) + + result = arr == td + tm.assert_numpy_array_equal(result, expected) + + result = td != arr + tm.assert_numpy_array_equal(result, ~expected) + + result = arr != td + tm.assert_numpy_array_equal(result, ~expected) + @pytest.mark.skip(reason="GH#20829 is reverted until after 0.24.0") def test_compare_custom_object(self): """ @@ -943,7 +962,7 @@ def __gt__(self, other): def test_compare_unknown_type(self, val): # GH#20829 t = Timedelta("1s") - msg = "Cannot compare type Timedelta with type (int|str)" + msg = "not supported between instances of 'Timedelta' and '(int|str)'" with pytest.raises(TypeError, match=msg): t >= val with pytest.raises(TypeError, match=msg): @@ -984,7 +1003,7 @@ def test_ops_error_str(): with pytest.raises(TypeError, match=msg): left + right - msg = "Cannot compare type" + msg = "not supported between instances of" with pytest.raises(TypeError, match=msg): left > right diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index 4581e736b2ea1..27aef8c4a9eb7 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -5,9 +5,61 @@ import pytest from pandas import Timestamp +import pandas._testing as tm class TestTimestampComparison: + def test_comparison_dt64_ndarray(self): + ts = Timestamp.now() + ts2 = Timestamp("2019-04-05") + arr = np.array([[ts.asm8, ts2.asm8]], dtype="M8[ns]") + + result = ts == arr + expected = np.array([[True, False]], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + result = arr == ts + tm.assert_numpy_array_equal(result, expected) + + result = ts != arr + tm.assert_numpy_array_equal(result, ~expected) + + result = arr != ts + tm.assert_numpy_array_equal(result, ~expected) + + result = ts2 < arr + tm.assert_numpy_array_equal(result, expected) + + result = arr < ts2 + tm.assert_numpy_array_equal(result, np.array([[False, False]], dtype=bool)) + + result = ts2 <= arr + tm.assert_numpy_array_equal(result, np.array([[True, True]], dtype=bool)) + + result = arr <= ts2 + tm.assert_numpy_array_equal(result, ~expected) + + result = ts >= arr + tm.assert_numpy_array_equal(result, np.array([[True, True]], dtype=bool)) + + result = arr >= ts + tm.assert_numpy_array_equal(result, np.array([[True, False]], dtype=bool)) + + @pytest.mark.parametrize("reverse", [True, False]) + def test_comparison_dt64_ndarray_tzaware(self, reverse, all_compare_operators): + op = getattr(operator, all_compare_operators.strip("__")) + + ts = Timestamp.now("UTC") + arr = np.array([ts.asm8, ts.asm8], dtype="M8[ns]") + + left, right = ts, arr + if reverse: + left, right = arr, ts + + msg = "Cannot compare tz-naive and tz-aware timestamps" + with pytest.raises(TypeError, match=msg): + op(left, right) + def test_comparison_object_array(self): # GH#15183 ts = Timestamp("2011-01-03 00:00:00-0500", tz="US/Eastern") diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index a41f893e3753f..dd4bf642e68e8 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_interval_dtype + import pandas as pd import pandas._testing as tm @@ -266,7 +268,12 @@ def test_convert_dtypes(self, data, maindtype, params, answerdict): # Test that it is a copy copy = series.copy(deep=True) - 
ns[ns.notna()] = np.nan + if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]: + msg = "Cannot set float NaN to integer-backed IntervalArray" + with pytest.raises(ValueError, match=msg): + ns[ns.notna()] = np.nan + else: + ns[ns.notna()] = np.nan # Make sure original not changed tm.assert_series_equal(series, copy) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index bea8cb8b105e7..1c54e2b988219 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -108,6 +108,16 @@ def test_replace_gh5319(self): expected = pd.Series([pd.Timestamp.min, ts], dtype=object) tm.assert_series_equal(expected, result) + def test_replace_timedelta_td64(self): + tdi = pd.timedelta_range(0, periods=5) + ser = pd.Series(tdi) + + # Using a single dict argument means we go through replace_list + result = ser.replace({ser[1]: ser[3]}) + + expected = pd.Series([ser[0], ser[3], ser[2], ser[3], ser[4]]) + tm.assert_series_equal(result, expected) + def test_replace_with_single_list(self): ser = pd.Series([0, 1, 2, 3, 4]) result = ser.replace([1, 2, 3]) @@ -241,6 +251,13 @@ def test_replace2(self): assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() + def test_replace_with_dictlike_and_string_dtype(self): + # GH 32621 + s = pd.Series(["one", "two", np.nan], dtype="string") + expected = pd.Series(["1", "2", np.nan]) + result = s.replace({"one": "1", "two": "2"}) + tm.assert_series_equal(expected, result) + def test_replace_with_empty_dictlike(self): # GH 15289 s = pd.Series(list("abcd")) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 149d0aae8ab99..ab8618eb0a7d4 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -17,38 +17,6 @@ def test_prod_numpy16_bug(self): assert not isinstance(result, Series) - def test_dot(self): - a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) - b = DataFrame( - np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] - ).T - - result = a.dot(b) - expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) - tm.assert_series_equal(result, expected) - - # Check index alignment - b2 = b.reindex(index=reversed(b.index)) - result = a.dot(b) - tm.assert_series_equal(result, expected) - - # Check ndarray argument - result = a.dot(b.values) - assert np.all(result == expected.values) - tm.assert_almost_equal(a.dot(b["2"].values), expected["2"]) - - # Check series argument - tm.assert_almost_equal(a.dot(b["1"]), expected["1"]) - tm.assert_almost_equal(a.dot(b2["1"]), expected["1"]) - - msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" - # exception raised is of type Exception - with pytest.raises(Exception, match=msg): - a.dot(a.values[:3]) - msg = "matrices are not aligned" - with pytest.raises(ValueError, match=msg): - a.dot(b.T) - def test_matmul(self): # matmul test is for GH #10259 a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index d22dc72eaaadd..e903e850ec36c 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -49,6 +49,7 @@ def test_dt_namespace_accessor(self): "ceil", "day_name", "month_name", + "isocalendar", ] ok_for_td = TimedeltaIndex._datetimelike_ops ok_for_td_methods = [ @@ -65,7 +66,7 @@ def get_expected(s, name): if isinstance(result, 
np.ndarray): if is_integer_dtype(result): result = result.astype("int64") - elif not is_list_like(result): + elif not is_list_like(result) or isinstance(result, pd.DataFrame): return result return Series(result, index=s.index, name=s.name) @@ -74,6 +75,8 @@ def compare(s, name): b = get_expected(s, prop) if not (is_list_like(a) and is_list_like(b)): assert a == b + elif isinstance(a, pd.DataFrame): + tm.assert_frame_equal(a, b) else: tm.assert_series_equal(a, b) @@ -665,3 +668,19 @@ def test_setitem_with_different_tz(self): dtype=object, ) tm.assert_series_equal(ser, expected) + + @pytest.mark.parametrize( + "input_series, expected_output", + [ + [["2020-01-01"], [[2020, 1, 3]]], + [[pd.NaT], [[np.NaN, np.NaN, np.NaN]]], + [["2019-12-31", "2019-12-29"], [[2020, 1, 2], [2019, 52, 7]]], + [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.NaN, np.NaN, np.NaN]]], + ], + ) + def test_isocalendar(self, input_series, expected_output): + result = pd.to_datetime(pd.Series(input_series)).dt.isocalendar() + expected_frame = pd.DataFrame( + expected_output, columns=["year", "week", "day"], dtype="UInt32" + ) + tm.assert_frame_equal(result, expected_frame) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 6289c2efea7f1..6260d13524da3 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3624,3 +3624,12 @@ def test_string_array_extract(): result = result.astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("klass", [tuple, list, np.array, pd.Series, pd.Index]) +def test_cat_different_classes(klass): + # https://github.com/pandas-dev/pandas/issues/33425 + s = pd.Series(["a", "b", "c"]) + result = s.str.cat(klass(["x", "y", "z"])) + expected = pd.Series(["ax", "by", "cz"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py index 6f6e32411a784..aab86d3a2df69 100644 --- a/pandas/tests/tslibs/test_ccalendar.py +++ b/pandas/tests/tslibs/test_ccalendar.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import date, datetime import numpy as np import pytest @@ -25,3 +25,26 @@ def test_get_day_of_year_dt(): expected = (dt - dt.replace(month=1, day=1)).days + 1 assert result == expected + + +@pytest.mark.parametrize( + "input_date_tuple, expected_iso_tuple", + [ + [(2020, 1, 1), (2020, 1, 3)], + [(2019, 12, 31), (2020, 1, 2)], + [(2019, 12, 30), (2020, 1, 1)], + [(2009, 12, 31), (2009, 53, 4)], + [(2010, 1, 1), (2009, 53, 5)], + [(2010, 1, 3), (2009, 53, 7)], + [(2010, 1, 4), (2010, 1, 1)], + [(2006, 1, 1), (2005, 52, 7)], + [(2005, 12, 31), (2005, 52, 6)], + [(2008, 12, 28), (2008, 52, 7)], + [(2008, 12, 29), (2009, 1, 1)], + ], +) +def test_dt_correct_iso_8601_year_week_and_day(input_date_tuple, expected_iso_tuple): + result = ccalendar.get_iso_calendar(*input_date_tuple) + expected_from_date_isocalendar = date(*input_date_tuple).isocalendar() + assert result == expected_from_date_isocalendar + assert result == expected_iso_tuple diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 71d02db10c7ba..17815c437249b 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -1,24 +1,11 @@ from functools import wraps import inspect from textwrap import dedent -from typing import ( - Any, - Callable, - List, - Mapping, - Optional, - Tuple, - Type, - TypeVar, - Union, - cast, -) +from typing import Any, Callable, List, Mapping, Optional, Tuple, Type, Union, cast import warnings from pandas._libs.properties 
import cache_readonly # noqa - -FuncType = Callable[..., Any] -F = TypeVar("F", bound=FuncType) +from pandas._typing import F def deprecate( @@ -29,7 +16,7 @@ def deprecate( klass: Optional[Type[Warning]] = None, stacklevel: int = 2, msg: Optional[str] = None, -) -> Callable[..., Any]: +) -> Callable[[F], F]: """ Return a new function that emits a deprecation warning on use. @@ -100,7 +87,7 @@ def deprecate_kwarg( new_arg_name: Optional[str], mapping: Optional[Union[Mapping[Any, Any], Callable[[Any], Any]]] = None, stacklevel: int = 2, -) -> Callable[..., Any]: +) -> Callable[[F], F]: """ Decorator to deprecate a keyword argument of a function. diff --git a/requirements-dev.txt b/requirements-dev.txt index ffbdfccced6a9..5cef428d35452 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,7 +2,7 @@ # See that file for comments about the need/usage of each dependency. numpy>=1.15 -python-dateutil>=2.6.1 +python-dateutil>=2.7.3 pytz asv cython>=0.29.16 diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 5e62a98de548d..1281762c90496 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -101,6 +101,19 @@ "BusinessHour", "BusinessDay", "DateOffset", + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + "Float64Index", "TZ", "GIL", "strftime", diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index b476ab5a818c5..193fef026a96b 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -16,9 +16,8 @@ import sys import token import tokenize -from typing import IO, Callable, Iterable, List, Tuple +from typing import IO, Callable, FrozenSet, Iterable, List, Tuple -FILE_EXTENSIONS_TO_CHECK: Tuple[str, ...] = (".py", ".pyx", ".pxi.ini", ".pxd") PATHS_TO_IGNORE: Tuple[str, ...] = ("asv_bench/env",) @@ -293,6 +292,7 @@ def main( function: Callable[[IO[str]], Iterable[Tuple[int, str]]], source_path: str, output_format: str, + file_extensions_to_check: str, ) -> bool: """ Main entry point of the script. 
@@ -322,6 +322,10 @@ def main(
     is_failed: bool = False
     file_path: str = ""
 
+    FILE_EXTENSIONS_TO_CHECK: FrozenSet[str] = frozenset(
+        file_extensions_to_check.split(",")
+    )
+
     if os.path.isfile(source_path):
         file_path = source_path
         with open(file_path, "r") as file_obj:
@@ -370,7 +374,7 @@ def main(
     parser.add_argument(
         "--format",
         "-f",
-        default="{source_path}:{line_number}:{msg}.",
+        default="{source_path}:{line_number}:{msg}",
         help="Output format of the error message.",
     )
     parser.add_argument(
@@ -380,6 +384,11 @@ def main(
         required=True,
         help="Validation test case to check.",
     )
+    parser.add_argument(
+        "--included-file-extensions",
+        default="py,pyx,pxd,pxi",
+        help="Comma-separated file extensions to check.",
+    )
 
     args = parser.parse_args()
 
@@ -388,5 +397,6 @@ def main(
             function=globals().get(args.validation_type),  # type: ignore
             source_path=args.path,
             output_format=args.format,
+            file_extensions_to_check=args.included_file_extensions,
         )
     )
diff --git a/setup.cfg b/setup.cfg
index fda4ba4065e2f..6c42b27c7b015 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -126,6 +126,8 @@ ignore_missing_imports=True
 no_implicit_optional=True
 check_untyped_defs=True
 strict_equality=True
+warn_redundant_casts = True
+warn_unused_ignores = True
 
 [mypy-pandas.tests.*]
 check_untyped_defs=False
diff --git a/setup.py b/setup.py
index 338686bddd146..a2e01e08e8de2 100755
--- a/setup.py
+++ b/setup.py
@@ -747,7 +747,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
 def setup_package():
     setuptools_kwargs = {
         "install_requires": [
-            "python-dateutil >= 2.6.1",
+            "python-dateutil >= 2.7.3",
             "pytz >= 2017.2",
             f"numpy >= {min_numpy_ver}",
         ],
@@ -760,7 +760,7 @@ def setup_package():
         maintainer=AUTHOR,
         version=versioneer.get_version(),
         packages=find_packages(include=["pandas", "pandas.*"]),
-        package_data={"": ["templates/*", "_libs/*.dll"]},
+        package_data={"": ["templates/*", "_libs/**/*.dll"]},
         ext_modules=maybe_cythonize(extensions, compiler_directives=directives),
         maintainer_email=EMAIL,
         description=DESCRIPTION,
diff --git a/web/pandas/about/citing.md b/web/pandas/about/citing.md
index d5cb64e58f0ad..25d2c86061daa 100644
--- a/web/pandas/about/citing.md
+++ b/web/pandas/about/citing.md
@@ -2,31 +2,35 @@
 
 ## Citing pandas
 
-If you use _pandas_ for a scientific publication, we would appreciate citations to one of the following papers:
+If you use _pandas_ for a scientific publication, we would appreciate citations to the published software and the
+following paper:
+
+- [pandas on Zenodo](https://zenodo.org/record/3715232#.XoqFyC2ZOL8),
+  Please find us on Zenodo and use the citation for the version you are using. You can replace the full author
+  list from there with "The pandas development team" like in the example below.
+
+        @software{reback2020pandas,
+            author       = {The pandas development team},
+            title        = {pandas-dev/pandas: Pandas},
+            month        = feb,
+            year         = 2020,
+            publisher    = {Zenodo},
+            version      = {latest},
+            doi          = {10.5281/zenodo.3509134},
+            url          = {https://doi.org/10.5281/zenodo.3509134}
+        }
 
 - [Data structures for statistical computing in python](https://conference.scipy.org/proceedings/scipy2010/pdfs/mckinney.pdf),
   McKinney, Proceedings of the 9th Python in Science Conference, Volume 445, 2010.
-    @inproceedings{mckinney2010data,
-      title={Data structures for statistical computing in python},
-      author={Wes McKinney},
-      booktitle={Proceedings of the 9th Python in Science Conference},
-      volume={445},
-      pages={51--56},
-      year={2010},
-      organization={Austin, TX}
-    }
-
-
-- [pandas: a foundational Python library for data analysis and statistics](https://www.scribd.com/document/71048089/pandas-a-Foundational-Python-Library-for-Data-Analysis-and-Statistics),
-  McKinney, Python for High Performance and Scientific Computing, Volume 14, 2011.
-
-    @article{mckinney2011pandas,
-      title={pandas: a foundational Python library for data analysis and statistics},
-      author={Wes McKinney},
-      journal={Python for High Performance and Scientific Computing},
-      volume={14},
-      year={2011}
+    @InProceedings{ mckinney-proc-scipy-2010,
+      author    = { {W}es {M}c{K}inney },
+      title     = { {D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython },
+      booktitle = { {P}roceedings of the 9th {P}ython in {S}cience {C}onference },
+      pages     = { 56 - 61 },
+      year      = { 2010 },
+      editor    = { {S}t\'efan van der {W}alt and {J}arrod {M}illman },
+      doi       = { 10.25080/Majora-92bf1922-00a }
     }
 
 ## Brand and logo

From 1c29cee5f15b280b94a2d61202567646fd9ebc99 Mon Sep 17 00:00:00 2001
From: clement
Date: Wed, 15 Apr 2020 16:41:51 +0100
Subject: [PATCH 3/3] Remove 'MultiIndexing' from the exception list in
 'scripts/validate_rst_title_capitalization.py' and change 'Multiindexing',
 which was wrongly capitalized anyway, to 'Multi-indexing' in
 doc/source/whatsnew/v0.14.0.rst, line 476

---
 doc/source/whatsnew/v0.14.0.rst              | 4 ++--
 scripts/validate_rst_title_capitalization.py | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst
index 9bdb1ecb544b7..847a42b3a7643 100644
--- a/doc/source/whatsnew/v0.14.0.rst
+++ b/doc/source/whatsnew/v0.14.0.rst
@@ -473,8 +473,8 @@ Some other enhancements to the sql functions include:
 
 .. _whatsnew_0140.slicers:
 
-Multiindexing using slicers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Multi-indexing using slicers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 In 0.14.0 we added a new way to slice MultiIndexed objects.
 You can slice a MultiIndex by providing multiple indexers.
diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
index 1281762c90496..9cf4922fa2662 100755
--- a/scripts/validate_rst_title_capitalization.py
+++ b/scripts/validate_rst_title_capitalization.py
@@ -42,7 +42,6 @@
     "Arrow",
     "Parquet",
     "MultiIndex",
-    "MultiIndexing",
     "NumFOCUS",
    "sklearn",
    "Docker",