diff --git a/.binstar.yml b/.binstar.yml deleted file mode 100644 index 7b507b4f90049..0000000000000 --- a/.binstar.yml +++ /dev/null @@ -1,28 +0,0 @@ -package: pandas -user: jreback - -install: - - conda config --add channels pandas - -before_script: - - python -V - -platform: - - linux-64 - #- linux-32 - - osx-64 - #- win-32 - - win-64 -engine: - - python=2.7 - - python=3.4 -script: - - conda build conda.recipe --quiet - -iotimeout: 600 - -build_targets: conda - -notifications: - email: - recipients: ['jeff@reback.net'] diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 944ce9b4fb1f6..27dfded808b95 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,2 +1,3 @@ custom: https://pandas.pydata.org/donate.html +github: [numfocus] tidelift: pypi/pandas diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml new file mode 100644 index 0000000000000..019ecfc484ca5 --- /dev/null +++ b/.github/workflows/assign.yml @@ -0,0 +1,15 @@ +name: Assign +on: + issue_comment: + types: created + +jobs: + one: + runs-on: ubuntu-latest + steps: + - name: + run: | + if [[ "${{ github.event.comment.body }}" == "take" ]]; then + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000..d87fa5203bd52 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,159 @@ +name: CI + +on: + push: + branches: master + pull_request: + branches: master + +env: + ENV_FILE: environment.yml + +jobs: + checks: + name: Checks + runs-on: ubuntu-latest + steps: + + - name: Setting conda path + run: echo "::add-path::${HOME}/miniconda3/bin" + + - name: Checkout + uses: actions/checkout@v1 + + - name: Looking for unwanted patterns + run: ci/code_checks.sh patterns + if: always() + + - name: Setup environment and build pandas + run: ci/setup_env.sh + if: always() + + - name: Linting + run: | + source activate pandas-dev + ci/code_checks.sh lint + if: always() + + - name: Dependencies consistency + run: | + source activate pandas-dev + ci/code_checks.sh dependencies + if: always() + + - name: Checks on imported code + run: | + source activate pandas-dev + ci/code_checks.sh code + if: always() + + - name: Running doctests + run: | + source activate pandas-dev + ci/code_checks.sh doctests + if: always() + + - name: Docstring validation + run: | + source activate pandas-dev + ci/code_checks.sh docstrings + if: always() + + - name: Typing validation + run: | + source activate pandas-dev + ci/code_checks.sh typing + if: always() + + - name: Testing docstring validation script + run: | + source activate pandas-dev + pytest --capture=no --strict scripts + if: always() + + - name: Running benchmarks + run: | + source activate pandas-dev + cd asv_bench + asv check -E existing + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream + if git diff upstream/master --name-only | grep -q "^asv_bench/"; then + asv machine --yes + asv dev | sed "/failed$/ s/^/##[error]/" | tee benchmarks.log + if grep "failed" benchmarks.log > /dev/null ; then + exit 1 + fi + else + echo "Benchmarks did not run, no changes detected" + fi + if: always() + + - name: Publish benchmarks artifact + uses: 
actions/upload-artifact@master + with: + name: Benchmarks log + path: asv_bench/benchmarks.log + if: failure() + + web_and_docs: + name: Web and docs + runs-on: ubuntu-latest + steps: + + - name: Setting conda path + run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}" + + - name: Checkout + uses: actions/checkout@v1 + + - name: Setup environment and build pandas + run: ci/setup_env.sh + + - name: Build website + run: | + source activate pandas-dev + python web/pandas_web.py web/pandas --target-path=web/build + + - name: Build documentation + run: | + source activate pandas-dev + doc/make.py --warnings-are-errors | tee sphinx.log ; exit ${PIPESTATUS[0]} + + # This can be removed when the ipython directive fails when there are errors, + # including the `tee sphinx.log` in te previous step (https://github.com/ipython/ipython/issues/11547) + - name: Check ipython directive errors + run: "! grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" + + - name: Merge website and docs + run: | + mkdir -p pandas_web/docs + cp -r web/build/* pandas_web/ + cp -r doc/build/html/* pandas_web/docs/ + if: github.event_name == 'push' + + - name: Install Rclone + run: sudo apt install rclone -y + if: github.event_name == 'push' + + - name: Set up Rclone + run: | + RCLONE_CONFIG_PATH=$HOME/.config/rclone/rclone.conf + mkdir -p `dirname $RCLONE_CONFIG_PATH` + echo "[ovh_cloud_pandas_web]" > $RCLONE_CONFIG_PATH + echo "type = swift" >> $RCLONE_CONFIG_PATH + echo "env_auth = false" >> $RCLONE_CONFIG_PATH + echo "auth_version = 3" >> $RCLONE_CONFIG_PATH + echo "auth = https://auth.cloud.ovh.net/v3/" >> $RCLONE_CONFIG_PATH + echo "endpoint_type = public" >> $RCLONE_CONFIG_PATH + echo "tenant_domain = default" >> $RCLONE_CONFIG_PATH + echo "tenant = 2977553886518025" >> $RCLONE_CONFIG_PATH + echo "domain = default" >> $RCLONE_CONFIG_PATH + echo "user = w4KGs3pmDxpd" >> $RCLONE_CONFIG_PATH + echo "key = ${{ secrets.ovh_object_store_key }}" >> $RCLONE_CONFIG_PATH + echo "region = BHS" >> $RCLONE_CONFIG_PATH + if: github.event_name == 'push' + + - name: Sync web + run: rclone sync pandas_web ovh_cloud_pandas_web:dev + if: github.event_name == 'push' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3f98273a336cf..809764a20a713 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/python/black - rev: stable + rev: 19.10b0 hooks: - id: black language_version: python3.7 @@ -9,10 +9,22 @@ repos: hooks: - id: flake8 language: python_venv - additional_dependencies: [flake8-comprehensions] + additional_dependencies: [flake8-comprehensions>=3.1.0] - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.20 + rev: v4.3.21 hooks: - id: isort language: python_venv exclude: ^pandas/__init__\.py$|^pandas/core/api\.py$ +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.730 + hooks: + - id: mypy + # We run mypy over all files because of: + # * changes in type definitions may affect non-touched files. 
+ # * Running it with `mypy pandas` and the filenames will lead to + # spurious duplicate module errors, + # see also https://github.com/pre-commit/mirrors-mypy/issues/5 + pass_filenames: false + args: + - pandas diff --git a/.travis.yml b/.travis.yml index b9fa06304d387..a11cd469e9b9c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ language: python -python: 3.5 +python: 3.7 # To turn off cached cython files and compiler cache # set NOCACHE-true @@ -30,33 +30,29 @@ matrix: - python: 3.5 include: - - dist: bionic - # 18.04 - python: 3.8-dev - env: - - JOB="3.8-dev" PATTERN="(not slow and not network)" + - env: + - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network)" - - dist: trusty - env: + - env: - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network)" - - dist: trusty - env: - - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" + - env: + - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" + services: + - mysql + - postgresql - - dist: trusty - env: - - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true + - env: + - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" + services: + - mysql + - postgresql - # In allow_failures - - dist: trusty - env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" - - allow_failures: - - dist: trusty - env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" + - env: + - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1" + services: + - mysql + - postgresql before_install: - echo "before_install" @@ -85,19 +81,10 @@ install: - ci/submit_cython_cache.sh - echo "install done" - -before_script: - # display server (for clipboard functionality) needs to be started here, - # does not work if done in install:setup_env.sh (GH-26103) - - export DISPLAY=":99.0" - - echo "sh -e /etc/init.d/xvfb start" - - if [ "$JOB" != "3.8-dev" ]; then sh -e /etc/init.d/xvfb start; fi - - sleep 3 - script: - echo "script start" - echo "$JOB" - - if [ "$JOB" != "3.8-dev" ]; then source activate pandas-dev; fi + - source activate pandas-dev - ci/run_tests.sh after_script: diff --git a/LICENSES/MSGPACK_LICENSE b/LICENSES/MSGPACK_LICENSE deleted file mode 100644 index ae1b0f2f32f06..0000000000000 --- a/LICENSES/MSGPACK_LICENSE +++ /dev/null @@ -1,13 +0,0 @@ -Copyright (C) 2008-2011 INADA Naoki - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
\ No newline at end of file diff --git a/LICENSES/MSGPACK_NUMPY_LICENSE b/LICENSES/MSGPACK_NUMPY_LICENSE deleted file mode 100644 index e570011efac73..0000000000000 --- a/LICENSES/MSGPACK_NUMPY_LICENSE +++ /dev/null @@ -1,33 +0,0 @@ -.. -*- rst -*- - -License -======= - -Copyright (c) 2013, Lev Givon. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. -* Neither the name of Lev Givon nor the names of any - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/MANIFEST.in b/MANIFEST.in index adaad1dc1c864..cf6a1835433a4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -20,7 +20,6 @@ global-exclude *.gz global-exclude *.h5 global-exclude *.html global-exclude *.json -global-exclude *.msgpack global-exclude *.pickle global-exclude *.png global-exclude *.pyc diff --git a/Makefile b/Makefile index 27a2c3682de9c..f26689ab65ba5 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ lint-diff: git diff upstream/master --name-only -- "*.py" | xargs flake8 black: - black . --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)' + black . develop: build python -m pip install --no-build-isolation -e . diff --git a/README.md b/README.md index c299241722b7e..1130eb30954dc 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ Here are just a few of the things that pandas does well: and saving/loading data from the ultrafast [**HDF5 format**][hdfstore] - [**Time series**][timeseries]-specific functionality: date range generation and frequency conversion, moving window statistics, - moving window linear regressions, date shifting and lagging, etc. + date shifting and lagging. 
[missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data @@ -164,12 +164,11 @@ pip install pandas ``` ## Dependencies -- [NumPy](https://www.numpy.org): 1.13.3 or higher -- [python-dateutil](https://labix.org/python-dateutil): 2.5.0 or higher -- [pytz](https://pythonhosted.org/pytz): 2015.4 or higher +- [NumPy](https://www.numpy.org) +- [python-dateutil](https://labix.org/python-dateutil) +- [pytz](https://pythonhosted.org/pytz) -See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) -for recommended and optional dependencies. +See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies. ## Installation from sources To install pandas from source you need Cython in addition to the normal @@ -190,7 +189,7 @@ or for installing in [development mode](https://pip.pypa.io/en/latest/reference/ ```sh -python -m pip install --no-build-isolation -e . +python -m pip install -e . --no-build-isolation --no-use-pep517 ``` If you have `make`, you can also use `make develop` to run the same command. diff --git a/RELEASE.md b/RELEASE.md index efd075dabcba9..7924ffaff561f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -2,5 +2,5 @@ Release Notes ============= The list of changes to Pandas between each release can be found -[here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full +[here](https://pandas.pydata.org/pandas-docs/stable/whatsnew/index.html). For full details, see the commit logs at http://github.com/pandas-dev/pandas. diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index c04bbf53a86a6..cd1a31d4eaf34 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -7,7 +7,7 @@ "project": "pandas", // The project's homepage - "project_url": "http://pandas.pydata.org/", + "project_url": "https://pandas.pydata.org/", // The URL of the source code repository for the project being // benchmarked @@ -122,5 +122,8 @@ ".*": "0409521665" }, "regression_thresholds": { - } + }, + "build_command": + ["python setup.py build -j4", + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"], } diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 7d97f2c740acb..0f3b3838de1b2 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,7 +5,8 @@ from pandas._libs import lib import pandas as pd -from pandas.util import testing as tm + +from .pandas_vb_common import tm for imp in ["pandas.util", "pandas.tools.hashing"]: try: diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py new file mode 100644 index 0000000000000..8cbf8c8592661 --- /dev/null +++ b/asv_bench/benchmarks/array.py @@ -0,0 +1,23 @@ +import numpy as np + +import pandas as pd + + +class BooleanArray: + def setup(self): + self.values_bool = np.array([True, False, True, False]) + self.values_float = np.array([1.0, 0.0, 1.0, 0.0]) + self.values_integer = np.array([1, 0, 1, 0]) + self.values_integer_like = [1, 0, 1, 0] + + def time_from_bool_array(self): + pd.array(self.values_bool, dtype="boolean") + + def time_from_integer_array(self): + pd.array(self.values_integer, dtype="boolean") + + def time_from_integer_like(self): + pd.array(self.values_integer_like, dtype="boolean") + + def time_from_float_array(self): + pd.array(self.values_float, 
dtype="boolean") diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 58e0db67d6025..64e067d25a454 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,3 +1,5 @@ +import operator + import numpy as np from pandas import DataFrame, Series, date_range @@ -9,6 +11,36 @@ import pandas.computation.expressions as expr +class IntFrameWithScalar: + params = [ + [np.float64, np.int64], + [2, 3.0, np.int32(4), np.float64(5)], + [ + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.floordiv, + operator.pow, + operator.mod, + operator.eq, + operator.ne, + operator.gt, + operator.ge, + operator.lt, + operator.le, + ], + ] + param_names = ["dtype", "scalar", "op"] + + def setup(self, dtype, scalar, op): + arr = np.random.randn(20000, 100) + self.df = DataFrame(arr.astype(dtype)) + + def time_frame_op_with_scalar(self, dtype, scalar, op): + op(self.df, scalar) + + class Ops: params = [[True, False], ["default", 1]] diff --git a/asv_bench/benchmarks/boolean.py b/asv_bench/benchmarks/boolean.py new file mode 100644 index 0000000000000..71c422c641775 --- /dev/null +++ b/asv_bench/benchmarks/boolean.py @@ -0,0 +1,32 @@ +import numpy as np + +import pandas as pd + + +class TimeLogicalOps: + def setup(self): + N = 10_000 + left, right, lmask, rmask = np.random.randint(0, 2, size=(4, N)).astype("bool") + self.left = pd.arrays.BooleanArray(left, lmask) + self.right = pd.arrays.BooleanArray(right, rmask) + + def time_or_scalar(self): + self.left | True + self.left | False + + def time_or_array(self): + self.left | self.right + + def time_and_scalar(self): + self.left & True + self.left & False + + def time_and_array(self): + self.left & self.right + + def time_xor_scalar(self): + self.left ^ True + self.left ^ False + + def time_xor_array(self): + self.left ^ self.right diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 4384ccb7fa8b3..1dcd52ac074a6 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -3,7 +3,8 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas.api.types import union_categoricals @@ -14,21 +15,6 @@ pass -class Concat: - def setup(self): - N = 10 ** 5 - self.s = pd.Series(list("aabbcd") * N).astype("category") - - self.a = pd.Categorical(list("aabbcd") * N) - self.b = pd.Categorical(list("bbcdjk") * N) - - def time_concat(self): - pd.concat([self.s, self.s]) - - def time_union(self): - union_categoricals([self.a, self.b]) - - class Constructor: def setup(self): N = 10 ** 5 @@ -77,6 +63,33 @@ def time_existing_series(self): pd.Categorical(self.series) +class CategoricalOps: + params = ["__lt__", "__le__", "__eq__", "__ne__", "__ge__", "__gt__"] + param_names = ["op"] + + def setup(self, op): + N = 10 ** 5 + self.cat = pd.Categorical(list("aabbcd") * N, ordered=True) + + def time_categorical_op(self, op): + getattr(self.cat, op)("b") + + +class Concat: + def setup(self): + N = 10 ** 5 + self.s = pd.Series(list("aabbcd") * N).astype("category") + + self.a = pd.Categorical(list("aabbcd") * N) + self.b = pd.Categorical(list("bbcdjk") * N) + + def time_concat(self): + pd.concat([self.s, self.s]) + + def time_union(self): + union_categoricals([self.a, self.b]) + + class ValueCounts: params = [True, False] @@ -84,7 +97,7 @@ class ValueCounts: def setup(self, dropna): n = 5 * 10 ** 5 - arr = ["s{:04d}".format(i) for i in np.random.randint(0, 
n // 10, size=n)] + arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") def time_value_counts(self, dropna): @@ -102,7 +115,7 @@ def time_rendering(self): class SetCategories: def setup(self): n = 5 * 10 ** 5 - arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") def time_set_categories(self): @@ -112,7 +125,7 @@ def time_set_categories(self): class RemoveCategories: def setup(self): n = 5 * 10 ** 5 - arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + arr = [f"s{i:04d}" for i in np.random.randint(0, n // 10, size=n)] self.ts = pd.Series(arr).astype("category") def time_remove_categories(self): @@ -164,9 +177,9 @@ def setup(self, dtype): np.random.seed(1234) n = 5 * 10 ** 5 sample_size = 100 - arr = [i for i in np.random.randint(0, n // 10, size=n)] + arr = list(np.random.randint(0, n // 10, size=n)) if dtype == "object": - arr = ["s{:04d}".format(i) for i in arr] + arr = [f"s{i:04d}" for i in arr] self.sample = np.random.choice(arr, sample_size) self.series = pd.Series(arr).astype("category") @@ -225,7 +238,7 @@ def setup(self, index): elif index == "non_monotonic": self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories) else: - raise ValueError("Invalid index param: {}".format(index)) + raise ValueError(f"Invalid index param: {index}") self.scalar = 10000 self.list = list(range(10000)) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index a9e45cad22d27..7c43485f5ef45 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,7 +1,8 @@ import numpy as np from pandas import DatetimeIndex, Index, MultiIndex, Series, Timestamp -import pandas.util.testing as tm + +from .pandas_vb_common import tm def no_change(arr): diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 24cc1c6f9fa70..bd17b710b108d 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -5,6 +5,7 @@ from .pandas_vb_common import ( datetime_dtypes, extension_dtypes, + lib, numeric_dtypes, string_dtypes, ) @@ -40,4 +41,25 @@ def time_pandas_dtype_invalid(self, dtype): pass +class InferDtypes: + param_names = ["dtype"] + data_dict = { + "np-object": np.array([1] * 100000, dtype="O"), + "py-object": [1] * 100000, + "np-null": np.array([1] * 50000 + [np.nan] * 50000), + "py-null": [1] * 50000 + [None] * 50000, + "np-int": np.array([1] * 100000, dtype=int), + "np-floating": np.array([1.0] * 100000, dtype=float), + "empty": [], + "bytes": [b"a"] * 100000, + } + params = list(data_dict.keys()) + + def time_infer_skipna(self, dtype): + lib.infer_dtype(self.data_dict[dtype], skipna=True) + + def time_infer(self, dtype): + lib.infer_dtype(self.data_dict[dtype], skipna=False) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 3944e0bc523d8..2b24bab85bc57 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,7 +1,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas.tseries.offsets import Nano, Hour @@ -99,10 +100,22 @@ class FromLists: def setup(self): N = 1000 M = 100 - self.data = [[j for j in range(M)] for i in range(N)] + 
self.data = [list(range(M)) for i in range(N)] def time_frame_from_lists(self): self.df = DataFrame(self.data) +class FromRange: + + goal_time = 0.2 + + def setup(self): + N = 1_000_000 + self.data = range(N) + + def time_frame_from_range(self): + self.df = DataFrame(self.data) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index eb9a0e83271f1..2187668c96ca4 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -4,7 +4,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm class GetNumericData: @@ -321,10 +322,9 @@ class Dropna: def setup(self, how, axis): self.df = DataFrame(np.random.randn(10000, 1000)) - with warnings.catch_warnings(record=True): - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan + self.df.iloc[50:1000, 20:50] = np.nan + self.df.iloc[2000:3000] = np.nan + self.df.iloc[:, 60:70] = np.nan self.df_mixed = self.df.copy() self.df_mixed["foo"] = "bar" @@ -342,10 +342,9 @@ class Count: def setup(self, axis): self.df = DataFrame(np.random.randn(10000, 1000)) - with warnings.catch_warnings(record=True): - self.df.ix[50:1000, 20:50] = np.nan - self.df.ix[2000:3000] = np.nan - self.df.ix[:, 60:70] = np.nan + self.df.iloc[50:1000, 20:50] = np.nan + self.df.iloc[2000:3000] = np.nan + self.df.iloc[:, 60:70] = np.nan self.df_mixed = self.df.copy() self.df_mixed["foo"] = "bar" @@ -565,7 +564,7 @@ def setup(self): def time_frame_get_dtype_counts(self): with warnings.catch_warnings(record=True): - self.df.get_dtype_counts() + self.df._data.get_dtype_counts() def time_info(self): self.df.info() diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index d57492dd37268..e266d871f5bc6 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -2,7 +2,8 @@ from pandas import DataFrame, Series, date_range, factorize, read_csv from pandas.core.algorithms import take_1d -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas import ( @@ -24,7 +25,7 @@ except ImportError: from pandas import algos try: - from pandas.util.testing import test_parallel + from pandas._testing import test_parallel have_real_test_parallel = True except ImportError: @@ -37,7 +38,7 @@ def wrapper(fname): return wrapper -from .pandas_vb_common import BaseIO # noqa: E402 isort:skip +from .pandas_vb_common import BaseIO # isort:skip class ParallelGroupbyMethods: @@ -250,13 +251,11 @@ def setup(self, dtype): np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows) ), "object": DataFrame( - "foo", - index=range(rows), - columns=["object%03d".format(i) for i in range(5)], + "foo", index=range(rows), columns=["object%03d" for _ in range(5)] ), } - self.fname = "__test_{}__.csv".format(dtype) + self.fname = f"__test_{dtype}__.csv" df = data[dtype] df.to_csv(self.fname) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index d51c53e2264f1..28e0dcc5d9b13 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -13,7 +13,8 @@ date_range, period_range, ) -import pandas.util.testing as tm + +from .pandas_vb_common import tm method_blacklist = { "object": { diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 
a94960d494707..103141545504b 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -7,11 +7,13 @@ Float64Index, Index, IntervalIndex, + MultiIndex, RangeIndex, Series, date_range, ) -import pandas.util.testing as tm + +from .pandas_vb_common import tm class SetOperations: @@ -111,6 +113,18 @@ def time_get_loc_dec(self): self.idx_dec.get_loc(100000) +class IndexEquals: + def setup(self): + idx_large_fast = RangeIndex(100000) + idx_small_slow = date_range(start="1/1/2012", periods=1) + self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) + + self.idx_non_object = RangeIndex(1) + + def time_non_object_equals_multiindex(self): + self.idx_non_object.equals(self.mi_large_slow) + + class IndexAppend: def setup(self): @@ -146,7 +160,7 @@ class Indexing: def setup(self, dtype): N = 10 ** 6 - self.idx = getattr(tm, "make{}Index".format(dtype))(N) + self.idx = getattr(tm, f"make{dtype}Index")(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) self.sorted = self.idx.sort_values() diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index ac35139c1954a..087fe3916845b 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -17,7 +17,8 @@ option_context, period_range, ) -import pandas.util.testing as tm + +from .pandas_vb_common import tm class NumericSeriesIndexing: @@ -67,22 +68,6 @@ def time_iloc_scalar(self, index, index_structure): def time_iloc_slice(self, index, index_structure): self.data.iloc[:800000] - def time_ix_array(self, index, index_structure): - with warnings.catch_warnings(record=True): - self.data.ix[self.array] - - def time_ix_list_like(self, index, index_structure): - with warnings.catch_warnings(record=True): - self.data.ix[[800000]] - - def time_ix_scalar(self, index, index_structure): - with warnings.catch_warnings(record=True): - self.data.ix[800000] - - def time_ix_slice(self, index, index_structure): - with warnings.catch_warnings(record=True): - self.data.ix[:800000] - def time_loc_array(self, index, index_structure): self.data.loc[self.array] @@ -147,10 +132,7 @@ def setup(self): self.col_scalar = columns[10] self.bool_indexer = self.df[self.col_scalar] > 0 self.bool_obj_indexer = self.bool_indexer.astype(object) - - def time_ix(self): - with warnings.catch_warnings(record=True): - self.df.ix[self.idx_scalar, self.col_scalar] + self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean") def time_loc(self): self.df.loc[self.idx_scalar, self.col_scalar] @@ -164,6 +146,9 @@ def time_boolean_rows(self): def time_boolean_rows_object(self): self.df[self.bool_obj_indexer] + def time_boolean_rows_boolean(self): + self.df[self.boolean_indexer] + class DataFrameNumericIndexing: def setup(self): @@ -228,14 +213,6 @@ def setup(self): self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index() - def time_series_ix(self): - with warnings.catch_warnings(record=True): - self.s.ix[999] - - def time_frame_ix(self): - with warnings.catch_warnings(record=True): - self.df.ix[999] - def time_index_slice(self): self.mdt.loc[self.idx, :] @@ -310,10 +287,6 @@ def setup_cache(self): def time_lookup_iloc(self, s): s.iloc - def time_lookup_ix(self, s): - with warnings.catch_warnings(record=True): - s.ix - def time_lookup_loc(self, s): s.loc diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index e85b3bd2c7687..1a8d5ede52512 100644 
--- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, Series, to_numeric -import pandas.util.testing as tm -from .pandas_vb_common import lib, numeric_dtypes +from .pandas_vb_common import lib, numeric_dtypes, tm class NumericInferOps: diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9b8599b0a1b64..9bcd125f56bbb 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -5,9 +5,8 @@ import numpy as np from pandas import Categorical, DataFrame, date_range, read_csv, to_datetime -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class ToCSV(BaseIO): @@ -132,7 +131,7 @@ class ReadCSVConcatDatetimeBadDateValue(StringIORewind): param_names = ["bad_date_value"] def setup(self, bad_date_value): - self.StringIO_input = StringIO(("%s,\n" % bad_date_value) * 50000) + self.StringIO_input = StringIO((f"{bad_date_value},\n") * 50000) def time_read_csv(self, bad_date_value): read_csv( @@ -202,7 +201,7 @@ def setup(self, sep, thousands): data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) df = DataFrame(data) if thousands is not None: - fmt = ":{}".format(thousands) + fmt = f":{thousands}" fmt = "{" + fmt + "}" df = df.applymap(lambda x: fmt.format(x)) df.to_csv(self.fname, sep=sep) @@ -231,7 +230,7 @@ def setup(self, sep, decimal, float_precision): floats = [ "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15) ] - rows = sep.join(["0{}".format(decimal) + "{}"] * 3) + "\n" + rows = sep.join([f"0{decimal}" + "{}"] * 3) + "\n" data = rows * 5 data = data.format(*floats) * 200 # 1000 x 3 strings csv self.StringIO_input = StringIO(data) @@ -309,9 +308,7 @@ class ReadCSVCachedParseDates(StringIORewind): param_names = ["do_cache"] def setup(self, do_cache): - data = ( - "\n".join("10/{}".format(year) for year in range(2000, 2100)) + "\n" - ) * 10 + data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10 self.StringIO_input = StringIO(data) def time_read_csv_cached(self, do_cache): @@ -336,7 +333,7 @@ class ReadCSVMemoryGrowth(BaseIO): def setup(self): with open(self.fname, "w") as f: for i in range(self.num_rows): - f.write("{i}\n".format(i=i)) + f.write(f"{i}\n") def mem_parser_chunks(self): # see gh-24805. 
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index c97cf768e27d9..80af2cff41769 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -6,7 +6,8 @@ from odf.text import P from pandas import DataFrame, ExcelWriter, date_range, read_excel -import pandas.util.testing as tm + +from ..pandas_vb_common import tm def _generate_dataframe(): @@ -14,7 +15,7 @@ def _generate_dataframe(): C = 5 df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index b78dc63d17130..4ca399a293a4b 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, HDFStore, date_range, read_hdf -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class HDFStoreDataFrame(BaseIO): @@ -115,7 +114,7 @@ def setup(self, format): C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 5c1d39776b91c..f478bf2aee0ba 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, concat, date_range, read_json, timedelta_range -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class ReadJSON(BaseIO): @@ -20,7 +19,7 @@ def setup(self, orient, index): } df = DataFrame( np.random.randn(N, 5), - columns=["float_{}".format(i) for i in range(5)], + columns=[f"float_{i}" for i in range(5)], index=indexes[index], ) df.to_json(self.fname, orient=orient) @@ -43,7 +42,7 @@ def setup(self, index): } df = DataFrame( np.random.randn(N, 5), - columns=["float_{}".format(i) for i in range(5)], + columns=[f"float_{i}" for i in range(5)], index=indexes[index], ) df.to_json(self.fname, orient="records", lines=True) @@ -132,6 +131,30 @@ def peakmem_to_json_wide(self, orient, frame): df.to_json(self.fname, orient=orient) +class ToJSONISO(BaseIO): + fname = "__test__.json" + params = [["split", "columns", "index", "values", "records"]] + param_names = ["orient"] + + def setup(self, orient): + N = 10 ** 5 + index = date_range("20000101", periods=N, freq="H") + timedeltas = timedelta_range(start=1, periods=N, freq="s") + datetimes = date_range(start=1, periods=N, freq="s") + self.df = DataFrame( + { + "td_1": timedeltas, + "td_2": timedeltas, + "ts_1": datetimes, + "ts_2": datetimes, + }, + index=index, + ) + + def time_iso_format(self, orient): + self.df.to_json(orient=orient, date_format="iso") + + class ToJSONLines(BaseIO): fname = "__test__.json" diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py deleted file mode 100644 index f5038602539ab..0000000000000 --- a/asv_bench/benchmarks/io/msgpack.py +++ /dev/null @@ -1,32 +0,0 @@ -import warnings - -import numpy as np - -from pandas import DataFrame, date_range, read_msgpack -import pandas.util.testing as tm - -from ..pandas_vb_common import BaseIO - - -class MSGPack(BaseIO): - def setup(self): - self.fname = 
"__test__.msg" - N = 100000 - C = 5 - self.df = DataFrame( - np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), - ) - self.df["object"] = tm.makeStringIndex(N) - with warnings.catch_warnings(record=True): - self.df.to_msgpack(self.fname) - - def time_read_msgpack(self): - read_msgpack(self.fname) - - def time_write_msgpack(self): - self.df.to_msgpack(self.fname) - - -from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 647e9d27dec9d..4ca9a82ae4827 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, date_range, read_pickle -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class Pickle(BaseIO): @@ -13,7 +12,7 @@ def setup(self): C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(N) diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py index 7ce8ef8c12639..5eaeb231b031b 100644 --- a/asv_bench/benchmarks/io/sas.py +++ b/asv_bench/benchmarks/io/sas.py @@ -26,5 +26,5 @@ def setup(self, format): ] self.f = os.path.join(*paths) - def time_read_msgpack(self, format): + def time_read_sas(self, format): read_sas(self.f, format=format) diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index fe84c869717e3..b71bb832280b9 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -4,7 +4,8 @@ from sqlalchemy import create_engine from pandas import DataFrame, date_range, read_sql_query, read_sql_table -import pandas.util.testing as tm + +from ..pandas_vb_common import tm class SQL: @@ -19,7 +20,7 @@ def setup(self, connection): "sqlite": sqlite3.connect(":memory:"), } self.table_name = "test_type" - self.query_all = "SELECT * FROM {}".format(self.table_name) + self.query_all = f"SELECT * FROM {self.table_name}" self.con = con[connection] self.df = DataFrame( { @@ -58,7 +59,7 @@ def setup(self, connection, dtype): "sqlite": sqlite3.connect(":memory:"), } self.table_name = "test_type" - self.query_col = "SELECT {} FROM {}".format(dtype, self.table_name) + self.query_col = f"SELECT {dtype} FROM {self.table_name}" self.con = con[connection] self.df = DataFrame( { diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index 28829785d72e9..9faafa82ff46e 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, date_range, read_stata -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class Stata(BaseIO): @@ -17,7 +16,7 @@ def setup(self, convert_dates): C = self.C = 5 self.df = DataFrame( np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], + columns=[f"float{i}" for i in range(C)], index=date_range("20000101", periods=N, freq="H"), ) self.df["object"] = tm.makeStringIndex(self.N) @@ -47,7 +46,7 @@ def setup(self, convert_dates): for i in range(10): missing_data = np.random.randn(self.N) missing_data[missing_data < 0] = np.nan - self.df["missing_{0}".format(i)] = missing_data + self.df[f"missing_{i}"] = missing_data 
self.df.to_stata(self.fname, self.convert_dates) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 5cf9f6336ba0c..1333b3a0f0560 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -3,7 +3,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, Series, concat, date_range, merge, merge_asof -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas import merge_ordered diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 3f4fd7ad911c1..0e188c58012fa 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -2,8 +2,9 @@ import numpy as np -from pandas import DataFrame, MultiIndex, date_range -import pandas.util.testing as tm +from pandas import DataFrame, MultiIndex, RangeIndex, date_range + +from .pandas_vb_common import tm class GetLoc: @@ -147,4 +148,16 @@ def time_categorical_level(self): self.df.set_index(["a", "b"]) +class Equals: + def setup(self): + idx_large_fast = RangeIndex(100000) + idx_small_slow = date_range(start="1/1/2012", periods=1) + self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) + + self.idx_non_object = RangeIndex(1) + + def time_equals_non_object_index(self): + self.mi_large_slow.equals(self.idx_non_object) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index a960f43f46acd..77ce1b2763bce 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -3,7 +3,7 @@ import pandas as pd try: - import pandas.tseries.holiday # noqa + import pandas.tseries.holiday except ImportError: pass diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 1faf13329110d..6da2b2270c04a 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -13,6 +13,13 @@ except (ImportError, TypeError, ValueError): pass +# Compatibility import for the testing module +try: + import pandas._testing as tm # noqa +except ImportError: + import pandas.util.testing as tm # noqa + + numeric_dtypes = [ np.int64, np.int32, diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index cd450f801c805..03394e6fe08cb 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, Index, MultiIndex, Series, date_range, period_range -import pandas.util.testing as tm -from .pandas_vb_common import lib +from .pandas_vb_common import lib, tm class Reindex: diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index a3f1d92545c3f..57c625ced8a43 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -3,7 +3,8 @@ import numpy as np from pandas import NaT, Series, date_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm class SeriesConstructor: diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index ed5ebfa61594e..ec67394e55a1e 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -7,20 +7,14 @@ class FrameOps: - params = [ops, ["float", "int"], [0, 1], [True, False]] - param_names = ["op", "dtype", "axis", "use_bottleneck"] + params = [ops, ["float", "int"], [0, 1]] + param_names = ["op", "dtype", 
"axis"] - def setup(self, op, dtype, axis, use_bottleneck): + def setup(self, op, dtype, axis): df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) - try: - pd.options.compute.use_bottleneck = use_bottleneck - except TypeError: - from pandas.core import nanops - - nanops._USE_BOTTLENECK = use_bottleneck self.df_func = getattr(df, op) - def time_op(self, op, dtype, axis, use_bottleneck): + def time_op(self, op, dtype, axis): self.df_func(axis=axis) @@ -46,20 +40,14 @@ def time_op(self, level, op): class SeriesOps: - params = [ops, ["float", "int"], [True, False]] - param_names = ["op", "dtype", "use_bottleneck"] + params = [ops, ["float", "int"]] + param_names = ["op", "dtype"] - def setup(self, op, dtype, use_bottleneck): + def setup(self, op, dtype): s = pd.Series(np.random.randn(100000)).astype(dtype) - try: - pd.options.compute.use_bottleneck = use_bottleneck - except TypeError: - from pandas.core import nanops - - nanops._USE_BOTTLENECK = use_bottleneck self.s_func = getattr(s, op) - def time_op(self, op, dtype, use_bottleneck): + def time_op(self, op, dtype): self.s_func() @@ -101,61 +89,49 @@ def time_average_old(self, constructor, pct): class Correlation: - params = [["spearman", "kendall", "pearson"], [True, False]] - param_names = ["method", "use_bottleneck"] + params = [["spearman", "kendall", "pearson"]] + param_names = ["method"] - def setup(self, method, use_bottleneck): - try: - pd.options.compute.use_bottleneck = use_bottleneck - except TypeError: - from pandas.core import nanops + def setup(self, method): + self.df = pd.DataFrame(np.random.randn(500, 15)) + self.df2 = pd.DataFrame(np.random.randn(500, 15)) + self.df_wide = pd.DataFrame(np.random.randn(500, 100)) + self.df_wide_nans = self.df_wide.where(np.random.random((500, 100)) < 0.9) + self.s = pd.Series(np.random.randn(500)) + self.s2 = pd.Series(np.random.randn(500)) - nanops._USE_BOTTLENECK = use_bottleneck - self.df = pd.DataFrame(np.random.randn(1000, 30)) - self.df2 = pd.DataFrame(np.random.randn(1000, 30)) - self.df_wide = pd.DataFrame(np.random.randn(1000, 200)) - self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9) - self.s = pd.Series(np.random.randn(1000)) - self.s2 = pd.Series(np.random.randn(1000)) - - def time_corr(self, method, use_bottleneck): + def time_corr(self, method): self.df.corr(method=method) - def time_corr_wide(self, method, use_bottleneck): + def time_corr_wide(self, method): self.df_wide.corr(method=method) - def time_corr_wide_nans(self, method, use_bottleneck): + def time_corr_wide_nans(self, method): self.df_wide_nans.corr(method=method) - def peakmem_corr_wide(self, method, use_bottleneck): + def peakmem_corr_wide(self, method): self.df_wide.corr(method=method) - def time_corr_series(self, method, use_bottleneck): + def time_corr_series(self, method): self.s.corr(self.s2, method=method) - def time_corrwith_cols(self, method, use_bottleneck): + def time_corrwith_cols(self, method): self.df.corrwith(self.df2, method=method) - def time_corrwith_rows(self, method, use_bottleneck): + def time_corrwith_rows(self, method): self.df.corrwith(self.df2, axis=1, method=method) class Covariance: - params = [[True, False]] - param_names = ["use_bottleneck"] - - def setup(self, use_bottleneck): - try: - pd.options.compute.use_bottleneck = use_bottleneck - except TypeError: - from pandas.core import nanops + params = [] + param_names = [] - nanops._USE_BOTTLENECK = use_bottleneck + def setup(self): self.s = pd.Series(np.random.randn(100000)) self.s2 = 
pd.Series(np.random.randn(100000)) - def time_cov_series(self, use_bottleneck): + def time_cov_series(self): self.s.cov(self.s2) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index f30b2482615bd..d7fb2775376c0 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -3,7 +3,8 @@ import numpy as np from pandas import DataFrame, Series -import pandas.util.testing as tm + +from .pandas_vb_common import tm class Methods: diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 828134b80aa3d..37418d752f833 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -14,8 +14,8 @@ def setup(self): self.str_days = [] self.str_seconds = [] for i in self.ints: - self.str_days.append("{0} days".format(i)) - self.str_seconds.append("00:00:{0:02d}".format(i)) + self.str_days.append(f"{i} days") + self.str_seconds.append(f"00:00:{i:02d}") def time_convert_int(self): to_timedelta(self.ints, unit="s") @@ -34,7 +34,7 @@ class ToTimedeltaErrors: def setup(self, errors): ints = np.random.randint(0, 60, size=10000) - self.arr = ["{0} days".format(i) for i in ints] + self.arr = [f"{i} days" for i in ints] self.arr[-1] = "apple" def time_convert(self, errors): diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 498774034d642..ba0b51922fd31 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -113,7 +113,7 @@ class InferFreq: def setup(self, freq): if freq is None: self.idx = date_range(start="1/1/1700", freq="D", periods=10000) - self.idx.freq = None + self.idx._data._freq = None else: self.idx = date_range(start="1/1/1700", freq=freq, periods=10000) diff --git a/asv_bench/benchmarks/tslibs/offsets.py b/asv_bench/benchmarks/tslibs/offsets.py index d6379b922641c..fc1efe63307b2 100644 --- a/asv_bench/benchmarks/tslibs/offsets.py +++ b/asv_bench/benchmarks/tslibs/offsets.py @@ -59,7 +59,7 @@ def setup(self, offset): def time_on_offset(self, offset): for date in self.dates: - offset.onOffset(date) + offset.is_on_offset(date) class OffestDatetimeArithmetic: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6fb8241d6d600..57032932b878c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -16,95 +16,6 @@ jobs: name: Windows vmImage: vs2017-win2016 -- job: 'Checks' - pool: - vmImage: ubuntu-16.04 - timeoutInMinutes: 90 - steps: - - script: | - echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' - echo '##vso[task.setvariable variable=AZURE]true' - displayName: 'Setting environment variables' - - # Do not require a conda environment - - script: ci/code_checks.sh patterns - displayName: 'Looking for unwanted patterns' - condition: true - - - script: | - sudo apt-get update - sudo apt-get install -y libc6-dev-i386 - ci/setup_env.sh - displayName: 'Setup environment and build pandas' - condition: true - - # Do not require pandas - - script: | - source activate pandas-dev - ci/code_checks.sh lint - displayName: 'Linting' - condition: true - - - script: | - source activate pandas-dev - ci/code_checks.sh dependencies - displayName: 'Dependencies consistency' - condition: true - - # Require pandas - - script: | - source activate pandas-dev - ci/code_checks.sh code - displayName: 'Checks on imported code' - condition: true - - - script: | - source activate pandas-dev - ci/code_checks.sh doctests - displayName: 'Running doctests' - condition: 
true - - - script: | - source activate pandas-dev - ci/code_checks.sh docstrings - displayName: 'Docstring validation' - condition: true - - - script: | - source activate pandas-dev - ci/code_checks.sh typing - displayName: 'Typing validation' - condition: true - - - script: | - source activate pandas-dev - pytest --capture=no --strict scripts - displayName: 'Testing docstring validation script' - condition: true - - - script: | - source activate pandas-dev - cd asv_bench - asv check -E existing - git remote add upstream https://github.com/pandas-dev/pandas.git - git fetch upstream - if git diff upstream/master --name-only | grep -q "^asv_bench/"; then - asv machine --yes - ASV_OUTPUT="$(asv dev)" - if [[ $(echo "$ASV_OUTPUT" | grep "failed") ]]; then - echo "##vso[task.logissue type=error]Benchmarks run with errors" - echo "$ASV_OUTPUT" - exit 1 - else - echo "Benchmarks run without errors" - fi - else - echo "Benchmarks did not run, no changes detected" - fi - displayName: 'Running benchmarks' - condition: true - - job: 'Web_and_Docs' pool: vmImage: ubuntu-16.04 diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index 281107559a38c..55e8e839f4fae 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -9,29 +9,34 @@ jobs: strategy: matrix: ${{ if eq(parameters.name, 'macOS') }}: - py35_macos: - ENV_FILE: ci/deps/azure-macos-35.yaml - CONDA_PY: "35" + py36_macos: + ENV_FILE: ci/deps/azure-macos-36.yaml + CONDA_PY: "36" PATTERN: "not slow and not network" ${{ if eq(parameters.name, 'Linux') }}: - py35_compat: - ENV_FILE: ci/deps/azure-35-compat.yaml - CONDA_PY: "35" + py36_minimum_versions: + ENV_FILE: ci/deps/azure-36-minimum_versions.yaml + CONDA_PY: "36" PATTERN: "not slow and not network" py36_locale_slow_old_np: - ENV_FILE: ci/deps/azure-36-locale.yaml + ENV_FILE: ci/deps/azure-36-locale_slow.yaml CONDA_PY: "36" PATTERN: "slow" - LOCALE_OVERRIDE: "zh_CN.UTF-8" + # pandas does not use the language (zh_CN), but should support diferent encodings (utf8) + # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any + LANG: "zh_CN.utf8" + LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans" - py36_locale_slow: - ENV_FILE: ci/deps/azure-36-locale_slow.yaml + py36_locale: + ENV_FILE: ci/deps/azure-36-locale.yaml CONDA_PY: "36" PATTERN: "not slow and not network" - LOCALE_OVERRIDE: "it_IT.UTF-8" + LANG: "it_IT.utf8" + LC_ALL: "it_IT.utf8" + EXTRA_APT: "language-pack-it" py36_32bit: ENV_FILE: ci/deps/azure-36-32bit.yaml @@ -43,7 +48,9 @@ jobs: ENV_FILE: ci/deps/azure-37-locale.yaml CONDA_PY: "37" PATTERN: "not slow and not network" - LOCALE_OVERRIDE: "zh_CN.UTF-8" + LANG: "zh_CN.utf8" + LC_ALL: "zh_CN.utf8" + EXTRA_APT: "language-pack-zh-hans" py37_np_dev: ENV_FILE: ci/deps/azure-37-numpydev.yaml @@ -55,10 +62,16 @@ jobs: steps: - script: | - if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386 $EXTRA_APT; fi - echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - echo "Creating Environment" - ci/setup_env.sh + if [ "$(uname)" == "Linux" ]; then + sudo apt-get update + sudo apt-get install -y libc6-dev-i386 $EXTRA_APT + fi + displayName: 'Install extra packages' + + - script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' + displayName: 'Set conda path' + + - script: ci/setup_env.sh displayName: 'Setup environment and build pandas' - script: | @@ -70,37 +83,13 @@ jobs: displayName: 'Build versions' - task: PublishTestResults@2 + condition: succeededOrFailed() inputs: - testResultsFiles: 'test-data-*.xml' + 
failTaskOnFailedTests: true + testResultsFiles: 'test-data.xml' testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} displayName: 'Publish test results' - - powershell: | - $junitXml = "test-data-single.xml" - $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"' - if ($matches[1] -eq 0) - { - Write-Host "No test failures in test-data-single" - } - else - { - # note that this will produce $LASTEXITCODE=1 - Write-Error "$($matches[1]) tests failed" - } - - $junitXmlMulti = "test-data-multiple.xml" - $(Get-Content $junitXmlMulti | Out-String) -match 'failures="(.*?)"' - if ($matches[1] -eq 0) - { - Write-Host "No test failures in test-data-multi" - } - else - { - # note that this will produce $LASTEXITCODE=1 - Write-Error "$($matches[1]) tests failed" - } - displayName: 'Check for test failures' - - script: | source activate pandas-dev python ci/print_skipped.py diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index dfa82819b9826..187a5db99802f 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -11,49 +11,47 @@ jobs: py36_np15: ENV_FILE: ci/deps/azure-windows-36.yaml CONDA_PY: "36" + PATTERN: "not slow and not network" py37_np141: ENV_FILE: ci/deps/azure-windows-37.yaml CONDA_PY: "37" + PATTERN: "not slow and not network" steps: - powershell: | Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin" displayName: 'Add conda to PATH' + - script: conda update -q -n base conda - displayName: Update conda - - script: | - call activate + displayName: 'Update conda' + + - bash: | conda env create -q --file ci\\deps\\azure-windows-$(CONDA_PY).yaml displayName: 'Create anaconda environment' - - script: | - call activate pandas-dev - call conda list - ci\\incremental\\build.cmd + + - bash: | + source activate pandas-dev + conda list + python setup.py build_ext -q -i -j 4 + python -m pip install --no-build-isolation -e . 
displayName: 'Build' - - script: | - call activate pandas-dev - pytest -m "not slow and not network" --junitxml=test-data.xml pandas -n 2 -r sxX --strict --durations=10 %* + + - bash: | + source activate pandas-dev + ci/run_tests.sh displayName: 'Test' + - task: PublishTestResults@2 + condition: succeededOrFailed() inputs: + failTaskOnFailedTests: true testResultsFiles: 'test-data.xml' - testRunTitle: 'Windows-$(CONDA_PY)' - - powershell: | - $junitXml = "test-data.xml" - $(Get-Content $junitXml | Out-String) -match 'failures="(.*?)"' - if ($matches[1] -eq 0) - { - Write-Host "No test failures in test-data" - } - else - { - # note that this will produce $LASTEXITCODE=1 - Write-Error "$($matches[1]) tests failed" - } - displayName: 'Check for test failures' - - script: | + testRunTitle: ${{ format('{0}-$(CONDA_PY)', parameters.name) }} + displayName: 'Publish test results' + + - bash: | source activate pandas-dev python ci/print_skipped.py displayName: 'Print skipped tests' diff --git a/ci/build38.sh b/ci/build38.sh deleted file mode 100644 index 66eb5cad38475..0000000000000 --- a/ci/build38.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -e -# Special build for python3.8 until numpy puts its own wheels up - -sudo apt-get install build-essential gcc xvfb -pip install --no-deps -U pip wheel setuptools -pip install python-dateutil pytz cython pytest pytest-xdist hypothesis - -# Possible alternative for getting numpy: -pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy - -python setup.py build_ext -inplace -python -m pip install -v --no-build-isolation -e . - -python -c "import sys; print(sys.version_info)" -python -c "import pandas as pd" -python -c "import hypothesis" - -# TODO: Is there anything else in setup_env that we really want to do? -# ci/setup_env.sh diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f704a1018c926..83ceb11dfcbf4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -34,17 +34,13 @@ function invgrep { # # This is useful for the CI, as we want to fail if one of the patterns # that we want to avoid is found by grep. - if [[ "$AZURE" == "true" ]]; then - set -o pipefail - grep -n "$@" | awk -F ":" '{print "##vso[task.logissue type=error;sourcepath=" $1 ";linenumber=" $2 ";] Found unwanted pattern: " $3}' - else - grep "$@" - fi - return $((! $?)) + grep -n "$@" | sed "s/^/$INVGREP_PREPEND/" | sed "s/$/$INVGREP_APPEND/" ; EXIT_STATUS=${PIPESTATUS[0]} + return $((! $EXIT_STATUS)) } -if [[ "$AZURE" == "true" ]]; then - FLAKE8_FORMAT="##vso[task.logissue type=error;sourcepath=%(path)s;linenumber=%(row)s;columnnumber=%(col)s;code=%(code)s;]%(text)s" +if [[ "$GITHUB_ACTIONS" == "true" ]]; then + FLAKE8_FORMAT="##[error]%(path)s:%(row)s:%(col)s:%(code)s:%(text)s" + INVGREP_PREPEND="##[error]" else FLAKE8_FORMAT="default" fi @@ -56,7 +52,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then black --version MSG='Checking black formatting' ; echo $MSG - black . --check --exclude '(asv_bench/env|\.egg|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist|setup.py)' + black . --check RET=$(($RET + $?)) ; echo $MSG "DONE" # `setup.cfg` contains the list of error codes that are being ignored in flake8 @@ -98,18 +94,31 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # We don't lint all C files because we don't want to lint any that are built # from Cython files nor do we want to lint C files that we didn't modify for - # this particular codebase (e.g. src/headers, src/klib, src/msgpack). 
However, + # this particular codebase (e.g. src/headers, src/klib). However, # we can lint all header files since they aren't "generated" like C files are. MSG='Linting .c and .h' ; echo $MSG - cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime pandas/io/msgpack pandas/_libs/*.cpp pandas/util + cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime pandas/_libs/*.cpp + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for use of not concatenated strings' ; echo $MSG + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + $BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:{msg}" . + else + $BASE_DIR/scripts/validate_string_concatenation.py . + fi RET=$(($RET + $?)) ; echo $MSG "DONE" echo "isort --version-number" isort --version-number # Imports - Check formatting using isort see setup.cfg for settings - MSG='Check import format using isort ' ; echo $MSG - isort --recursive --check-only pandas asv_bench + MSG='Check import format using isort' ; echo $MSG + ISORT_CMD="isort --recursive --check-only pandas asv_bench" + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) + else + eval $ISORT_CMD + fi RET=$(($RET + $?)) ; echo $MSG "DONE" fi @@ -120,9 +129,20 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then # Check for imports from pandas.core.common instead of `import pandas.core.common as com` # Check for imports from collections.abc instead of `from collections import abc` MSG='Check for non-standard imports' ; echo $MSG - invgrep -R --include="*.py*" -E "from pandas.core.common import " pandas - invgrep -R --include="*.py*" -E "from collections.abc import " pandas - invgrep -R --include="*.py*" -E "from numpy import nan " pandas + invgrep -R --include="*.py*" -E "from pandas.core.common import" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + invgrep -R --include="*.py*" -E "from pandas.core import common" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + invgrep -R --include="*.py*" -E "from collections.abc import" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + invgrep -R --include="*.py*" -E "from numpy import nan" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + + # Checks for test suite + # Check for imports from pandas._testing instead of `import pandas._testing as tm` + invgrep -R --include="*.py*" -E "from pandas._testing import" pandas/tests + RET=$(($RET + $?)) ; echo $MSG "DONE" + invgrep -R --include="*.py*" -E "from pandas.util import testing as tm" pandas/tests RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for use of exec' ; echo $MSG @@ -184,15 +204,31 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.rst" ".. ipython ::" doc/source RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for extra blank lines after the class definition' ; echo $MSG + invgrep -R --include="*.py" --include="*.pyx" -E 'class.*:\n\n( )+"""' . 
+ RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for use of {foo!r} instead of {repr(foo)}' ; echo $MSG + invgrep -R --include=*.{py,pyx} '!r}' pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for use of comment-based annotation syntax' ; echo $MSG + invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG + invgrep -R --include=*.{py,pyx} '\.__class__' pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for use of xrange instead of range' ; echo $MSG + invgrep -R --include=*.{py,pyx} 'xrange' pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG - set -o pipefail - if [[ "$AZURE" == "true" ]]; then - # we exclude all c/cpp files as the c/cpp files of pandas code base are tested when Linting .c and .h files - ! grep -n '--exclude=*.'{svg,c,cpp,html,js} --exclude-dir=env -RI "\s$" * | awk -F ":" '{print "##vso[task.logissue type=error;sourcepath=" $1 ";linenumber=" $2 ";] Tailing whitespaces found: " $3}' - else - ! grep -n '--exclude=*.'{svg,c,cpp,html,js} --exclude-dir=env -RI "\s$" * | awk -F ":" '{print $1 ":" $2 ":Tailing whitespaces found: " $3}' - fi + INVGREP_APPEND=" <- trailing whitespaces found" + invgrep -RI --exclude=\*.{svg,c,cpp,html,js} --exclude-dir=env "\s$" * RET=$(($RET + $?)) ; echo $MSG "DONE" + unset INVGREP_APPEND fi ### CODE ### @@ -262,8 +298,15 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests arrays/string_.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/arrays/string_.py + MSG='Doctests arrays'; echo $MSG + pytest -q --doctest-modules \ + pandas/core/arrays/string_.py \ + pandas/core/arrays/integer.py \ + pandas/core/arrays/boolean.py + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Doctests arrays/boolean.py' ; echo $MSG + pytest -q --doctest-modules pandas/core/arrays/boolean.py RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml index 1e2e6c33e8c15..cf3fca307481f 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-36-32bit.yaml @@ -3,21 +3,24 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + ### Cython 0.29.13 and pytest 5.0.1 for 32 bits are not available with conda, installing below with pip instead + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - attrs=19.1.0 - gcc_linux-32 - - gcc_linux-32 - gxx_linux-32 - numpy=1.14.* - python-dateutil - - python=3.6.* - pytz=2017.2 - # universal - - pytest - - pytest-xdist - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 + + # see comment above - pip - pip: - # Anaconda doesn't build a new enough Cython - cython>=0.29.13 + - pytest>=5.0.1 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 76868f598f11b..810554632a507 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -3,28 +3,38 @@ channels: - defaults - conda-forge dependencies: - - beautifulsoup4==4.6.0 - - bottleneck=1.2.* - - cython=0.29.13 + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-asyncio + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - beautifulsoup4 + - gcsfs + - html5lib + 
- ipython + - jinja2 - lxml - - matplotlib=2.2.2 - - numpy=1.14.* - - openpyxl=2.4.8 + - matplotlib=3.0.* + - nomkl + - numexpr + - numpy=1.15.* + - openpyxl + # lowest supported version of pyarrow (putting it here instead of in + # azure-36-minimum_versions because it needs numpy >= 1.14) + - pyarrow=0.13 + - pytables - python-dateutil - - python-blosc - - python=3.6.* - - pytz=2017.2 + - pytz + - s3fs - scipy - - sqlalchemy=1.1.4 - - xlrd=1.1.0 - - xlsxwriter=0.9.8 - - xlwt=1.2.0 - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 - - pip - - pip: - - html5lib==1.0b2 + - xarray + - xlrd + - xlsxwriter + - xlwt + - moto diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 21205375204dc..48ac50c001715 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -3,34 +3,30 @@ channels: - defaults - conda-forge dependencies: - - beautifulsoup4 + - python=3.6.* + + # tools - cython>=0.29.13 - - gcsfs - - html5lib - - ipython - - jinja2 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - beautifulsoup4=4.6.0 + - bottleneck=1.2.* - lxml - - matplotlib=3.0.* - - nomkl - - numexpr - - numpy=1.15.* - - openpyxl - - pytables + - matplotlib=2.2.2 + - numpy=1.14.* + - openpyxl=2.5.7 - python-dateutil - - python=3.6.* - - pytz - - s3fs + - python-blosc + - pytz=2017.2 - scipy - - xarray - - xlrd - - xlsxwriter - - xlwt - # universal - - pytest>=4.0.2 - - pytest-xdist - - pytest-mock - - pytest-azurepipelines - - moto + - sqlalchemy=1.1.4 + - xlrd=1.1.0 + - xlsxwriter=0.9.8 + - xlwt=1.2.0 - pip - pip: - - hypothesis>=3.58.0 + - html5lib==1.0b2 diff --git a/ci/deps/azure-35-compat.yaml b/ci/deps/azure-36-minimum_versions.yaml similarity index 58% rename from ci/deps/azure-35-compat.yaml rename to ci/deps/azure-36-minimum_versions.yaml index dd54001984ec7..de7e011d9c7ca 100644 --- a/ci/deps/azure-35-compat.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -3,28 +3,29 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.1 + + # tools + - cython=0.29.13 + - pytest=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + - psutil + + # pandas dependencies - beautifulsoup4=4.6.0 - bottleneck=1.2.1 - jinja2=2.8 + - numba=0.46.0 - numexpr=2.6.2 - numpy=1.13.3 - - openpyxl=2.4.8 + - openpyxl=2.5.7 - pytables=3.4.2 - python-dateutil=2.6.1 - - python=3.5.3 - pytz=2017.2 - scipy=0.19.0 - xlrd=1.1.0 - xlsxwriter=0.9.8 - xlwt=1.2.0 - # universal - - hypothesis>=3.58.0 - - pytest-xdist - - pytest-mock - - pytest-azurepipelines - - pip - - pip: - # for python 3.5, pytest>=4.0.2, cython>=0.29.13 is not available in conda - - cython>=0.29.13 - - pytest==4.5.0 - - html5lib==1.0b2 + - html5lib=1.0.1 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 24464adb74f5b..111ba6b020bc7 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -1,10 +1,19 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - - beautifulsoup4 + - python=3.7.* + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - pytest-asyncio + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - beautifulsoup4 - html5lib - ipython - jinja2 @@ -17,7 +26,6 @@ dependencies: - openpyxl - pytables - python-dateutil - - python=3.7.* - pytz - s3fs - scipy @@ -25,11 +33,4 @@ dependencies: - xlrd - xlsxwriter - xlwt - # 
universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - pip - - pip: - - hypothesis>=3.58.0 + - pyarrow>=0.15 diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 0fb06fd43724c..a04bdc2448bce 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -3,14 +3,16 @@ channels: - defaults dependencies: - python=3.7.* - - pytz - - Cython>=0.29.13 - # universal - # pytest < 5 until defaults has pytest-xdist>=1.29.0 - - pytest>=4.0.2,<5.0 - - pytest-xdist - - pytest-mock + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - pytz - pip - pip: - "git+git://github.com/dateutil/dateutil.git" @@ -18,5 +20,3 @@ dependencies: - "--pre" - "numpy" - "scipy" - # https://github.com/pandas-dev/pandas/issues/27421 - - pytest-azurepipelines<1.0.0 diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml deleted file mode 100644 index 4e0f09904b695..0000000000000 --- a/ci/deps/azure-macos-35.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: pandas-dev -channels: - - defaults -dependencies: - - beautifulsoup4 - - bottleneck - - html5lib - - jinja2 - - lxml - - matplotlib=2.2.3 - - nomkl - - numexpr - - numpy=1.13.3 - - openpyxl - - pyarrow - - pytables - - python=3.5.* - - python-dateutil==2.6.1 - - pytz - - xarray - - xlrd - - xlsxwriter - - xlwt - - pip - - pip: - # Anaconda / conda-forge don't build for 3.5 - - cython>=0.29.13 - - pyreadstat - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - hypothesis>=3.58.0 - # https://github.com/pandas-dev/pandas/issues/27421 - - pytest-azurepipelines<1.0.0 diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml new file mode 100644 index 0000000000000..3bbbdb4cf32ad --- /dev/null +++ b/ci/deps/azure-macos-36.yaml @@ -0,0 +1,35 @@ +name: pandas-dev +channels: + - defaults +dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - beautifulsoup4 + - bottleneck + - html5lib + - jinja2 + - lxml + - matplotlib=2.2.3 + - nomkl + - numexpr + - numpy=1.14 + - openpyxl + - pyarrow>=0.13.0 + - pytables + - python-dateutil==2.6.1 + - pytz + - xarray + - xlrd + - xlsxwriter + - xlwt + - pip + - pip: + - pyreadstat diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 88b38aaef237c..663c55492e69e 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -3,26 +3,30 @@ channels: - conda-forge - defaults dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - blosc - bottleneck - - fastparquet>=0.2.1 + - fastparquet>=0.3.2 - matplotlib=3.0.2 + - numba - numexpr - numpy=1.15.* - openpyxl - - pyarrow + - jinja2 + - pyarrow>=0.13.0 - pytables - python-dateutil - - python=3.6.* - pytz - scipy - xlrd - xlsxwriter - xlwt - # universal - - cython>=0.29.13 - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 7680ed9fd9c92..62be1075b3337 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -3,6 +3,16 @@ channels: - defaults - conda-forge dependencies: + - python=3.7.* + + # tools + - cython>=0.29.13 
+ - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies - beautifulsoup4 - bottleneck - gcsfs @@ -14,8 +24,8 @@ dependencies: - numexpr - numpy=1.14.* - openpyxl + - pyarrow=0.14 - pytables - - python=3.7.* - python-dateutil - pytz - s3fs @@ -24,11 +34,4 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - cython>=0.29.13 - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - - pytest-azurepipelines - - hypothesis>=3.58.0 - pyreadstat diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index e4e917d13990c..a46001c58d165 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -3,11 +3,21 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-cov # this is only needed in the coverage build + + # pandas dependencies - beautifulsoup4 - botocore>=1.11 - cython>=0.29.13 - dask - - fastparquet>=0.2.1 + - fastparquet>=0.3.2 - gcsfs - geopandas - html5lib @@ -17,19 +27,16 @@ dependencies: - numexpr - numpy=1.15.* - odfpy - - openpyxl + - openpyxl<=3.0.1 + # https://github.com/pandas-dev/pandas/pull/30009 openpyxl 3.0.2 broke - pandas-gbq - # https://github.com/pydata/pandas-gbq/issues/271 - - google-cloud-bigquery<=1.11 - psycopg2 - # pyarrow segfaults on load: https://github.com/pandas-dev/pandas/issues/26716 - # - pyarrow=0.9.0 + - pyarrow>=0.13.0 - pymysql - pytables - python-snappy - - python=3.6.* - pytz - - s3fs<0.3 + - s3fs - scikit-learn - scipy - sqlalchemy @@ -38,12 +45,6 @@ dependencies: - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-cov - - pytest-mock - - hypothesis>=3.58.0 - pip - pip: - brotlipy diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 44795766d7c31..d0bc046575953 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -3,11 +3,19 @@ channels: - defaults - conda-forge dependencies: + - python=3.6.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies - beautifulsoup4 - blosc=1.14.3 - python-blosc - - cython>=0.29.13 - - fastparquet=0.2.1 + - fastparquet=0.3.2 - gcsfs=0.2.2 - html5lib - ipython @@ -24,19 +32,11 @@ dependencies: - pymysql=0.7.11 - pytables - python-dateutil - - python=3.6.* - pytz - - s3fs=0.0.8 + - s3fs=0.3.0 - scipy - sqlalchemy=1.1.4 - xarray=0.10 - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.1 - - pytest-xdist>=1.29.0 - - pytest-mock - - pip - - pip: - - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index d54708d48a65e..1dfd90d0904ac 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -3,8 +3,16 @@ channels: - defaults - conda-forge dependencies: - - beautifulsoup4 + - python=3.6.* + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies + - beautifulsoup4 - html5lib - lxml - matplotlib @@ -16,17 +24,11 @@ dependencies: - pymysql - pytables - python-dateutil - - python=3.6.* - pytz - - s3fs<0.3 + - s3fs - scipy - sqlalchemy - xlrd - xlsxwriter - xlwt - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - moto - - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 440ca6c480b87..73e2c20b31438 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ 
-5,20 +5,23 @@ channels: - c3i_test dependencies: - python=3.7.* - - botocore>=1.11 + + # tools - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies + - botocore>=1.11 - numpy - python-dateutil - nomkl - pyarrow - pytz - # universal - - pytest>=5.0.0 - - pytest-xdist>=1.29.0 - - pytest-mock - - hypothesis>=3.58.0 - - s3fs<0.3 - - pip + - s3fs + - tabulate - pyreadstat + - pip - pip: - moto diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml new file mode 100644 index 0000000000000..a627b7edc175f --- /dev/null +++ b/ci/deps/travis-38.yaml @@ -0,0 +1,20 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - python=3.8.* + + # tools + - cython>=0.29.13 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + + # pandas dependencies + - numpy + - python-dateutil + - nomkl + - pytz + - pip + - tabulate==0.8.3 diff --git a/ci/incremental/build.cmd b/ci/incremental/build.cmd deleted file mode 100644 index b61b59e287299..0000000000000 --- a/ci/incremental/build.cmd +++ /dev/null @@ -1,9 +0,0 @@ -@rem https://github.com/numba/numba/blob/master/buildscripts/incremental/build.cmd - -@rem Build extensions -python setup.py build_ext -q -i - -@rem Install pandas -python -m pip install --no-build-isolation -e . - -if %errorlevel% neq 0 exit /b %errorlevel% diff --git a/ci/print_skipped.py b/ci/print_skipped.py index e99e789a71fe8..72822fa2d3c7f 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -5,12 +5,12 @@ def main(filename): if not os.path.isfile(filename): - return + raise RuntimeError(f"Could not find junit file {repr(filename)}") tree = et.parse(filename) root = tree.getroot() current_class = "" - for el in root.findall("testcase"): + for el in root.iter("testcase"): cn = el.attrib["classname"] for sk in el.findall("skipped"): old_class = current_class @@ -27,14 +27,12 @@ def main(filename): if __name__ == "__main__": print("SKIPPED TESTS:") i = 1 - for file_type in ("-single", "-multiple", ""): - for test_data in main("test-data{}.xml".format(file_type)): - if test_data is None: - print("-" * 80) - else: - print( - "#{i} {class_name}.{test_name}: {message}".format( - **dict(test_data, i=i) - ) - ) - i += 1 + for test_data in main("test-data.xml"): + if test_data is None: + print("-" * 80) + else: + print( + f"#{i} {test_data['class_name']}." + f"{test_data['test_name']}: {test_data['message']}" + ) + i += 1 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index d1a9447c97d4e..8020680d617d7 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -5,47 +5,28 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -if [ -n "$LOCALE_OVERRIDE" ]; then - export LC_ALL="$LOCALE_OVERRIDE" - export LANG="$LOCALE_OVERRIDE" - PANDAS_LOCALE=`python -c 'import pandas; pandas.get_option("display.encoding")'` - if [[ "$LOCALE_OVERRIDE" != "$PANDAS_LOCALE" ]]; then - echo "pandas could not detect the locale. 
System locale: $LOCALE_OVERRIDE, pandas detected: $PANDAS_LOCALE"
- # TODO Not really aborting the tests until https://github.com/pandas-dev/pandas/issues/23923 is fixed
- # exit 1
- fi
-fi
 if [[ "not network" == *"$PATTERN"* ]]; then export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; fi
-
-if [ -n "$PATTERN" ]; then
- PATTERN=" and $PATTERN"
+if [ "$COVERAGE" ]; then
+ COVERAGE_FNAME="/tmp/test_coverage.xml"
+ COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME"
 fi
-for TYPE in single multiple
-do
- if [ "$COVERAGE" ]; then
- COVERAGE_FNAME="/tmp/coc-$TYPE.xml"
- COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME"
- fi
+PYTEST_CMD="pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas"
- TYPE_PATTERN=$TYPE
- NUM_JOBS=1
- if [[ "$TYPE_PATTERN" == "multiple" ]]; then
- TYPE_PATTERN="not single"
- NUM_JOBS=2
- fi
+# Travis does not have an X server
+if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+ DISPLAY=DISPLAY=:99.0
+ PYTEST_CMD="xvfb-run -e /dev/stdout $PYTEST_CMD"
+fi
- PYTEST_CMD="pytest -m \"$TYPE_PATTERN$PATTERN\" -n $NUM_JOBS -s --strict --durations=10 --junitxml=test-data-$TYPE.xml $TEST_ARGS $COVERAGE pandas"
- echo $PYTEST_CMD
- # if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code
- sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret"
+echo $PYTEST_CMD
+sh -c "$PYTEST_CMD"
- if [[ "$COVERAGE" && $? == 0 && "$TRAVIS_BRANCH" == "master" ]]; then
- echo "uploading coverage for $TYPE tests"
- echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME"
- bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME
- fi
-done
+if [[ "$COVERAGE" && $?
== 0 && "$TRAVIS_BRANCH" == "master" ]]; then + echo "uploading coverage" + echo "bash <(curl -s https://codecov.io/bash) -Z -c -f $COVERAGE_FNAME" + bash <(curl -s https://codecov.io/bash) -Z -c -f $COVERAGE_FNAME +fi diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 4d454f9c5041a..db28eaea8956e 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,20 +1,15 @@ #!/bin/bash -e -if [ "$JOB" == "3.8-dev" ]; then - /bin/bash ci/build38.sh - exit 0 -fi - # edit the locale file if needed -if [ -n "$LOCALE_OVERRIDE" ]; then +if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo "Adding locale to the first line of pandas/__init__.py" rm -f pandas/__init__.pyc - SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LC_ALL')\n" sed -i "$SEDC" pandas/__init__.py + echo "[head -4 pandas/__init__.py]" head -4 pandas/__init__.py echo - sudo locale-gen "$LOCALE_OVERRIDE" fi MINICONDA_DIR="$HOME/miniconda3" @@ -114,6 +109,11 @@ echo "w/o removing anything else" conda remove pandas -y --force || true pip uninstall -y pandas || true +echo +echo "remove postgres if has been installed with conda" +echo "we use the one from the CI" +conda remove postgresql -y --force || true + echo echo "conda list pandas" conda list pandas @@ -121,7 +121,7 @@ conda list pandas # Make sure any error below is reported as such echo "[Build extensions]" -python setup.py build_ext -q -i +python setup.py build_ext -q -i -j2 # XXX: Some of our environments end up with old versions of pip (10.x) # Adding a new enough version of pip to the requirements explodes the @@ -140,7 +140,8 @@ echo "conda list" conda list # Install DB for Linux -if [ "${TRAVIS_OS_NAME}" == "linux" ]; then + +if [[ -n ${SQL:0} ]]; then echo "installing dbs" mysql -e 'create database pandas_nosetest;' psql -c 'create database pandas_nosetest;' -U postgres diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index f92090fecccf3..47f63c11d0567 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -36,5 +36,5 @@ test: about: - home: http://pandas.pydata.org + home: https://pandas.pydata.org license: BSD diff --git a/doc/make.py b/doc/make.py index cbb1fa6a5324a..cf73f44b5dd02 100755 --- a/doc/make.py +++ b/doc/make.py @@ -60,7 +60,7 @@ def __init__( if single_doc and single_doc.endswith(".rst"): self.single_doc_html = os.path.splitext(single_doc)[0] + ".html" elif single_doc: - self.single_doc_html = "reference/api/pandas.{}.html".format(single_doc) + self.single_doc_html = f"reference/api/pandas.{single_doc}.html" def _process_single_doc(self, single_doc): """ @@ -76,7 +76,7 @@ def _process_single_doc(self, single_doc): if os.path.exists(os.path.join(SOURCE_PATH, single_doc)): return single_doc else: - raise FileNotFoundError("File {} not found".format(single_doc)) + raise FileNotFoundError(f"File {single_doc} not found") elif single_doc.startswith("pandas."): try: @@ -84,17 +84,15 @@ def _process_single_doc(self, single_doc): for name in single_doc.split("."): obj = getattr(obj, name) except AttributeError: - raise ImportError("Could not import {}".format(single_doc)) + raise ImportError(f"Could not import {single_doc}") else: return single_doc[len("pandas.") :] else: raise ValueError( - ( - "--single={} not understood. Value should be a " - "valid path to a .rst or .ipynb file, or a " - "valid pandas object (e.g. categorical.rst or " - "pandas.DataFrame.head)" - ).format(single_doc) + f"--single={single_doc} not understood. 
" + "Value should be a valid path to a .rst or .ipynb file, " + "or a valid pandas object " + "(e.g. categorical.rst or pandas.DataFrame.head)" ) @staticmethod @@ -113,7 +111,7 @@ def _run_os(*args): """ subprocess.check_call(args, stdout=sys.stdout, stderr=sys.stderr) - def _sphinx_build(self, kind): + def _sphinx_build(self, kind: str): """ Call sphinx to build documentation. @@ -128,7 +126,7 @@ def _sphinx_build(self, kind): >>> DocBuilder(num_jobs=4)._sphinx_build('html') """ if kind not in ("html", "latex"): - raise ValueError("kind must be html or latex, " "not {}".format(kind)) + raise ValueError(f"kind must be html or latex, not {kind}") cmd = ["sphinx-build", "-b", kind] if self.num_jobs: @@ -136,7 +134,7 @@ def _sphinx_build(self, kind): if self.warnings_are_errors: cmd += ["-W", "--keep-going"] if self.verbosity: - cmd.append("-{}".format("v" * self.verbosity)) + cmd.append(f"-{'v' * self.verbosity}") cmd += [ "-d", os.path.join(BUILD_PATH, "doctrees"), @@ -156,7 +154,7 @@ def _get_page_title(self, page): """ Open the rst file `page` and extract its title. """ - fname = os.path.join(SOURCE_PATH, "{}.rst".format(page)) + fname = os.path.join(SOURCE_PATH, f"{page}.rst") option_parser = docutils.frontend.OptionParser( components=(docutils.parsers.rst.Parser,) ) @@ -184,18 +182,6 @@ def _add_redirects(self): Create in the build directory an html file with a redirect, for every row in REDIRECTS_FILE. """ - html = """ - - - - - -

- The page has been moved to {title} -

- - - """ with open(REDIRECTS_FILE) as mapping_fd: reader = csv.reader(mapping_fd) for row in reader: @@ -214,15 +200,23 @@ def _add_redirects(self): if os.path.exists(path): raise RuntimeError( - ("Redirection would overwrite an existing file: " "{}").format( - path - ) + f"Redirection would overwrite an existing file: {path}" ) with open(path, "w") as moved_page_fd: - moved_page_fd.write( - html.format(url="{}.html".format(row[1]), title=title) - ) + html = f"""\ + + + + + +

+ The page has been moved to {title} +

+ +""" + + moved_page_fd.write(html) def html(self): """ @@ -290,15 +284,14 @@ def zip_html(self): def main(): cmds = [method for method in dir(DocBuilder) if not method.startswith("_")] + joined = ",".join(cmds) argparser = argparse.ArgumentParser( - description="pandas documentation builder", - epilog="Commands: {}".format(",".join(cmds)), + description="pandas documentation builder", epilog=f"Commands: {joined}", ) + + joined = ", ".join(cmds) argparser.add_argument( - "command", - nargs="?", - default="html", - help="command to run: {}".format(", ".join(cmds)), + "command", nargs="?", default="html", help=f"command to run: {joined}", ) argparser.add_argument( "--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build" @@ -312,10 +305,9 @@ def main(): type=str, default=None, help=( - 'filename (relative to the "source" folder)' - " of section or method name to compile, e.g. " - '"development/contributing.rst",' - ' "ecosystem.rst", "pandas.DataFrame.join"' + "filename (relative to the 'source' folder) of section or method name to " + "compile, e.g. 'development/contributing.rst', " + "'ecosystem.rst', 'pandas.DataFrame.join'" ), ) argparser.add_argument( @@ -340,11 +332,8 @@ def main(): args = argparser.parse_args() if args.command not in cmds: - raise ValueError( - "Unknown command {}. Available options: {}".format( - args.command, ", ".join(cmds) - ) - ) + joined = ", ".join(cmds) + raise ValueError(f"Unknown command {args.command}. Available options: {joined}") # Below we update both os.environ and sys.path. The former is used by # external libraries (namely Sphinx) to compile this module and resolve diff --git a/doc/redirects.csv b/doc/redirects.csv index a2146edde6324..0a71f037d23c3 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -319,7 +319,6 @@ generated/pandas.DataFrame.clip_upper,../reference/api/pandas.DataFrame.clip_upp generated/pandas.DataFrame.columns,../reference/api/pandas.DataFrame.columns generated/pandas.DataFrame.combine_first,../reference/api/pandas.DataFrame.combine_first generated/pandas.DataFrame.combine,../reference/api/pandas.DataFrame.combine -generated/pandas.DataFrame.compound,../reference/api/pandas.DataFrame.compound generated/pandas.DataFrame.convert_objects,../reference/api/pandas.DataFrame.convert_objects generated/pandas.DataFrame.copy,../reference/api/pandas.DataFrame.copy generated/pandas.DataFrame.corr,../reference/api/pandas.DataFrame.corr @@ -357,13 +356,9 @@ generated/pandas.DataFrame.from_csv,../reference/api/pandas.DataFrame.from_csv generated/pandas.DataFrame.from_dict,../reference/api/pandas.DataFrame.from_dict generated/pandas.DataFrame.from_items,../reference/api/pandas.DataFrame.from_items generated/pandas.DataFrame.from_records,../reference/api/pandas.DataFrame.from_records -generated/pandas.DataFrame.ftypes,../reference/api/pandas.DataFrame.ftypes generated/pandas.DataFrame.ge,../reference/api/pandas.DataFrame.ge -generated/pandas.DataFrame.get_dtype_counts,../reference/api/pandas.DataFrame.get_dtype_counts -generated/pandas.DataFrame.get_ftype_counts,../reference/api/pandas.DataFrame.get_ftype_counts generated/pandas.DataFrame.get,../reference/api/pandas.DataFrame.get generated/pandas.DataFrame.get_value,../reference/api/pandas.DataFrame.get_value -generated/pandas.DataFrame.get_values,../reference/api/pandas.DataFrame.get_values generated/pandas.DataFrame.groupby,../reference/api/pandas.DataFrame.groupby generated/pandas.DataFrame.gt,../reference/api/pandas.DataFrame.gt 
generated/pandas.DataFrame.head,../reference/api/pandas.DataFrame.head @@ -488,7 +483,6 @@ generated/pandas.DataFrame.T,../reference/api/pandas.DataFrame.T generated/pandas.DataFrame.timetuple,../reference/api/pandas.DataFrame.timetuple generated/pandas.DataFrame.to_clipboard,../reference/api/pandas.DataFrame.to_clipboard generated/pandas.DataFrame.to_csv,../reference/api/pandas.DataFrame.to_csv -generated/pandas.DataFrame.to_dense,../reference/api/pandas.DataFrame.to_dense generated/pandas.DataFrame.to_dict,../reference/api/pandas.DataFrame.to_dict generated/pandas.DataFrame.to_excel,../reference/api/pandas.DataFrame.to_excel generated/pandas.DataFrame.to_feather,../reference/api/pandas.DataFrame.to_feather @@ -497,7 +491,6 @@ generated/pandas.DataFrame.to_hdf,../reference/api/pandas.DataFrame.to_hdf generated/pandas.DataFrame.to,../reference/api/pandas.DataFrame.to generated/pandas.DataFrame.to_json,../reference/api/pandas.DataFrame.to_json generated/pandas.DataFrame.to_latex,../reference/api/pandas.DataFrame.to_latex -generated/pandas.DataFrame.to_msgpack,../reference/api/pandas.DataFrame.to_msgpack generated/pandas.DataFrame.to_numpy,../reference/api/pandas.DataFrame.to_numpy generated/pandas.DataFrame.to_panel,../reference/api/pandas.DataFrame.to_panel generated/pandas.DataFrame.to_parquet,../reference/api/pandas.DataFrame.to_parquet @@ -624,8 +617,6 @@ generated/pandas.Index.asi8,../reference/api/pandas.Index.asi8 generated/pandas.Index.asof,../reference/api/pandas.Index.asof generated/pandas.Index.asof_locs,../reference/api/pandas.Index.asof_locs generated/pandas.Index.astype,../reference/api/pandas.Index.astype -generated/pandas.Index.base,../reference/api/pandas.Index.base -generated/pandas.Index.contains,../reference/api/pandas.Index.contains generated/pandas.Index.copy,../reference/api/pandas.Index.copy generated/pandas.Index.data,../reference/api/pandas.Index.data generated/pandas.Index.delete,../reference/api/pandas.Index.delete @@ -635,15 +626,12 @@ generated/pandas.Index.drop,../reference/api/pandas.Index.drop generated/pandas.Index.droplevel,../reference/api/pandas.Index.droplevel generated/pandas.Index.dropna,../reference/api/pandas.Index.dropna generated/pandas.Index.dtype,../reference/api/pandas.Index.dtype -generated/pandas.Index.dtype_str,../reference/api/pandas.Index.dtype_str generated/pandas.Index.duplicated,../reference/api/pandas.Index.duplicated generated/pandas.Index.empty,../reference/api/pandas.Index.empty generated/pandas.Index.equals,../reference/api/pandas.Index.equals generated/pandas.Index.factorize,../reference/api/pandas.Index.factorize generated/pandas.Index.fillna,../reference/api/pandas.Index.fillna -generated/pandas.Index.flags,../reference/api/pandas.Index.flags generated/pandas.Index.format,../reference/api/pandas.Index.format -generated/pandas.Index.get_duplicates,../reference/api/pandas.Index.get_duplicates generated/pandas.Index.get_indexer_for,../reference/api/pandas.Index.get_indexer_for generated/pandas.Index.get_indexer,../reference/api/pandas.Index.get_indexer generated/pandas.Index.get_indexer_non_unique,../reference/api/pandas.Index.get_indexer_non_unique @@ -651,7 +639,6 @@ generated/pandas.Index.get_level_values,../reference/api/pandas.Index.get_level_ generated/pandas.Index.get_loc,../reference/api/pandas.Index.get_loc generated/pandas.Index.get_slice_bound,../reference/api/pandas.Index.get_slice_bound generated/pandas.Index.get_value,../reference/api/pandas.Index.get_value 
-generated/pandas.Index.get_values,../reference/api/pandas.Index.get_values generated/pandas.Index.groupby,../reference/api/pandas.Index.groupby generated/pandas.Index.has_duplicates,../reference/api/pandas.Index.has_duplicates generated/pandas.Index.hasnans,../reference/api/pandas.Index.hasnans @@ -681,7 +668,6 @@ generated/pandas.Index.is_object,../reference/api/pandas.Index.is_object generated/pandas.Index.is_type_compatible,../reference/api/pandas.Index.is_type_compatible generated/pandas.Index.is_unique,../reference/api/pandas.Index.is_unique generated/pandas.Index.item,../reference/api/pandas.Index.item -generated/pandas.Index.itemsize,../reference/api/pandas.Index.itemsize generated/pandas.Index.join,../reference/api/pandas.Index.join generated/pandas.Index.map,../reference/api/pandas.Index.map generated/pandas.Index.max,../reference/api/pandas.Index.max @@ -713,7 +699,6 @@ generated/pandas.Index.sort,../reference/api/pandas.Index.sort generated/pandas.Index.sortlevel,../reference/api/pandas.Index.sortlevel generated/pandas.Index.sort_values,../reference/api/pandas.Index.sort_values generated/pandas.Index.str,../reference/api/pandas.Index.str -generated/pandas.Index.strides,../reference/api/pandas.Index.strides generated/pandas.Index.summary,../reference/api/pandas.Index.summary generated/pandas.Index.symmetric_difference,../reference/api/pandas.Index.symmetric_difference generated/pandas.Index.take,../reference/api/pandas.Index.take @@ -792,8 +777,7 @@ generated/pandas.io.formats.style.Styler.to_excel,../reference/api/pandas.io.for generated/pandas.io.formats.style.Styler.use,../reference/api/pandas.io.formats.style.Styler.use generated/pandas.io.formats.style.Styler.where,../reference/api/pandas.io.formats.style.Styler.where generated/pandas.io.json.build_table_schema,../reference/api/pandas.io.json.build_table_schema -generated/pandas.io.json.json_normalize,../reference/api/pandas.io.json.json_normalize -generated/pandas.io.stata.StataReader.data,../reference/api/pandas.io.stata.StataReader.data +generated/pandas.io.json.json_normalize,../reference/api/pandas.json_normalize generated/pandas.io.stata.StataReader.data_label,../reference/api/pandas.io.stata.StataReader.data_label generated/pandas.io.stata.StataReader.value_labels,../reference/api/pandas.io.stata.StataReader.value_labels generated/pandas.io.stata.StataReader.variable_labels,../reference/api/pandas.io.stata.StataReader.variable_labels @@ -828,180 +812,9 @@ generated/pandas.MultiIndex.sortlevel,../reference/api/pandas.MultiIndex.sortlev generated/pandas.MultiIndex.swaplevel,../reference/api/pandas.MultiIndex.swaplevel generated/pandas.MultiIndex.to_flat_index,../reference/api/pandas.MultiIndex.to_flat_index generated/pandas.MultiIndex.to_frame,../reference/api/pandas.MultiIndex.to_frame -generated/pandas.MultiIndex.to_hierarchical,../reference/api/pandas.MultiIndex.to_hierarchical generated/pandas.notna,../reference/api/pandas.notna generated/pandas.notnull,../reference/api/pandas.notnull generated/pandas.option_context,../reference/api/pandas.option_context -generated/pandas.Panel.abs,../reference/api/pandas.Panel.abs -generated/pandas.Panel.add,../reference/api/pandas.Panel.add -generated/pandas.Panel.add_prefix,../reference/api/pandas.Panel.add_prefix -generated/pandas.Panel.add_suffix,../reference/api/pandas.Panel.add_suffix -generated/pandas.Panel.agg,../reference/api/pandas.Panel.agg -generated/pandas.Panel.aggregate,../reference/api/pandas.Panel.aggregate 
-generated/pandas.Panel.align,../reference/api/pandas.Panel.align -generated/pandas.Panel.all,../reference/api/pandas.Panel.all -generated/pandas.Panel.any,../reference/api/pandas.Panel.any -generated/pandas.Panel.apply,../reference/api/pandas.Panel.apply -generated/pandas.Panel.as_blocks,../reference/api/pandas.Panel.as_blocks -generated/pandas.Panel.asfreq,../reference/api/pandas.Panel.asfreq -generated/pandas.Panel.as_matrix,../reference/api/pandas.Panel.as_matrix -generated/pandas.Panel.asof,../reference/api/pandas.Panel.asof -generated/pandas.Panel.astype,../reference/api/pandas.Panel.astype -generated/pandas.Panel.at,../reference/api/pandas.Panel.at -generated/pandas.Panel.at_time,../reference/api/pandas.Panel.at_time -generated/pandas.Panel.axes,../reference/api/pandas.Panel.axes -generated/pandas.Panel.between_time,../reference/api/pandas.Panel.between_time -generated/pandas.Panel.bfill,../reference/api/pandas.Panel.bfill -generated/pandas.Panel.blocks,../reference/api/pandas.Panel.blocks -generated/pandas.Panel.bool,../reference/api/pandas.Panel.bool -generated/pandas.Panel.clip,../reference/api/pandas.Panel.clip -generated/pandas.Panel.clip_lower,../reference/api/pandas.Panel.clip_lower -generated/pandas.Panel.clip_upper,../reference/api/pandas.Panel.clip_upper -generated/pandas.Panel.compound,../reference/api/pandas.Panel.compound -generated/pandas.Panel.conform,../reference/api/pandas.Panel.conform -generated/pandas.Panel.convert_objects,../reference/api/pandas.Panel.convert_objects -generated/pandas.Panel.copy,../reference/api/pandas.Panel.copy -generated/pandas.Panel.count,../reference/api/pandas.Panel.count -generated/pandas.Panel.cummax,../reference/api/pandas.Panel.cummax -generated/pandas.Panel.cummin,../reference/api/pandas.Panel.cummin -generated/pandas.Panel.cumprod,../reference/api/pandas.Panel.cumprod -generated/pandas.Panel.cumsum,../reference/api/pandas.Panel.cumsum -generated/pandas.Panel.describe,../reference/api/pandas.Panel.describe -generated/pandas.Panel.div,../reference/api/pandas.Panel.div -generated/pandas.Panel.divide,../reference/api/pandas.Panel.divide -generated/pandas.Panel.drop,../reference/api/pandas.Panel.drop -generated/pandas.Panel.droplevel,../reference/api/pandas.Panel.droplevel -generated/pandas.Panel.dropna,../reference/api/pandas.Panel.dropna -generated/pandas.Panel.dtypes,../reference/api/pandas.Panel.dtypes -generated/pandas.Panel.empty,../reference/api/pandas.Panel.empty -generated/pandas.Panel.eq,../reference/api/pandas.Panel.eq -generated/pandas.Panel.equals,../reference/api/pandas.Panel.equals -generated/pandas.Panel.ffill,../reference/api/pandas.Panel.ffill -generated/pandas.Panel.fillna,../reference/api/pandas.Panel.fillna -generated/pandas.Panel.filter,../reference/api/pandas.Panel.filter -generated/pandas.Panel.first,../reference/api/pandas.Panel.first -generated/pandas.Panel.first_valid_index,../reference/api/pandas.Panel.first_valid_index -generated/pandas.Panel.floordiv,../reference/api/pandas.Panel.floordiv -generated/pandas.Panel.from_dict,../reference/api/pandas.Panel.from_dict -generated/pandas.Panel.fromDict,../reference/api/pandas.Panel.fromDict -generated/pandas.Panel.ftypes,../reference/api/pandas.Panel.ftypes -generated/pandas.Panel.ge,../reference/api/pandas.Panel.ge -generated/pandas.Panel.get_dtype_counts,../reference/api/pandas.Panel.get_dtype_counts -generated/pandas.Panel.get_ftype_counts,../reference/api/pandas.Panel.get_ftype_counts -generated/pandas.Panel.get,../reference/api/pandas.Panel.get 
-generated/pandas.Panel.get_value,../reference/api/pandas.Panel.get_value -generated/pandas.Panel.get_values,../reference/api/pandas.Panel.get_values -generated/pandas.Panel.groupby,../reference/api/pandas.Panel.groupby -generated/pandas.Panel.gt,../reference/api/pandas.Panel.gt -generated/pandas.Panel.head,../reference/api/pandas.Panel.head -generated/pandas.Panel,../reference/api/pandas.Panel -generated/pandas.Panel.iat,../reference/api/pandas.Panel.iat -generated/pandas.Panel.iloc,../reference/api/pandas.Panel.iloc -generated/pandas.Panel.infer_objects,../reference/api/pandas.Panel.infer_objects -generated/pandas.Panel.interpolate,../reference/api/pandas.Panel.interpolate -generated/pandas.Panel.is_copy,../reference/api/pandas.Panel.is_copy -generated/pandas.Panel.isna,../reference/api/pandas.Panel.isna -generated/pandas.Panel.isnull,../reference/api/pandas.Panel.isnull -generated/pandas.Panel.items,../reference/api/pandas.Panel.items -generated/pandas.Panel.__iter__,../reference/api/pandas.Panel.__iter__ -generated/pandas.Panel.iteritems,../reference/api/pandas.Panel.iteritems -generated/pandas.Panel.ix,../reference/api/pandas.Panel.ix -generated/pandas.Panel.join,../reference/api/pandas.Panel.join -generated/pandas.Panel.keys,../reference/api/pandas.Panel.keys -generated/pandas.Panel.kurt,../reference/api/pandas.Panel.kurt -generated/pandas.Panel.kurtosis,../reference/api/pandas.Panel.kurtosis -generated/pandas.Panel.last,../reference/api/pandas.Panel.last -generated/pandas.Panel.last_valid_index,../reference/api/pandas.Panel.last_valid_index -generated/pandas.Panel.le,../reference/api/pandas.Panel.le -generated/pandas.Panel.loc,../reference/api/pandas.Panel.loc -generated/pandas.Panel.lt,../reference/api/pandas.Panel.lt -generated/pandas.Panel.mad,../reference/api/pandas.Panel.mad -generated/pandas.Panel.major_axis,../reference/api/pandas.Panel.major_axis -generated/pandas.Panel.major_xs,../reference/api/pandas.Panel.major_xs -generated/pandas.Panel.mask,../reference/api/pandas.Panel.mask -generated/pandas.Panel.max,../reference/api/pandas.Panel.max -generated/pandas.Panel.mean,../reference/api/pandas.Panel.mean -generated/pandas.Panel.median,../reference/api/pandas.Panel.median -generated/pandas.Panel.min,../reference/api/pandas.Panel.min -generated/pandas.Panel.minor_axis,../reference/api/pandas.Panel.minor_axis -generated/pandas.Panel.minor_xs,../reference/api/pandas.Panel.minor_xs -generated/pandas.Panel.mod,../reference/api/pandas.Panel.mod -generated/pandas.Panel.mul,../reference/api/pandas.Panel.mul -generated/pandas.Panel.multiply,../reference/api/pandas.Panel.multiply -generated/pandas.Panel.ndim,../reference/api/pandas.Panel.ndim -generated/pandas.Panel.ne,../reference/api/pandas.Panel.ne -generated/pandas.Panel.notna,../reference/api/pandas.Panel.notna -generated/pandas.Panel.notnull,../reference/api/pandas.Panel.notnull -generated/pandas.Panel.pct_change,../reference/api/pandas.Panel.pct_change -generated/pandas.Panel.pipe,../reference/api/pandas.Panel.pipe -generated/pandas.Panel.pop,../reference/api/pandas.Panel.pop -generated/pandas.Panel.pow,../reference/api/pandas.Panel.pow -generated/pandas.Panel.prod,../reference/api/pandas.Panel.prod -generated/pandas.Panel.product,../reference/api/pandas.Panel.product -generated/pandas.Panel.radd,../reference/api/pandas.Panel.radd -generated/pandas.Panel.rank,../reference/api/pandas.Panel.rank -generated/pandas.Panel.rdiv,../reference/api/pandas.Panel.rdiv 
-generated/pandas.Panel.reindex_axis,../reference/api/pandas.Panel.reindex_axis -generated/pandas.Panel.reindex,../reference/api/pandas.Panel.reindex -generated/pandas.Panel.reindex_like,../reference/api/pandas.Panel.reindex_like -generated/pandas.Panel.rename_axis,../reference/api/pandas.Panel.rename_axis -generated/pandas.Panel.rename,../reference/api/pandas.Panel.rename -generated/pandas.Panel.replace,../reference/api/pandas.Panel.replace -generated/pandas.Panel.resample,../reference/api/pandas.Panel.resample -generated/pandas.Panel.rfloordiv,../reference/api/pandas.Panel.rfloordiv -generated/pandas.Panel.rmod,../reference/api/pandas.Panel.rmod -generated/pandas.Panel.rmul,../reference/api/pandas.Panel.rmul -generated/pandas.Panel.round,../reference/api/pandas.Panel.round -generated/pandas.Panel.rpow,../reference/api/pandas.Panel.rpow -generated/pandas.Panel.rsub,../reference/api/pandas.Panel.rsub -generated/pandas.Panel.rtruediv,../reference/api/pandas.Panel.rtruediv -generated/pandas.Panel.sample,../reference/api/pandas.Panel.sample -generated/pandas.Panel.select,../reference/api/pandas.Panel.select -generated/pandas.Panel.sem,../reference/api/pandas.Panel.sem -generated/pandas.Panel.set_axis,../reference/api/pandas.Panel.set_axis -generated/pandas.Panel.set_value,../reference/api/pandas.Panel.set_value -generated/pandas.Panel.shape,../reference/api/pandas.Panel.shape -generated/pandas.Panel.shift,../reference/api/pandas.Panel.shift -generated/pandas.Panel.size,../reference/api/pandas.Panel.size -generated/pandas.Panel.skew,../reference/api/pandas.Panel.skew -generated/pandas.Panel.slice_shift,../reference/api/pandas.Panel.slice_shift -generated/pandas.Panel.sort_index,../reference/api/pandas.Panel.sort_index -generated/pandas.Panel.sort_values,../reference/api/pandas.Panel.sort_values -generated/pandas.Panel.squeeze,../reference/api/pandas.Panel.squeeze -generated/pandas.Panel.std,../reference/api/pandas.Panel.std -generated/pandas.Panel.sub,../reference/api/pandas.Panel.sub -generated/pandas.Panel.subtract,../reference/api/pandas.Panel.subtract -generated/pandas.Panel.sum,../reference/api/pandas.Panel.sum -generated/pandas.Panel.swapaxes,../reference/api/pandas.Panel.swapaxes -generated/pandas.Panel.swaplevel,../reference/api/pandas.Panel.swaplevel -generated/pandas.Panel.tail,../reference/api/pandas.Panel.tail -generated/pandas.Panel.take,../reference/api/pandas.Panel.take -generated/pandas.Panel.timetuple,../reference/api/pandas.Panel.timetuple -generated/pandas.Panel.to_clipboard,../reference/api/pandas.Panel.to_clipboard -generated/pandas.Panel.to_csv,../reference/api/pandas.Panel.to_csv -generated/pandas.Panel.to_dense,../reference/api/pandas.Panel.to_dense -generated/pandas.Panel.to_excel,../reference/api/pandas.Panel.to_excel -generated/pandas.Panel.to_frame,../reference/api/pandas.Panel.to_frame -generated/pandas.Panel.to_hdf,../reference/api/pandas.Panel.to_hdf -generated/pandas.Panel.to_json,../reference/api/pandas.Panel.to_json -generated/pandas.Panel.to_latex,../reference/api/pandas.Panel.to_latex -generated/pandas.Panel.to_msgpack,../reference/api/pandas.Panel.to_msgpack -generated/pandas.Panel.to_pickle,../reference/api/pandas.Panel.to_pickle -generated/pandas.Panel.to_sparse,../reference/api/pandas.Panel.to_sparse -generated/pandas.Panel.to_sql,../reference/api/pandas.Panel.to_sql -generated/pandas.Panel.to_xarray,../reference/api/pandas.Panel.to_xarray -generated/pandas.Panel.transform,../reference/api/pandas.Panel.transform 
-generated/pandas.Panel.transpose,../reference/api/pandas.Panel.transpose -generated/pandas.Panel.truediv,../reference/api/pandas.Panel.truediv -generated/pandas.Panel.truncate,../reference/api/pandas.Panel.truncate -generated/pandas.Panel.tshift,../reference/api/pandas.Panel.tshift -generated/pandas.Panel.tz_convert,../reference/api/pandas.Panel.tz_convert -generated/pandas.Panel.tz_localize,../reference/api/pandas.Panel.tz_localize -generated/pandas.Panel.update,../reference/api/pandas.Panel.update -generated/pandas.Panel.values,../reference/api/pandas.Panel.values -generated/pandas.Panel.var,../reference/api/pandas.Panel.var -generated/pandas.Panel.where,../reference/api/pandas.Panel.where -generated/pandas.Panel.xs,../reference/api/pandas.Panel.xs generated/pandas.Period.asfreq,../reference/api/pandas.Period.asfreq generated/pandas.Period.day,../reference/api/pandas.Period.day generated/pandas.Period.dayofweek,../reference/api/pandas.Period.dayofweek @@ -1075,7 +888,6 @@ generated/pandas.read_gbq,../reference/api/pandas.read_gbq generated/pandas.read_hdf,../reference/api/pandas.read_hdf generated/pandas.read,../reference/api/pandas.read generated/pandas.read_json,../reference/api/pandas.read_json -generated/pandas.read_msgpack,../reference/api/pandas.read_msgpack generated/pandas.read_parquet,../reference/api/pandas.read_parquet generated/pandas.read_pickle,../reference/api/pandas.read_pickle generated/pandas.read_sas,../reference/api/pandas.read_sas @@ -1111,7 +923,6 @@ generated/pandas.Series.at,../reference/api/pandas.Series.at generated/pandas.Series.at_time,../reference/api/pandas.Series.at_time generated/pandas.Series.autocorr,../reference/api/pandas.Series.autocorr generated/pandas.Series.axes,../reference/api/pandas.Series.axes -generated/pandas.Series.base,../reference/api/pandas.Series.base generated/pandas.Series.between,../reference/api/pandas.Series.between generated/pandas.Series.between_time,../reference/api/pandas.Series.between_time generated/pandas.Series.bfill,../reference/api/pandas.Series.bfill @@ -1134,7 +945,6 @@ generated/pandas.Series.clip_lower,../reference/api/pandas.Series.clip_lower generated/pandas.Series.clip_upper,../reference/api/pandas.Series.clip_upper generated/pandas.Series.combine_first,../reference/api/pandas.Series.combine_first generated/pandas.Series.combine,../reference/api/pandas.Series.combine -generated/pandas.Series.compound,../reference/api/pandas.Series.compound generated/pandas.Series.compress,../reference/api/pandas.Series.compress generated/pandas.Series.convert_objects,../reference/api/pandas.Series.convert_objects generated/pandas.Series.copy,../reference/api/pandas.Series.copy @@ -1220,18 +1030,12 @@ generated/pandas.Series.fillna,../reference/api/pandas.Series.fillna generated/pandas.Series.filter,../reference/api/pandas.Series.filter generated/pandas.Series.first,../reference/api/pandas.Series.first generated/pandas.Series.first_valid_index,../reference/api/pandas.Series.first_valid_index -generated/pandas.Series.flags,../reference/api/pandas.Series.flags generated/pandas.Series.floordiv,../reference/api/pandas.Series.floordiv generated/pandas.Series.from_array,../reference/api/pandas.Series.from_array generated/pandas.Series.from_csv,../reference/api/pandas.Series.from_csv -generated/pandas.Series.ftype,../reference/api/pandas.Series.ftype -generated/pandas.Series.ftypes,../reference/api/pandas.Series.ftypes generated/pandas.Series.ge,../reference/api/pandas.Series.ge 
-generated/pandas.Series.get_dtype_counts,../reference/api/pandas.Series.get_dtype_counts -generated/pandas.Series.get_ftype_counts,../reference/api/pandas.Series.get_ftype_counts generated/pandas.Series.get,../reference/api/pandas.Series.get generated/pandas.Series.get_value,../reference/api/pandas.Series.get_value -generated/pandas.Series.get_values,../reference/api/pandas.Series.get_values generated/pandas.Series.groupby,../reference/api/pandas.Series.groupby generated/pandas.Series.gt,../reference/api/pandas.Series.gt generated/pandas.Series.hasnans,../reference/api/pandas.Series.hasnans @@ -1256,7 +1060,6 @@ generated/pandas.Series.isnull,../reference/api/pandas.Series.isnull generated/pandas.Series.is_unique,../reference/api/pandas.Series.is_unique generated/pandas.Series.item,../reference/api/pandas.Series.item generated/pandas.Series.items,../reference/api/pandas.Series.items -generated/pandas.Series.itemsize,../reference/api/pandas.Series.itemsize generated/pandas.Series.__iter__,../reference/api/pandas.Series.__iter__ generated/pandas.Series.iteritems,../reference/api/pandas.Series.iteritems generated/pandas.Series.ix,../reference/api/pandas.Series.ix @@ -1307,7 +1110,6 @@ generated/pandas.Series.pow,../reference/api/pandas.Series.pow generated/pandas.Series.prod,../reference/api/pandas.Series.prod generated/pandas.Series.product,../reference/api/pandas.Series.product generated/pandas.Series.ptp,../reference/api/pandas.Series.ptp -generated/pandas.Series.put,../reference/api/pandas.Series.put generated/pandas.Series.quantile,../reference/api/pandas.Series.quantile generated/pandas.Series.radd,../reference/api/pandas.Series.radd generated/pandas.Series.rank,../reference/api/pandas.Series.rank @@ -1369,7 +1171,6 @@ generated/pandas.Series.str.find,../reference/api/pandas.Series.str.find generated/pandas.Series.str.get_dummies,../reference/api/pandas.Series.str.get_dummies generated/pandas.Series.str.get,../reference/api/pandas.Series.str.get generated/pandas.Series.str,../reference/api/pandas.Series.str -generated/pandas.Series.strides,../reference/api/pandas.Series.strides generated/pandas.Series.str.index,../reference/api/pandas.Series.str.index generated/pandas.Series.str.isalnum,../reference/api/pandas.Series.str.isalnum generated/pandas.Series.str.isalpha,../reference/api/pandas.Series.str.isalpha @@ -1419,7 +1220,6 @@ generated/pandas.Series.T,../reference/api/pandas.Series.T generated/pandas.Series.timetuple,../reference/api/pandas.Series.timetuple generated/pandas.Series.to_clipboard,../reference/api/pandas.Series.to_clipboard generated/pandas.Series.to_csv,../reference/api/pandas.Series.to_csv -generated/pandas.Series.to_dense,../reference/api/pandas.Series.to_dense generated/pandas.Series.to_dict,../reference/api/pandas.Series.to_dict generated/pandas.Series.to_excel,../reference/api/pandas.Series.to_excel generated/pandas.Series.to_frame,../reference/api/pandas.Series.to_frame @@ -1428,7 +1228,6 @@ generated/pandas.Series.to_json,../reference/api/pandas.Series.to_json generated/pandas.Series.to_latex,../reference/api/pandas.Series.to_latex generated/pandas.Series.to_list,../reference/api/pandas.Series.to_list generated/pandas.Series.tolist,../reference/api/pandas.Series.tolist -generated/pandas.Series.to_msgpack,../reference/api/pandas.Series.to_msgpack generated/pandas.Series.to_numpy,../reference/api/pandas.Series.to_numpy generated/pandas.Series.to_period,../reference/api/pandas.Series.to_period 
generated/pandas.Series.to_pickle,../reference/api/pandas.Series.to_pickle diff --git a/doc/source/_static/favicon.ico b/doc/source/_static/favicon.ico deleted file mode 100644 index d15c4803b62e6..0000000000000 Binary files a/doc/source/_static/favicon.ico and /dev/null differ diff --git a/doc/source/conf.py b/doc/source/conf.py index b4f719b6e64b2..481c03ab8f388 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -204,7 +204,11 @@ # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -# html_theme_options = {} +html_theme_options = { + "external_links": [], + "github_url": "https://github.com/pandas-dev/pandas", + "twitter_url": "https://twitter.com/pandas_dev", +} # Add any paths that contain custom themes here, relative to this directory. # html_theme_path = ["themes"] @@ -228,7 +232,7 @@ # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -html_favicon = os.path.join(html_static_path[0], "favicon.ico") +html_favicon = "../../web/pandas/static/img/favicon.ico" # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. @@ -296,12 +300,7 @@ for method in methods: # ... and each of its public methods - moved_api_pages.append( - ( - "{old}.{method}".format(old=old, method=method), - "{new}.{method}".format(new=new, method=method), - ) - ) + moved_api_pages.append((f"{old}.{method}", f"{new}.{method}",)) if pattern is None: html_additional_pages = { @@ -309,7 +308,7 @@ } -header = """\ +header = f"""\ .. currentmodule:: pandas .. ipython:: python @@ -323,10 +322,8 @@ pd.options.display.max_rows = 15 import os - os.chdir(r'{}') -""".format( - os.path.dirname(os.path.dirname(__file__)) -) + os.chdir(r'{os.path.dirname(os.path.dirname(__file__))}') +""" html_context = { @@ -575,7 +572,7 @@ def _add_deprecation_prefixes(self, items): for item in items: display_name, sig, summary, real_name = item if self._is_deprecated(real_name): - summary = "(DEPRECATED) %s" % summary + summary = f"(DEPRECATED) {summary}" yield display_name, sig, summary, real_name def get_items(self, names): @@ -620,19 +617,18 @@ def linkcode_resolve(domain, info): lineno = None if lineno: - linespec = "#L{:d}-L{:d}".format(lineno, lineno + len(source) - 1) + linespec = f"#L{lineno}-L{lineno + len(source) - 1}" else: linespec = "" fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__)) if "+" in pandas.__version__: - return "http://github.com/pandas-dev/pandas/blob/master/pandas/{}{}".format( - fn, linespec - ) + return f"http://github.com/pandas-dev/pandas/blob/master/pandas/{fn}{linespec}" else: - return "http://github.com/pandas-dev/pandas/blob/v{}/pandas/{}{}".format( - pandas.__version__, fn, linespec + return ( + f"http://github.com/pandas-dev/pandas/blob/" + f"v{pandas.__version__}/pandas/{fn}{linespec}" ) diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst new file mode 100644 index 0000000000000..2fc2f1fb6ee8d --- /dev/null +++ b/doc/source/development/code_style.rst @@ -0,0 +1,129 @@ +.. _code_style: + +{{ header }} + +======================= +pandas code style guide +======================= + +.. contents:: Table of contents: + :local: + +Patterns +======== + +foo.__class__ +------------- + +*pandas* uses 'type(foo)' instead 'foo.__class__' as it is making the code more +readable. 
+
+For example:
+
+**Good:**
+
+.. code-block:: python
+
+    foo = "bar"
+    type(foo)
+
+**Bad:**
+
+.. code-block:: python
+
+    foo = "bar"
+    foo.__class__
+
+
+String formatting
+=================
+
+Concatenated strings
+--------------------
+
+f-strings
+~~~~~~~~~
+
+*pandas* uses f-string formatting instead of '%' and '.format()' string formatters.
+
+The convention of using f-strings on a string that is concatenated over several lines
+is to prefix only the lines containing the value that needs to be interpreted.
+
+For example:
+
+**Good:**
+
+.. code-block:: python
+
+    foo = "old_function"
+    bar = "new_function"
+
+    my_warning_message = (
+        f"Warning, {foo} is deprecated, "
+        "please use the new and way better "
+        f"{bar}"
+    )
+
+**Bad:**
+
+.. code-block:: python
+
+    foo = "old_function"
+    bar = "new_function"
+
+    my_warning_message = (
+        f"Warning, {foo} is deprecated, "
+        f"please use the new and way better "
+        f"{bar}"
+    )
+
+White spaces
+~~~~~~~~~~~~
+
+Put the white space only at the end of the previous line, so
+there is no whitespace at the beginning of the concatenated string.
+
+For example:
+
+**Good:**
+
+.. code-block:: python
+
+    example_string = (
+        "Some long concatenated string, "
+        "with good placement of the "
+        "whitespaces"
+    )
+
+**Bad:**
+
+.. code-block:: python
+
+    example_string = (
+        "Some long concatenated string,"
+        " with bad placement of the"
+        " whitespaces"
+    )
+
+Representation function (aka 'repr()')
+--------------------------------------
+
+*pandas* uses 'repr()' instead of '%r' and '!r'.
+
+The use of 'repr()' should only happen when the value is not an obvious string.
+
+For example:
+
+**Good:**
+
+.. code-block:: python
+
+    value = str
+    f"Unknown received value, got: {repr(value)}"
+
+**Good:**
+
+.. code-block:: python
+
+    value = str
+    f"Unknown received type, got: '{type(value).__name__}'"
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 677e28b60c51d..93c65ba7358c9 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -24,6 +24,27 @@ and `good first issue
 where you could start out.
 
 Once you've found an interesting issue, you can return here to get your development environment setup.
 
+When you start working on an issue, it's a good idea to assign the issue to yourself,
+so nobody else duplicates the work on it. GitHub restricts assigning issues to maintainers
+of the project only. In most projects, and until recently in pandas, contributors added a
+comment letting others know they are working on an issue. While this is ok, you need to
+check each issue individually, and it's not possible to find the unassigned ones.
+
+For this reason, we implemented a workaround consisting of adding a comment with the exact
+text `take`. When you do it, a GitHub action will automatically assign you the issue
+(this will take seconds, and may require refreshing the page to see it).
+By doing this, it's possible to filter the list of issues and find only the unassigned ones.
+
+So, a good way to find an issue to start contributing to pandas is to check the list of
+`unassigned good first issues `_
+and assign yourself one you like by writing a comment with the exact text `take`.
+
+If for whatever reason you are not able to continue working on the issue, please try to
+unassign it, so other people know it's available again. You can check the list of
+assigned issues, since people may not be working on them anymore.
If you want to work on one +that is assigned, feel free to kindly ask the current assignee if you can take it +(please allow at least a week of inactivity before considering work in the issue discontinued). + Feel free to ask questions on the `mailing list `_ or on `Gitter`_. @@ -208,7 +229,7 @@ We'll now kick off a three-step process: # Build and install pandas python setup.py build_ext --inplace -j 4 - python -m pip install -e . --no-build-isolation + python -m pip install -e . --no-build-isolation --no-use-pep517 At this point you should be able to import pandas from your locally built version:: @@ -236,14 +257,17 @@ Creating a Python environment (pip) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you aren't using conda for your development environment, follow these instructions. -You'll need to have at least python3.5 installed on your system. +You'll need to have at least Python 3.6.1 installed on your system. -.. code-block:: none +**Unix**/**Mac OS** + +.. code-block:: bash # Create a virtual environment # Use an ENV_DIR of your choice. We'll use ~/virtualenvs/pandas-dev # Any parent directories should already exist python3 -m venv ~/virtualenvs/pandas-dev + # Activate the virtualenv . ~/virtualenvs/pandas-dev/bin/activate @@ -251,8 +275,34 @@ You'll need to have at least python3.5 installed on your system. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext --inplace -j 4 - python -m pip install -e . --no-build-isolation + python setup.py build_ext --inplace -j 0 + python -m pip install -e . --no-build-isolation --no-use-pep517 + +**Windows** + +Below is a brief overview on how to set-up a virtual environment with Powershell +under Windows. For details please refer to the +`official virtualenv user guide `__ + +Use an ENV_DIR of your choice. We'll use ~\virtualenvs\pandas-dev where +'~' is the folder pointed to by either $env:USERPROFILE (Powershell) or +%USERPROFILE% (cmd.exe) environment variable. Any parent directories +should already exist. + +.. code-block:: powershell + + # Create a virtual environment + python -m venv $env:USERPROFILE\virtualenvs\pandas-dev + + # Activate the virtualenv. Use activate.bat for cmd.exe + ~\virtualenvs\pandas-dev\Scripts\Activate.ps1 + + # Install the build dependencies + python -m pip install -r requirements-dev.txt + + # Build and install pandas + python setup.py build_ext --inplace -j 0 + python -m pip install -e . --no-build-isolation --no-use-pep517 Creating a branch ----------------- @@ -384,7 +434,7 @@ The utility script ``scripts/validate_docstrings.py`` can be used to get a csv summary of the API documentation. And also validate common errors in the docstring of a specific class, function or method. The summary also compares the list of methods documented in ``doc/source/api.rst`` (which is used to generate -the `API Reference `_ page) +the `API Reference `_ page) and the actual public methods. This will identify methods documented in ``doc/source/api.rst`` that are not actually class methods, and existing methods that are not documented in ``doc/source/api.rst``. @@ -453,7 +503,7 @@ reducing the turn-around time for checking your changes. python make.py --no-api # compile the docs with only a single section, relative to the "source" folder. 
- # For example, compiling only this guide (docs/source/development/contributing.rst) + # For example, compiling only this guide (doc/source/development/contributing.rst) python make.py clean python make.py --single development/contributing.rst @@ -519,8 +569,7 @@ do not make sudden changes to the code that could have the potential to break a lot of user code as a result, that is, we need it to be as *backwards compatible* as possible to avoid mass breakages. -Additional standards are outlined on the `code style wiki -page `_. +Additional standards are outlined on the `pandas code style guide `_ Optional dependencies --------------------- @@ -604,6 +653,9 @@ submitting code to run the check yourself:: to auto-format your code. Additionally, many editors have plugins that will apply ``black`` as you edit files. +You should use a ``black`` version >= 19.10b0 as previous versions are not compatible +with the pandas codebase. + Optionally, you may wish to setup `pre-commit hooks `_ to automatically run ``black`` and ``flake8`` when you make a git commit. This can be done by installing ``pre-commit``:: @@ -751,7 +803,7 @@ Types imports should follow the ``from typing import ...`` convention. So rather import typing - primes = [] # type: typing.List[int] + primes: typing.List[int] = [] You should write @@ -759,19 +811,19 @@ You should write from typing import List, Optional, Union - primes = [] # type: List[int] + primes: List[int] = [] ``Optional`` should be used where applicable, so instead of .. code-block:: python - maybe_primes = [] # type: List[Union[int, None]] + maybe_primes: List[Union[int, None]] = [] You should write .. code-block:: python - maybe_primes = [] # type: List[Optional[int]] + maybe_primes: List[Optional[int]] = [] In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like @@ -787,7 +839,7 @@ The appropriate way to annotate this would be as follows str_type = str class SomeClass2: - str = None # type: str_type + str: str_type = None In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example @@ -818,29 +870,6 @@ The limitation here is that while a human can reasonably understand that ``is_nu With custom types and inference this is not always possible so exceptions are made, but every effort should be exhausted to avoid ``cast`` before going down such paths. -Syntax Requirements -~~~~~~~~~~~~~~~~~~~ - -Because *pandas* still supports Python 3.5, :pep:`526` does not apply and variables **must** be annotated with type comments. Specifically, this is a valid annotation within pandas: - -.. code-block:: python - - primes = [] # type: List[int] - -Whereas this is **NOT** allowed: - -.. code-block:: python - - primes: List[int] = [] # not supported in Python 3.5! - -Note that function signatures can always be annotated per :pep:`3107`: - -.. code-block:: python - - def sum_of_primes(primes: List[int] = []) -> int: - ... - - Pandas-specific Types ~~~~~~~~~~~~~~~~~~~~~ @@ -916,7 +945,7 @@ extensions in `numpy.testing .. note:: - The earliest supported pytest version is 4.0.2. + The earliest supported pytest version is 5.0.1. Writing tests ~~~~~~~~~~~~~ @@ -927,7 +956,7 @@ inspiration. 
If your test requires working with files or network connectivity, there is more information on the `testing page `_ of the wiki. -The ``pandas.util.testing`` module has many special ``assert`` functions that +The ``pandas._testing`` module has many special ``assert`` functions that make it easier to make statements about whether Series or DataFrame objects are equivalent. The easiest way to verify that your code is correct is to explicitly construct the result you expect, then compare the actual result to @@ -1113,7 +1142,7 @@ If your change involves checking that a warning is actually emitted, use .. code-block:: python - import pandas.util.testing as tm + import pandas._testing as tm df = pd.DataFrame() @@ -1267,7 +1296,7 @@ environment by:: or, to use a specific Python interpreter,:: - asv run -e -E existing:python3.5 + asv run -e -E existing:python3.6 This will display stderr from the benchmarks, and use your local ``python`` that comes from your ``$PATH``. @@ -1334,6 +1363,7 @@ some common prefixes along with general guidelines for when to use them: * TST: Additions/updates to tests * BLD: Updates to the build process/scripts * PERF: Performance improvement +* TYP: Type annotations * CLN: Code cleanup The following defines how a commit message should be structured. Please reference the diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 34bc5f44eb0c0..cb32f0e1ee475 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -22,39 +22,39 @@ Next example gives an idea on how a docstring looks like: .. code-block:: python def add(num1, num2): - """ - Add up two integer numbers. - - This function simply wraps the `+` operator, and does not - do anything interesting, except for illustrating what is - the docstring of a very simple function. - - Parameters - ---------- - num1 : int - First number to add - num2 : int - Second number to add - - Returns - ------- - int - The sum of `num1` and `num2` - - See Also - -------- - subtract : Subtract one integer from another - - Examples - -------- - >>> add(2, 2) - 4 - >>> add(25, 0) - 25 - >>> add(10, -10) - 0 - """ - return num1 + num2 + """ + Add up two integer numbers. + + This function simply wraps the `+` operator, and does not + do anything interesting, except for illustrating what is + the docstring of a very simple function. + + Parameters + ---------- + num1 : int + First number to add + num2 : int + Second number to add + + Returns + ------- + int + The sum of `num1` and `num2` + + See Also + -------- + subtract : Subtract one integer from another + + Examples + -------- + >>> add(2, 2) + 4 + >>> add(25, 0) + 25 + >>> add(10, -10) + 0 + """ + return num1 + num2 Some standards exist about docstrings, so they are easier to read, and they can be exported to other formats such as html or pdf. @@ -399,7 +399,7 @@ DataFrame: * DataFrame * pandas.Index * pandas.Categorical -* pandas.SparseArray +* pandas.arrays.SparseArray If the exact type is not relevant, but must be compatible with a numpy array, array-like can be specified. If Any type that can be iterated is diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index 923ef005d5926..33646e5d74757 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -125,7 +125,6 @@ The ``metadata`` field is ``None`` except for: in ``BYTE_ARRAY`` Parquet columns. 
The encoding can be one of:
 
 * ``'pickle'``
-* ``'msgpack'``
 * ``'bson'``
 * ``'json'``
 
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index e341dcb8318bc..89d43e8a43825 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -251,6 +251,48 @@ To use a test, subclass it:
 See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py
 for a list of all the tests available.
 
+.. _extending.extension.arrow:
+
+Compatibility with Apache Arrow
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+An ``ExtensionArray`` can support conversion to / from ``pyarrow`` arrays
+(and thus support, for example, serialization to the Parquet file format)
+by implementing two methods: ``ExtensionArray.__arrow_array__`` and
+``ExtensionDtype.__from_arrow__``.
+
+The ``ExtensionArray.__arrow_array__`` ensures that ``pyarrow`` knows how
+to convert the specific extension array into a ``pyarrow.Array`` (also when
+included as a column in a pandas DataFrame):
+
+.. code-block:: python
+
+    class MyExtensionArray(ExtensionArray):
+        ...
+
+        def __arrow_array__(self, type=None):
+            # convert the underlying array values to a pyarrow Array
+            import pyarrow
+            return pyarrow.array(..., type=type)
+
+The ``ExtensionDtype.__from_arrow__`` method then controls the conversion
+back from pyarrow to a pandas ExtensionArray. This method receives a pyarrow
+``Array`` or ``ChunkedArray`` as its only argument and is expected to return the
+appropriate pandas ``ExtensionArray`` for this dtype and the passed values:
+
+.. code-block:: none
+
+    class ExtensionDtype:
+        ...
+
+        def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> ExtensionArray:
+            ...
+
+See more in the `Arrow documentation `__.
+
+Those methods have been implemented for the nullable integer and string extension
+dtypes included in pandas, and ensure roundtrip to pyarrow and the Parquet file format.
+
 .. _extension dtype dtypes: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/dtypes.py
 .. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
 .. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py
diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst
index 1228f00667f3a..f8a6bb6deb52d 100644
--- a/doc/source/development/index.rst
+++ b/doc/source/development/index.rst
@@ -13,8 +13,11 @@ Development
   :maxdepth: 2
 
   contributing
+  code_style
+  maintaining
   internals
   extending
   developer
   policies
   roadmap
+  meeting
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
new file mode 100644
index 0000000000000..0d1088cc8a6ca
--- /dev/null
+++ b/doc/source/development/maintaining.rst
@@ -0,0 +1,193 @@
+.. _maintaining:
+
+******************
+Pandas Maintenance
+******************
+
+This guide is for pandas' maintainers. It may also be interesting to contributors
+looking to understand the pandas development process and what steps are necessary
+to become a maintainer.
+
+The main contributing guide is available at :ref:`contributing`.
+
+Roles
+-----
+
+Pandas uses two levels of permissions: **triage** and **core** team members.
+
+Triage members can label and close issues and pull requests.
+
+Core team members can label and close issues and pull requests, and can merge
+pull requests.
+
+GitHub publishes the full `list of permissions`_.
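To make the ``__arrow_array__`` / ``__from_arrow__`` interface added in ``doc/source/development/extending.rst`` above more concrete, here is a minimal sketch of the roundtrip those hooks enable for the built-in nullable integer dtype. It is a sketch only: it assumes a sufficiently recent ``pyarrow`` is installed, and the file name is purely illustrative.

.. code-block:: python

    import pandas as pd

    # nullable integer data is backed by the Int64 extension dtype
    df = pd.DataFrame({"a": pd.array([1, 2, None], dtype="Int64")})

    # writing to Parquet converts the column through ExtensionArray.__arrow_array__
    df.to_parquet("example.parquet")

    # reading back reconstructs the column through ExtensionDtype.__from_arrow__,
    # so the extension dtype survives the roundtrip
    result = pd.read_parquet("example.parquet")
    result.dtypes  # a    Int64

Third-party extension arrays that implement the same two hooks get the equivalent behavior.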
+
+Tasks
+-----
+
+Pandas is largely a volunteer project, so these tasks shouldn't be read as
+"expectations" of triage and maintainers. Rather, they're general descriptions
+of what it means to be a maintainer.
+
+* Triage newly filed issues (see :ref:`maintaining.triage`)
+* Review newly opened pull requests
+* Respond to updates on existing issues and pull requests
+* Drive discussion and decisions on stalled issues and pull requests
+* Provide experience / wisdom on API design questions to ensure consistency and maintainability
+* Project organization (run / attend developer meetings, represent pandas)
+
+http://matthewrocklin.com/blog/2019/05/18/maintainer may be interesting background
+reading.
+
+.. _maintaining.triage:
+
+Issue Triage
+------------
+
+
+Here's a typical workflow for triaging a newly opened issue.
+
+1. **Thank the reporter for opening an issue**
+
+   The issue tracker is many people's first interaction with the pandas project itself,
+   beyond just using the library. As such, we want it to be a welcoming, pleasant
+   experience.
+
+2. **Is the necessary information provided?**
+
+   Ideally reporters would fill out the issue template, but many don't.
+   If crucial information (like the version of pandas they used) is missing,
+   feel free to ask for that and label the issue with "Needs info". The
+   report should follow the guidelines in :ref:`contributing.bug_reports`.
+   You may want to link to that if they didn't follow the template.
+
+   Make sure that the title accurately reflects the issue. Edit it yourself
+   if it's not clear.
+
+3. **Is this a duplicate issue?**
+
+   We have many open issues. If a new issue is clearly a duplicate, label the
+   new issue as "Duplicate", assign the milestone "No Action", and close the issue
+   with a link to the original issue. Make sure to still thank the reporter, and
+   encourage them to chime in on the original issue, and perhaps try to fix it.
+
+   If the new issue provides relevant information, such as a better or slightly
+   different example, add it to the original issue as a comment or an edit to
+   the original post.
+
+4. **Is the issue minimal and reproducible?**
+
+   For bug reports, we ask that the reporter provide a minimal reproducible
+   example. See http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports
+   for a good explanation. If the example is not reproducible, or if it's
+   *clearly* not minimal, feel free to ask the reporter if they can provide
+   an example or simplify the provided one. Do acknowledge that writing
+   minimal reproducible examples is hard work. If the reporter is struggling,
+   you can try to write one yourself and we'll edit the original post to include it.
+
+   If a reproducible example can't be provided, add the "Needs info" label.
+
+   If a reproducible example is provided, but you see a simplification,
+   edit the original post with your simpler reproducible example.
+
+5. **Is this a clearly defined feature request?**
+
+   Generally, pandas prefers to discuss and design new features in issues, before
+   a pull request is made. Encourage the submitter to include a proposed API
+   for the new feature. Having them write a full docstring is a good way to
+   pin down specifics.
+
+   We'll need a discussion from several pandas maintainers before deciding whether
+   the proposal is in scope for pandas.
+
+6. **Is this a usage question?**
+
+   We prefer that usage questions are asked on StackOverflow with the pandas tag.
+   https://stackoverflow.com/questions/tagged/pandas
+
+   If it's easy to answer, feel free to link to the relevant documentation section,
+   let them know that in the future this kind of question should be on
+   StackOverflow, and close the issue.
+
+7. **What labels and milestones should I add?**
+
+   Apply the relevant labels. This is a bit of an art, and comes with experience.
+   Look at similar issues to get a feel for how things are labeled.
+
+   If the issue is clearly defined and the fix seems relatively straightforward,
+   label the issue as "Good first issue".
+
+   Typically, new issues will be assigned the "Contributions welcome" milestone,
+   unless it's known that this issue should be addressed in a specific release (say
+   because it's a large regression).
+
+.. _maintaining.closing:
+
+Closing Issues
+--------------
+
+Be delicate here: many people interpret closing an issue as us saying that the
+conversation is over. It's typically best to give the reporter some time to
+respond or self-close their issue if it's determined that the behavior is not a bug,
+or the feature is out of scope. Sometimes reporters just go away though, and
+we'll close the issue after the conversation has died.
+
+Reviewing Pull Requests
+-----------------------
+
+Anybody can review a pull request: regular contributors, triagers, or core-team
+members. Here are some guidelines to check.
+
+* Tests should be in a sensible location.
+* New public APIs should be included somewhere in ``doc/source/reference/``.
+* New / changed API should use the ``versionadded`` or ``versionchanged`` directives in the docstring.
+* User-facing changes should have a whatsnew in the appropriate file.
+* Regression tests should reference the original GitHub issue number like ``# GH-1234``.
+
+Cleaning up old Issues
+----------------------
+
+Every open issue in pandas has a cost. Open issues make finding duplicates harder,
+and can make it harder to know what needs to be done in pandas. That said, closing
+issues isn't a goal on its own. Our goal is to make pandas the best it can be,
+and that's best done by ensuring that the quality of our open issues is high.
+
+Occasionally, bugs are fixed but the issue isn't linked to in the Pull Request.
+In these cases, comment that "This has been fixed, but could use a test." and
+label the issue as "Good First Issue" and "Needs Test".
+
+If an older issue doesn't follow our issue template, edit the original post to
+include a minimal example, the actual output, and the expected output. Uniformity
+in issue reports is valuable.
+
+If an older issue lacks a reproducible example, label it as "Needs Info" and
+ask the reporter to provide one (or write one yourself if possible). If one isn't
+provided reasonably soon, close it according to the policies in :ref:`maintaining.closing`.
+
+Cleaning up old Pull Requests
+-----------------------------
+
+Occasionally, contributors are unable to finish off a pull request.
+If some time has passed (two weeks, say) since the last review requesting changes,
+gently ask if they're still interested in working on this. If another two weeks or
+so passes with no response, thank them for their work and close the pull request.
+Comment on the original issue that "There's a stalled PR at #1234 that may be
+helpful.", and perhaps label the issue as "Good first issue" if the PR was relatively
+close to being accepted.
+
+Additionally, core-team members can push to contributors' branches.
This can be helpful for pushing an important PR across the line, or for fixing a small merge conflict.
+
+Becoming a pandas maintainer
+----------------------------
+
+The full process is outlined in our `governance documents`_. In summary,
+we're happy to give triage permissions to anyone who shows interest by
+being helpful on the issue tracker.
+
+The current list of core-team members is at
+https://github.com/pandas-dev/pandas-governance/blob/master/people.md
+
+.. _governance documents: https://github.com/pandas-dev/pandas-governance
+.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization
\ No newline at end of file
diff --git a/doc/source/development/meeting.rst b/doc/source/development/meeting.rst
new file mode 100644
index 0000000000000..1d19408692cda
--- /dev/null
+++ b/doc/source/development/meeting.rst
@@ -0,0 +1,32 @@
+.. _meeting:
+
+==================
+Developer Meetings
+==================
+
+We hold regular developer meetings on the second Wednesday
+of each month at 18:00 UTC. These meetings and their minutes are open to
+the public. All are welcome to join.
+
+Minutes
+-------
+
+The minutes of past meetings are available in `this Google Document `__.
+
+Calendar
+--------
+
+This calendar shows all the developer meetings.
+
+.. raw:: html
+
+
+You can subscribe to this calendar with the following links:
+
+* `iCal `__
+* `Google calendar `__
+
+Additionally, we'll sometimes have one-off meetings on specific topics.
+These will be published on the same calendar.
+
diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst
index 2083a30af09c3..224948738341e 100644
--- a/doc/source/development/policies.rst
+++ b/doc/source/development/policies.rst
@@ -51,7 +51,7 @@ Pandas may change the behavior of experimental features at any time.
 Python Support
 ~~~~~~~~~~~~~~
-Pandas will only drop support for specific Python versions (e.g. 3.5.x, 3.6.x) in
+Pandas will only drop support for specific Python versions (e.g. 3.6.x, 3.7.x) in
 pandas **major** releases.
 .. _SemVer: https://semver.org
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index 48c722bc16a86..7bd5ba7ecdf0b 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -244,8 +244,8 @@ Pandas DataFrames with timeseries indexes.
 `pydatastream `__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 PyDatastream is a Python interface to the
-`Thomson Dataworks Enterprise (DWE/Datastream) `__
-SOAP API to return indexed Pandas DataFrames with financial data.
+`Refinitiv Datastream (DWS) `__
+REST API to return indexed Pandas DataFrames with financial data.
 This package requires valid credentials for this API (non free).
 `pandaSDMX `__
@@ -327,6 +327,21 @@ PyTables, h5py, and pymongo to move data between non pandas formats. Its graph
 based approach is also extensible by end users for custom formats that may be
 too specific for the core of odo.
+`Pandarallel `__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code.
+It also displays progress bars.
+
+.. code:: python
+
+    from pandarallel import pandarallel
+
+    pandarallel.initialize(progress_bar=True)
+
+    # df.apply(func)
+    df.parallel_apply(func)
+
 `Ray `__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -380,4 +395,3 @@ Library Accessor Classes
 
 ..
_cyberpandas: https://cyberpandas.readthedocs.io/en/latest .. _pdvega: https://altair-viz.github.io/pdvega/ - diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index 41520795bde62..3055a22129b91 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -75,8 +75,8 @@ will be completed: df2.all df2.columns df2.any df2.combine df2.append df2.combine_first - df2.apply df2.compound - df2.applymap df2.consolidate + df2.apply df2.consolidate + df2.applymap df2.D As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically @@ -697,8 +697,9 @@ Plotting See the :ref:`Plotting ` docs. +We use the standard convention for referencing the matplotlib API: + .. ipython:: python - :suppress: import matplotlib.pyplot as plt plt.close('all') diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 9b97aa25a9240..4fef5efbd1551 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -753,28 +753,51 @@ on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise. Tablewise function application ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``DataFrames`` and ``Series`` can of course just be passed into functions. +``DataFrames`` and ``Series`` can be passed into functions. However, if the function needs to be called in a chain, consider using the :meth:`~DataFrame.pipe` method. -Compare the following -.. code-block:: python +First some setup: + +.. ipython:: python - # f, g, and h are functions taking and returning ``DataFrames`` - >>> f(g(h(df), arg1=1), arg2=2, arg3=3) + def extract_city_name(df): + """ + Chicago, IL -> Chicago for city_name column + """ + df['city_name'] = df['city_and_code'].str.split(",").str.get(0) + return df -with the equivalent + def add_country_name(df, country_name=None): + """ + Chicago -> Chicago-US for city_name column + """ + col = 'city_name' + df['city_and_country'] = df[col] + country_name + return df -.. code-block:: python + df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']}) + + +``extract_city_name`` and ``add_country_name`` are functions taking and returning ``DataFrames``. + +Now compare the following: + +.. ipython:: python + + add_country_name(extract_city_name(df_p), country_name='US') + +Is equivalent to: + +.. ipython:: python - >>> (df.pipe(h) - ... .pipe(g, arg1=1) - ... .pipe(f, arg2=2, arg3=3)) + (df_p.pipe(extract_city_name) + .pipe(add_country_name, country_name="US")) Pandas encourages the second style, which is known as method chaining. ``pipe`` makes it easy to use your own or another library's functions in method chains, alongside pandas' methods. -In the example above, the functions ``f``, ``g``, and ``h`` each expected the ``DataFrame`` as the first positional argument. +In the example above, the functions ``extract_city_name`` and ``add_country_name`` each expected a ``DataFrame`` as the first positional argument. What if the function you wish to apply takes its data as, say, the second argument? In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``. ``.pipe`` will route the ``DataFrame`` to the argument specified in the tuple. @@ -1914,20 +1937,36 @@ See :ref:`extending.extension-types` for how to write your own extension that works with pandas. See :ref:`ecosystem.extensions` for a list of third-party libraries that have implemented an extension. -The following table lists all of pandas extension types. 
See the respective +The following table lists all of pandas extension types. For methods requiring ``dtype`` +arguments, strings can be specified as indicated. See the respective documentation sections for more on each type. -=================== ========================= ================== ============================= ============================= -Kind of Data Data Type Scalar Array Documentation -=================== ========================= ================== ============================= ============================= -tz-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :class:`arrays.DatetimeArray` :ref:`timeseries.timezone` -Categorical :class:`CategoricalDtype` (none) :class:`Categorical` :ref:`categorical` -period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays.PeriodArray` :ref:`timeseries.periods` -sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse` -intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` -nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays.IntegerArray` :ref:`integer_na` -Strings :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text` -=================== ========================= ================== ============================= ============================= ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Kind of Data | Data Type | Scalar | Array | String Aliases | Documentation | ++===================+===========================+====================+===============================+=========================================+===============================+ +| tz-aware datetime | :class:`DatetimeTZDtype` | :class:`Timestamp` | :class:`arrays.DatetimeArray` | ``'datetime64[ns, ]'`` | :ref:`timeseries.timezone` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Categorical | :class:`CategoricalDtype` | (none) | :class:`Categorical` | ``'category'`` | :ref:`categorical` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| period | :class:`PeriodDtype` | :class:`Period` | :class:`arrays.PeriodArray` | ``'period[]'``, | :ref:`timeseries.periods` | +| (time spans) | | | | ``'Period[]'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| sparse | :class:`SparseDtype` | (none) | :class:`arrays.SparseArray` | ``'Sparse'``, ``'Sparse[int]'``, | :ref:`sparse` | +| | | | | ``'Sparse[float]'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| intervals | :class:`IntervalDtype` | :class:`Interval` | :class:`arrays.IntervalArray` | ``'interval'``, ``'Interval'``, | :ref:`advanced.intervalindex` | +| | | | | ``'Interval[]'``, | | +| | | | | ``'Interval[datetime64[ns, ]]'``, | | +| | | | | ``'Interval[timedelta64[]]'`` | | 
++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| nullable integer + :class:`Int64Dtype`, ... | (none) | :class:`arrays.IntegerArray` | ``'Int8'``, ``'Int16'``, ``'Int32'``, | :ref:`integer_na` | +| | | | | ``'Int64'``, ``'UInt8'``, ``'UInt16'``, | | +| | | | | ``'UInt32'``, ``'UInt64'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Strings | :class:`StringDtype` | :class:`str` | :class:`arrays.StringArray` | ``'string'`` | :ref:`text` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Boolean (with NA) | :class:`BooleanDtype` | :class:`bool` | :class:`arrays.BooleanArray` | ``'boolean'`` | :ref:`api.arrays.bool` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ Pandas has two ways to store strings. @@ -1982,7 +2021,7 @@ The number of columns of each type in a ``DataFrame`` can be found by calling Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, -or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, +or a passed ``Series``), then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. .. ipython:: python diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index 69bb700c97b15..4e284fe7b5968 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -629,7 +629,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])['total_bill', 'tip'].sum() + tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() tips_summed.head() diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index db687386329bb..fec6bae1e0330 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -617,7 +617,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])['total_bill', 'tip'].sum() + tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() tips_summed.head() diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst index 9e18951fe3f4c..8bd271815549d 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/getting_started/dsintro.rst @@ -564,53 +564,6 @@ to a column created earlier in the same :meth:`~DataFrame.assign`. In the second expression, ``x['C']`` will refer to the newly created column, that's equal to ``dfa['A'] + dfa['B']``. -To write code compatible with all versions of Python, split the assignment in two. - -.. 
ipython:: python - - dependent = pd.DataFrame({"A": [1, 1, 1]}) - (dependent.assign(A=lambda x: x['A'] + 1) - .assign(B=lambda x: x['A'] + 2)) - -.. warning:: - - Dependent assignment may subtly change the behavior of your code between - Python 3.6 and older versions of Python. - - If you wish to write code that supports versions of python before and after 3.6, - you'll need to take care when passing ``assign`` expressions that - - * Update an existing column - * Refer to the newly updated column in the same ``assign`` - - For example, we'll update column "A" and then refer to it when creating "B". - - .. code-block:: python - - >>> dependent = pd.DataFrame({"A": [1, 1, 1]}) - >>> dependent.assign(A=lambda x: x["A"] + 1, B=lambda x: x["A"] + 2) - - For Python 3.5 and earlier the expression creating ``B`` refers to the - "old" value of ``A``, ``[1, 1, 1]``. The output is then - - .. code-block:: console - - A B - 0 2 3 - 1 2 3 - 2 2 3 - - For Python 3.6 and later, the expression creating ``A`` refers to the - "new" value of ``A``, ``[2, 2, 2]``, which results in - - .. code-block:: console - - A B - 0 2 4 - 1 2 4 - 2 2 4 - - Indexing / selection ~~~~~~~~~~~~~~~~~~~~ @@ -723,11 +676,11 @@ similar to an ndarray: # only show the first 5 rows df[:5].T +.. _dsintro.numpy_interop: + DataFrame interoperability with NumPy functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _dsintro.numpy_interop: - Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions can be used with no issues on Series and DataFrame, assuming the data within are numeric: @@ -788,7 +741,7 @@ implementation takes precedence and a Series is returned. np.maximum(ser, idx) NumPy ufuncs are safe to apply to :class:`Series` backed by non-ndarray arrays, -for example :class:`SparseArray` (see :ref:`sparse.calculation`). If possible, +for example :class:`arrays.SparseArray` (see :ref:`sparse.calculation`). If possible, the ufunc is applied without converting the underlying data to an ndarray. Console display diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 7d1150c2f65fa..b3fd443e662a9 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -18,7 +18,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.5.3 and above, 3.6, 3.7, and 3.8. +Officially Python 3.6.1 and above, 3.7, and 3.8. Installing pandas ----------------- @@ -140,7 +140,7 @@ Installing with ActivePython Installation instructions for `ActivePython `__ can be found `here `__. Versions -2.7 and 3.5 include pandas. +2.7, 3.5 and 3.6 include pandas. Installing using your Linux distribution's package manager. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -177,7 +177,7 @@ pandas is equipped with an exhaustive set of unit tests, covering about 97% of the code base as of this writing. To run it on your machine to verify that everything is working (and that you have all of the dependencies, soft and hard, installed), make sure you have `pytest -`__ >= 4.0.2 and `Hypothesis +`__ >= 5.0.1 and `Hypothesis `__ >= 3.58, then run: :: @@ -218,7 +218,7 @@ Recommended dependencies ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. If installed, must be Version 2.6.2 or higher. -* `bottleneck `__: for accelerating certain types of ``nan`` +* `bottleneck `__: for accelerating certain types of ``nan`` evaluations. 
``bottleneck`` uses specialized cython routines to achieve large speedups. If installed, must be Version 1.2.1 or higher. @@ -234,7 +234,8 @@ Optional dependencies ~~~~~~~~~~~~~~~~~~~~~ Pandas has many optional dependencies that are only used for specific methods. -For example, :func:`pandas.read_hdf` requires the ``pytables`` package. If the +For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while +:meth:`DataFrame.to_markdown` requires the ``tabulate`` package. If the optional dependency is not installed, pandas will raise an ``ImportError`` when the method requiring that dependency is called. @@ -249,27 +250,29 @@ PyTables 3.4.2 HDF5-based reading / writing SQLAlchemy 1.1.4 SQL support for databases other than sqlite SciPy 0.19.0 Miscellaneous statistical functions XLsxWriter 0.9.8 Excel writing -blosc Compression for msgpack -fastparquet 0.2.1 Parquet reading / writing +blosc Compression for HDF5 +fastparquet 0.3.2 Parquet reading / writing gcsfs 0.2.2 Google Cloud Storage access html5lib HTML parser for read_html (see :ref:`note `) lxml 3.8.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.2 Visualization -openpyxl 2.4.8 Reading / writing for xlsx files +numba 0.46.0 Alternative execution engine for rolling operations +openpyxl 2.5.7 Reading / writing for xlsx files pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy -pyarrow 0.9.0 Parquet and feather reading / writing +pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and feather reading / writing pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pytables 3.4.2 HDF5 reading / writing qtpy Clipboard I/O -s3fs 0.0.8 Amazon S3 access +s3fs 0.3.0 Amazon S3 access +tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) xarray 0.8.2 pandas-like API for N-dimensional data xclip Clipboard I/O on linux xlrd 1.1.0 Excel reading xlwt 1.2.0 Excel writing xsel Clipboard I/O on linux -zlib Compression for msgpack +zlib Compression for HDF5 ========================= ================== ============================================================= .. _optional_html: @@ -301,3 +304,4 @@ top-level :func:`~pandas.read_html` function: .. _html5lib: https://github.com/html5lib/html5lib-python .. _BeautifulSoup4: http://www.crummy.com/software/BeautifulSoup .. _lxml: http://lxml.de +.. _tabulate: https://github.com/astanin/python-tabulate diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index ec76c60f24257..d8a40c5406dee 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -57,8 +57,7 @@ Here are just a few of the things that pandas does well: Excel files, databases, and saving / loading data from the ultrafast **HDF5 format** - **Time series**-specific functionality: date range generation and frequency - conversion, moving window statistics, moving window linear regressions, - date shifting and lagging, etc. + conversion, moving window statistics, date shifting and lagging. Many of these principles are here to address the shortcomings frequently experienced using other languages / scientific research environments. For data diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 212f3636d0a98..1ed0e8f635b58 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -15,7 +15,7 @@ pandas' own :ref:`10 Minutes to pandas<10min>`. 
More complex recipes are in the :ref:`Cookbook`. -A handy pandas `cheat sheet `_. +A handy pandas `cheat sheet `_. Community guides ================ diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 9ec330c956ff1..10705787dfedf 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -73,6 +73,7 @@ See the :ref:`overview` for more detail about what's in the library. * :doc:`user_guide/missing_data` * :doc:`user_guide/categorical` * :doc:`user_guide/integer_na` + * :doc:`user_guide/boolean` * :doc:`user_guide/visualization` * :doc:`user_guide/computation` * :doc:`user_guide/groupby` @@ -108,6 +109,7 @@ See the :ref:`overview` for more detail about what's in the library. * :doc:`development/index` * :doc:`development/contributing` + * :doc:`development/code_style` * :doc:`development/internals` * :doc:`development/extending` * :doc:`development/developer` diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 0c435e06ac57f..c71350ecd73b3 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -12,7 +12,8 @@ For most data types, pandas uses NumPy arrays as the concrete objects contained with a :class:`Index`, :class:`Series`, or :class:`DataFrame`. -For some data types, pandas extends NumPy's type system. +For some data types, pandas extends NumPy's type system. String aliases for these types +can be found at :ref:`basics.dtypes`. =================== ========================= ================== ============================= Kind of Data Pandas Data Type Scalar Array @@ -25,6 +26,7 @@ Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.array Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical` Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse` Strings :class:`StringDtype` :class:`str` :ref:`api.arrays.string` +Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` =================== ========================= ================== ============================= Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). @@ -442,13 +444,13 @@ Sparse data ----------- Data where a single value is repeated many times (e.g. ``0`` or ``NaN``) may -be stored efficiently as a :class:`SparseArray`. +be stored efficiently as a :class:`arrays.SparseArray`. .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst - SparseArray + arrays.SparseArray .. autosummary:: :toctree: api/ @@ -485,6 +487,28 @@ The ``Series.str`` accessor is available for ``Series`` backed by a :class:`arra See :ref:`api.series.str` for more. +.. _api.arrays.bool: + +Boolean data with missing values +-------------------------------- + +The boolean dtype (with the alias ``"boolean"``) provides support for storing +boolean data (True, False values) with missing values, which is not possible +with a bool :class:`numpy.ndarray`. + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + arrays.BooleanArray + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + BooleanDtype + + .. Dtype attributes which are manually listed in their docstrings: including .. 
it here to make sure a docstring page is built for them diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 4b1a99da7cd4c..c072237850d82 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -59,3 +59,16 @@ objects. api.extensions.ExtensionArray.nbytes api.extensions.ExtensionArray.ndim api.extensions.ExtensionArray.shape + +Additionally, we have some utility methods for ensuring your object +behaves correctly. + +.. autosummary:: + :toctree: api/ + + api.indexers.check_bool_array_indexer + + +The sentinel ``pandas.api.extensions.no_default`` is used as the default +value in some methods. Use an ``is`` comparison to check if the user +provides a non-default value. diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 4b5faed0f4d2d..01aa6c60e3b2f 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -28,19 +28,14 @@ Attributes and underlying data :toctree: api/ DataFrame.dtypes - DataFrame.ftypes - DataFrame.get_dtype_counts - DataFrame.get_ftype_counts DataFrame.select_dtypes DataFrame.values - DataFrame.get_values DataFrame.axes DataFrame.ndim DataFrame.size DataFrame.shape DataFrame.memory_usage DataFrame.empty - DataFrame.is_copy Conversion ~~~~~~~~~~ @@ -142,9 +137,6 @@ Computations / descriptive stats DataFrame.all DataFrame.any DataFrame.clip - DataFrame.clip_lower - DataFrame.clip_upper - DataFrame.compound DataFrame.corr DataFrame.corrwith DataFrame.count @@ -281,6 +273,8 @@ Metadata :attr:`DataFrame.attrs` is a dictionary for storing global metadata for this DataFrame. +.. warning:: ``DataFrame.attrs`` is considered experimental and may change without warning. + .. autosummary:: :toctree: api/ @@ -351,7 +345,6 @@ Serialization / IO / conversion :toctree: api/ DataFrame.from_dict - DataFrame.from_items DataFrame.from_records DataFrame.info DataFrame.to_parquet @@ -366,10 +359,9 @@ Serialization / IO / conversion DataFrame.to_feather DataFrame.to_latex DataFrame.to_stata - DataFrame.to_msgpack DataFrame.to_gbq DataFrame.to_records - DataFrame.to_dense DataFrame.to_string DataFrame.to_clipboard + DataFrame.to_markdown DataFrame.style diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 9c69770c0f1b7..0d9e0b0f4c668 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -18,6 +18,8 @@ Working with options set_option option_context +.. _api.general.testing: + Testing functions ----------------- .. autosummary:: @@ -26,6 +28,7 @@ Testing functions testing.assert_frame_equal testing.assert_series_equal testing.assert_index_equal + testing.assert_extension_array_equal Exceptions and warnings ----------------------- @@ -97,13 +100,11 @@ Scalar introspection api.types.is_bool api.types.is_categorical api.types.is_complex - api.types.is_datetimetz api.types.is_float api.types.is_hashable api.types.is_integer api.types.is_interval api.types.is_number - api.types.is_period api.types.is_re api.types.is_re_compilable api.types.is_scalar diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index 12ca318c815d3..9d5649c37e92f 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -49,7 +49,6 @@ public functions related to data types in pandas. 
api/pandas.DataFrame.blocks api/pandas.DataFrame.as_matrix - api/pandas.DataFrame.ix api/pandas.Index.asi8 api/pandas.Index.data api/pandas.Index.flags @@ -60,7 +59,6 @@ public functions related to data types in pandas. api/pandas.Series.asobject api/pandas.Series.blocks api/pandas.Series.from_array - api/pandas.Series.ix api/pandas.Series.imag api/pandas.Series.real diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index dd59a99b3df9e..ab6ea5aea6c61 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -32,7 +32,6 @@ Properties Index.has_duplicates Index.hasnans Index.dtype - Index.dtype_str Index.inferred_type Index.is_all_dates Index.shape @@ -42,9 +41,6 @@ Properties Index.ndim Index.size Index.empty - Index.strides - Index.itemsize - Index.base Index.T Index.memory_usage @@ -93,7 +89,6 @@ Compatibility with MultiIndex :toctree: api/ Index.set_names - Index.is_lexsorted_for_tuple Index.droplevel Missing values @@ -156,8 +151,6 @@ Selecting Index.asof Index.asof_locs - Index.contains - Index.get_duplicates Index.get_indexer Index.get_indexer_for Index.get_indexer_non_unique @@ -165,7 +158,6 @@ Selecting Index.get_loc Index.get_slice_bound Index.get_value - Index.get_values Index.isin Index.slice_indexer Index.slice_locs @@ -306,7 +298,6 @@ MultiIndex components MultiIndex.set_levels MultiIndex.set_codes - MultiIndex.to_hierarchical MultiIndex.to_flat_index MultiIndex.to_frame MultiIndex.is_lexsorted @@ -322,6 +313,7 @@ MultiIndex selecting :toctree: api/ MultiIndex.get_loc + MultiIndex.get_locs MultiIndex.get_loc_level MultiIndex.get_indexer MultiIndex.get_level_values diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index 91f4942d03b0d..0037d4a4410c3 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -22,7 +22,6 @@ Flat file read_table read_csv read_fwf - read_msgpack Clipboard ~~~~~~~~~ @@ -51,13 +50,13 @@ JSON :toctree: api/ read_json + json_normalize .. currentmodule:: pandas.io.json .. autosummary:: :toctree: api/ - json_normalize build_table_schema .. currentmodule:: pandas @@ -98,6 +97,13 @@ Parquet read_parquet +ORC +~~~ +.. autosummary:: + :toctree: api/ + + read_orc + SAS ~~~ .. autosummary:: @@ -140,7 +146,6 @@ STATA .. 
autosummary:: :toctree: api/ - StataReader.data StataReader.data_label StataReader.value_labels StataReader.variable_labels diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index 4a58055f1c955..fc1c6d6bd6d47 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -35,6 +35,8 @@ Methods DateOffset.copy DateOffset.isAnchored DateOffset.onOffset + DateOffset.is_anchored + DateOffset.is_on_offset BusinessDay ----------- @@ -65,6 +67,8 @@ Methods BusinessDay.copy BusinessDay.isAnchored BusinessDay.onOffset + BusinessDay.is_anchored + BusinessDay.is_on_offset BusinessHour ------------ @@ -94,6 +98,8 @@ Methods BusinessHour.copy BusinessHour.isAnchored BusinessHour.onOffset + BusinessHour.is_anchored + BusinessHour.is_on_offset CustomBusinessDay ----------------- @@ -123,6 +129,8 @@ Methods CustomBusinessDay.copy CustomBusinessDay.isAnchored CustomBusinessDay.onOffset + CustomBusinessDay.is_anchored + CustomBusinessDay.is_on_offset CustomBusinessHour ------------------ @@ -152,6 +160,8 @@ Methods CustomBusinessHour.copy CustomBusinessHour.isAnchored CustomBusinessHour.onOffset + CustomBusinessHour.is_anchored + CustomBusinessHour.is_on_offset MonthOffset ----------- @@ -182,6 +192,8 @@ Methods MonthOffset.copy MonthOffset.isAnchored MonthOffset.onOffset + MonthOffset.is_anchored + MonthOffset.is_on_offset MonthEnd -------- @@ -212,6 +224,8 @@ Methods MonthEnd.copy MonthEnd.isAnchored MonthEnd.onOffset + MonthEnd.is_anchored + MonthEnd.is_on_offset MonthBegin ---------- @@ -242,6 +256,8 @@ Methods MonthBegin.copy MonthBegin.isAnchored MonthBegin.onOffset + MonthBegin.is_anchored + MonthBegin.is_on_offset BusinessMonthEnd ---------------- @@ -272,6 +288,8 @@ Methods BusinessMonthEnd.copy BusinessMonthEnd.isAnchored BusinessMonthEnd.onOffset + BusinessMonthEnd.is_anchored + BusinessMonthEnd.is_on_offset BusinessMonthBegin ------------------ @@ -302,6 +320,8 @@ Methods BusinessMonthBegin.copy BusinessMonthBegin.isAnchored BusinessMonthBegin.onOffset + BusinessMonthBegin.is_anchored + BusinessMonthBegin.is_on_offset CustomBusinessMonthEnd ---------------------- @@ -332,6 +352,8 @@ Methods CustomBusinessMonthEnd.copy CustomBusinessMonthEnd.isAnchored CustomBusinessMonthEnd.onOffset + CustomBusinessMonthEnd.is_anchored + CustomBusinessMonthEnd.is_on_offset CustomBusinessMonthBegin ------------------------ @@ -362,6 +384,8 @@ Methods CustomBusinessMonthBegin.copy CustomBusinessMonthBegin.isAnchored CustomBusinessMonthBegin.onOffset + CustomBusinessMonthBegin.is_anchored + CustomBusinessMonthBegin.is_on_offset SemiMonthOffset --------------- @@ -392,6 +416,8 @@ Methods SemiMonthOffset.copy SemiMonthOffset.isAnchored SemiMonthOffset.onOffset + SemiMonthOffset.is_anchored + SemiMonthOffset.is_on_offset SemiMonthEnd ------------ @@ -422,6 +448,8 @@ Methods SemiMonthEnd.copy SemiMonthEnd.isAnchored SemiMonthEnd.onOffset + SemiMonthEnd.is_anchored + SemiMonthEnd.is_on_offset SemiMonthBegin -------------- @@ -452,6 +480,8 @@ Methods SemiMonthBegin.copy SemiMonthBegin.isAnchored SemiMonthBegin.onOffset + SemiMonthBegin.is_anchored + SemiMonthBegin.is_on_offset Week ---- @@ -482,6 +512,8 @@ Methods Week.copy Week.isAnchored Week.onOffset + Week.is_anchored + Week.is_on_offset WeekOfMonth ----------- @@ -511,6 +543,8 @@ Methods WeekOfMonth.copy WeekOfMonth.isAnchored WeekOfMonth.onOffset + WeekOfMonth.is_anchored + WeekOfMonth.is_on_offset LastWeekOfMonth --------------- @@ -540,6 +574,8 @@ Methods 
LastWeekOfMonth.copy LastWeekOfMonth.isAnchored LastWeekOfMonth.onOffset + LastWeekOfMonth.is_anchored + LastWeekOfMonth.is_on_offset QuarterOffset ------------- @@ -570,6 +606,8 @@ Methods QuarterOffset.copy QuarterOffset.isAnchored QuarterOffset.onOffset + QuarterOffset.is_anchored + QuarterOffset.is_on_offset BQuarterEnd ----------- @@ -600,6 +638,8 @@ Methods BQuarterEnd.copy BQuarterEnd.isAnchored BQuarterEnd.onOffset + BQuarterEnd.is_anchored + BQuarterEnd.is_on_offset BQuarterBegin ------------- @@ -630,6 +670,8 @@ Methods BQuarterBegin.copy BQuarterBegin.isAnchored BQuarterBegin.onOffset + BQuarterBegin.is_anchored + BQuarterBegin.is_on_offset QuarterEnd ---------- @@ -660,6 +702,8 @@ Methods QuarterEnd.copy QuarterEnd.isAnchored QuarterEnd.onOffset + QuarterEnd.is_anchored + QuarterEnd.is_on_offset QuarterBegin ------------ @@ -690,6 +734,8 @@ Methods QuarterBegin.copy QuarterBegin.isAnchored QuarterBegin.onOffset + QuarterBegin.is_anchored + QuarterBegin.is_on_offset YearOffset ---------- @@ -720,6 +766,8 @@ Methods YearOffset.copy YearOffset.isAnchored YearOffset.onOffset + YearOffset.is_anchored + YearOffset.is_on_offset BYearEnd -------- @@ -750,6 +798,8 @@ Methods BYearEnd.copy BYearEnd.isAnchored BYearEnd.onOffset + BYearEnd.is_anchored + BYearEnd.is_on_offset BYearBegin ---------- @@ -780,6 +830,8 @@ Methods BYearBegin.copy BYearBegin.isAnchored BYearBegin.onOffset + BYearBegin.is_anchored + BYearBegin.is_on_offset YearEnd ------- @@ -810,6 +862,8 @@ Methods YearEnd.copy YearEnd.isAnchored YearEnd.onOffset + YearEnd.is_anchored + YearEnd.is_on_offset YearBegin --------- @@ -840,6 +894,8 @@ Methods YearBegin.copy YearBegin.isAnchored YearBegin.onOffset + YearBegin.is_anchored + YearBegin.is_on_offset FY5253 ------ @@ -871,6 +927,8 @@ Methods FY5253.get_year_end FY5253.isAnchored FY5253.onOffset + FY5253.is_anchored + FY5253.is_on_offset FY5253Quarter ------------- @@ -901,6 +959,8 @@ Methods FY5253Quarter.get_weeks FY5253Quarter.isAnchored FY5253Quarter.onOffset + FY5253Quarter.is_anchored + FY5253Quarter.is_on_offset FY5253Quarter.year_has_extra_week Easter @@ -931,6 +991,8 @@ Methods Easter.copy Easter.isAnchored Easter.onOffset + Easter.is_anchored + Easter.is_on_offset Tick ---- @@ -960,6 +1022,8 @@ Methods Tick.copy Tick.isAnchored Tick.onOffset + Tick.is_anchored + Tick.is_on_offset Day --- @@ -989,6 +1053,8 @@ Methods Day.copy Day.isAnchored Day.onOffset + Day.is_anchored + Day.is_on_offset Hour ---- @@ -1018,6 +1084,8 @@ Methods Hour.copy Hour.isAnchored Hour.onOffset + Hour.is_anchored + Hour.is_on_offset Minute ------ @@ -1047,6 +1115,8 @@ Methods Minute.copy Minute.isAnchored Minute.onOffset + Minute.is_anchored + Minute.is_on_offset Second ------ @@ -1076,6 +1146,8 @@ Methods Second.copy Second.isAnchored Second.onOffset + Second.is_anchored + Second.is_on_offset Milli ----- @@ -1105,6 +1177,8 @@ Methods Milli.copy Milli.isAnchored Milli.onOffset + Milli.is_anchored + Milli.is_on_offset Micro ----- @@ -1134,6 +1208,8 @@ Methods Micro.copy Micro.isAnchored Micro.onOffset + Micro.is_anchored + Micro.is_on_offset Nano ---- @@ -1163,6 +1239,8 @@ Methods Nano.copy Nano.isAnchored Nano.onOffset + Nano.is_anchored + Nano.is_on_offset BDay ---- @@ -1195,6 +1273,8 @@ Methods BDay.copy BDay.isAnchored BDay.onOffset + BDay.is_anchored + BDay.is_on_offset BDay.rollback BDay.rollforward @@ -1228,6 +1308,8 @@ Methods BMonthEnd.copy BMonthEnd.isAnchored BMonthEnd.onOffset + BMonthEnd.is_anchored + BMonthEnd.is_on_offset BMonthEnd.rollback BMonthEnd.rollforward @@ -1261,6 
+1343,8 @@ Methods BMonthBegin.copy BMonthBegin.isAnchored BMonthBegin.onOffset + BMonthBegin.is_anchored + BMonthBegin.is_on_offset BMonthBegin.rollback BMonthBegin.rollforward @@ -1298,6 +1382,8 @@ Methods CBMonthEnd.copy CBMonthEnd.isAnchored CBMonthEnd.onOffset + CBMonthEnd.is_anchored + CBMonthEnd.is_on_offset CBMonthEnd.rollback CBMonthEnd.rollforward @@ -1335,6 +1421,8 @@ Methods CBMonthBegin.copy CBMonthBegin.isAnchored CBMonthBegin.onOffset + CBMonthBegin.is_anchored + CBMonthBegin.is_on_offset CBMonthBegin.rollback CBMonthBegin.rollforward @@ -1369,6 +1457,8 @@ Methods CDay.copy CDay.isAnchored CDay.onOffset + CDay.is_anchored + CDay.is_on_offset CDay.rollback CDay.rollforward diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 59910ba357130..4ad6a7b014532 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -29,25 +29,16 @@ Attributes Series.array Series.values Series.dtype - Series.ftype Series.shape Series.nbytes Series.ndim Series.size - Series.strides - Series.itemsize - Series.base Series.T Series.memory_usage Series.hasnans - Series.flags Series.empty Series.dtypes - Series.ftypes - Series.data - Series.is_copy Series.name - Series.put Conversion ---------- @@ -62,7 +53,6 @@ Conversion Series.to_period Series.to_timestamp Series.to_list - Series.get_values Series.__array__ Indexing, iteration @@ -148,8 +138,6 @@ Computations / descriptive stats Series.autocorr Series.between Series.clip - Series.clip_lower - Series.clip_upper Series.corr Series.count Series.cov @@ -186,7 +174,6 @@ Computations / descriptive stats Series.is_monotonic_increasing Series.is_monotonic_decreasing Series.value_counts - Series.compound Reindexing / selection / label manipulation ------------------------------------------- @@ -538,6 +525,8 @@ Metadata :attr:`Series.attrs` is a dictionary for storing global metadata for this Series. +.. warning:: ``Series.attrs`` is considered experimental and may change without warning. + .. autosummary:: :toctree: api/ @@ -587,9 +576,8 @@ Serialization / IO / conversion Series.to_xarray Series.to_hdf Series.to_sql - Series.to_msgpack Series.to_json - Series.to_dense Series.to_string Series.to_clipboard Series.to_latex + Series.to_markdown diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 3d155535e2585..24a47336b0522 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -41,6 +41,7 @@ Style application Styler.set_caption Styler.set_properties Styler.set_uuid + Styler.set_na_rep Styler.clear Styler.pipe diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 2f6addf607877..3db1aa12a4275 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -34,6 +34,8 @@ Standard moving window functions Rolling.quantile Window.mean Window.sum + Window.var + Window.std .. _api.functions_expanding: @@ -72,3 +74,14 @@ Exponentially-weighted moving window functions EWM.var EWM.corr EWM.cov + +Window Indexer +-------------- +.. currentmodule:: pandas + +Base class for defining custom window boundaries. + +.. autosummary:: + :toctree: api/ + + api.indexers.BaseIndexer diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index c6eadd2adadce..d6f5c0c758b60 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -554,6 +554,27 @@ index. 
Both ``rename`` and ``rename_axis`` support specifying a dictionary, ``Series`` or a mapping function to map labels/names to new values. +When working with an ``Index`` object directly, rather than via a ``DataFrame``, +:meth:`Index.set_names` can be used to change the names. + +.. ipython:: python + + mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) + mi.names + + mi2 = mi.rename("new name", level=0) + mi2 + + +You cannot set the names of the MultiIndex via a level. + +.. ipython:: python + :okexcept: + + mi.levels[0].name = "name via level" + +Use :meth:`Index.set_names` instead. + Sorting a ``MultiIndex`` ------------------------ diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst new file mode 100644 index 0000000000000..5276bc6142206 --- /dev/null +++ b/doc/source/user_guide/boolean.rst @@ -0,0 +1,102 @@ +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import pandas as pd + import numpy as np + +.. _boolean: + +************************** +Nullable Boolean Data Type +************************** + +.. versionadded:: 1.0.0 + + +.. _boolean.indexing: + +Indexing with NA values +----------------------- + +pandas does not allow indexing with NA values. Attempting to do so +will raise a ``ValueError``. + +.. ipython:: python + :okexcept: + + s = pd.Series([1, 2, 3]) + mask = pd.array([True, False, pd.NA], dtype="boolean") + s[mask] + +The missing values will need to be explicitly filled with True or False prior +to using the array as a mask. + +.. ipython:: python + + s[mask.fillna(False)] + +.. _boolean.kleene: + +Kleene Logical Operations +------------------------- + +:class:`arrays.BooleanArray` implements `Kleene Logic`_ (sometimes called three-value logic) for +logical operations like ``&`` (and), ``|`` (or) and ``^`` (exclusive-or). + +This table demonstrates the results for every combination. These operations are symmetrical, +so flipping the left- and right-hand side makes no difference in the result. + +================= ========= +Expression Result +================= ========= +``True & True`` ``True`` +``True & False`` ``False`` +``True & NA`` ``NA`` +``False & False`` ``False`` +``False & NA`` ``False`` +``NA & NA`` ``NA`` +``True | True`` ``True`` +``True | False`` ``True`` +``True | NA`` ``True`` +``False | False`` ``False`` +``False | NA`` ``NA`` +``NA | NA`` ``NA`` +``True ^ True`` ``False`` +``True ^ False`` ``True`` +``True ^ NA`` ``NA`` +``False ^ False`` ``False`` +``False ^ NA`` ``NA`` +``NA ^ NA`` ``NA`` +================= ========= + +When an ``NA`` is present in an operation, the output value is ``NA`` only if +the result cannot be determined solely based on the other input. For example, +``True | NA`` is ``True``, because both ``True | True`` and ``True | False`` +are ``True``. In that case, we don't actually need to consider the value +of the ``NA``. + +On the other hand, ``True & NA`` is ``NA``. The result depends on whether +the ``NA`` really is ``True`` or ``False``, since ``True & True`` is ``True``, +but ``True & False`` is ``False``, so we can't determine the output. + + +This differs from how ``np.nan`` behaves in logical operations. Pandas treated +``np.nan`` is *always false in the output*. + +In ``or`` + +.. ipython:: python + + pd.Series([True, False, np.nan], dtype="object") | True + pd.Series([True, False, np.nan], dtype="boolean") | True + +In ``and`` + +.. 
ipython:: python + + pd.Series([True, False, np.nan], dtype="object") & True + pd.Series([True, False, np.nan], dtype="boolean") & True + +.. _Kleene Logic: https://en.wikipedia.org/wiki/Three-valued_logic#Kleene_and_Priest_logics diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 6651f656ae45d..a55326db748fd 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -797,37 +797,52 @@ Assigning a ``Categorical`` to parts of a column of other types will use the val df.dtypes .. _categorical.merge: +.. _categorical.concat: -Merging -~~~~~~~ +Merging / Concatenation +~~~~~~~~~~~~~~~~~~~~~~~ -You can concat two ``DataFrames`` containing categorical data together, -but the categories of these categoricals need to be the same: +By default, combining ``Series`` or ``DataFrames`` which contain the same +categories results in ``category`` dtype, otherwise results will depend on the +dtype of the underlying categories. Merges that result in non-categorical +dtypes will likely have higher memory usage. Use ``.astype`` or +``union_categoricals`` to ensure ``category`` results. .. ipython:: python - cat = pd.Series(["a", "b"], dtype="category") - vals = [1, 2] - df = pd.DataFrame({"cats": cat, "vals": vals}) - res = pd.concat([df, df]) - res - res.dtypes + from pandas.api.types import union_categoricals -In this case the categories are not the same, and therefore an error is raised: + # same categories + s1 = pd.Series(['a', 'b'], dtype='category') + s2 = pd.Series(['a', 'b', 'a'], dtype='category') + pd.concat([s1, s2]) -.. ipython:: python + # different categories + s3 = pd.Series(['b', 'c'], dtype='category') + pd.concat([s1, s3]) - df_different = df.copy() - df_different["cats"].cat.categories = ["c", "d"] - try: - pd.concat([df, df_different]) - except ValueError as e: - print("ValueError:", str(e)) + # Output dtype is inferred based on categories values + int_cats = pd.Series([1, 2], dtype="category") + float_cats = pd.Series([3.0, 4.0], dtype="category") + pd.concat([int_cats, float_cats]) + + pd.concat([s1, s3]).astype('category') + union_categoricals([s1.array, s3.array]) -The same applies to ``df.append(df_different)``. +The following table summarizes the results of merging ``Categoricals``: -See also the section on :ref:`merge dtypes` for notes about preserving merge dtypes and performance. ++-------------------+------------------------+----------------------+-----------------------------+ +| arg1 | arg2 | identical | result | ++===================+========================+======================+=============================+ +| category | category | True | category | ++-------------------+------------------------+----------------------+-----------------------------+ +| category (object) | category (object) | False | object (dtype is inferred) | ++-------------------+------------------------+----------------------+-----------------------------+ +| category (int) | category (float) | False | float (dtype is inferred) | ++-------------------+------------------------+----------------------+-----------------------------+ +See also the section on :ref:`merge dtypes` for notes about +preserving merge dtypes and performance. .. _categorical.union: @@ -918,46 +933,6 @@ the resulting array will always be a plain ``Categorical``: # "b" is coded to 0 throughout, same as c1, different from c2 c.codes -.. 
_categorical.concat: - -Concatenation -~~~~~~~~~~~~~ - -This section describes concatenations specific to ``category`` dtype. See :ref:`Concatenating objects` for general description. - -By default, ``Series`` or ``DataFrame`` concatenation which contains the same categories -results in ``category`` dtype, otherwise results in ``object`` dtype. -Use ``.astype`` or ``union_categoricals`` to get ``category`` result. - -.. ipython:: python - - # same categories - s1 = pd.Series(['a', 'b'], dtype='category') - s2 = pd.Series(['a', 'b', 'a'], dtype='category') - pd.concat([s1, s2]) - - # different categories - s3 = pd.Series(['b', 'c'], dtype='category') - pd.concat([s1, s3]) - - pd.concat([s1, s3]).astype('category') - union_categoricals([s1.array, s3.array]) - - -Following table summarizes the results of ``Categoricals`` related concatenations. - -+----------+--------------------------------------------------------+----------------------------+ -| arg1 | arg2 | result | -+==========+========================================================+============================+ -| category | category (identical categories) | category | -+----------+--------------------------------------------------------+----------------------------+ -| category | category (different categories, both not ordered) | object (dtype is inferred) | -+----------+--------------------------------------------------------+----------------------------+ -| category | category (different categories, either one is ordered) | object (dtype is inferred) | -+----------+--------------------------------------------------------+----------------------------+ -| category | not category | object (dtype is inferred) | -+----------+--------------------------------------------------------+----------------------------+ - Getting data in/out ------------------- diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index bc00cd7f13e13..a2150c207c0b0 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -321,6 +321,11 @@ We provide a number of common statistical functions: :meth:`~Rolling.cov`, Unbiased covariance (binary) :meth:`~Rolling.corr`, Correlation (binary) +.. _stats.rolling_apply: + +Rolling Apply +~~~~~~~~~~~~~ + The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs generic rolling computations. The ``func`` argument should be a single function that produces a single value from an ndarray input. Suppose we wanted to @@ -334,6 +339,48 @@ compute the mean absolute deviation on a rolling basis: @savefig rolling_apply_ex.png s.rolling(window=60).apply(mad, raw=True).plot(style='k') +.. versionadded:: 1.0 + +Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ +if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying +``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). +Numba will be applied in potentially two routines: + +1. If ``func`` is a standard Python function, the engine will `JIT `__ +the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. +2. The engine will JIT the for loop where the apply function is applied to each window. + +The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the +`numba.jit decorator `__. 
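As a quick, hedged sketch of such a call (the Series and the toy function below are only for illustration and require the optional Numba dependency):

.. code-block:: python

   import numpy as np
   import pandas as pd

   s = pd.Series(range(100), dtype="float64")

   def mean_plus_one(x):
       # with raw=True, x arrives as a NumPy ndarray
       return np.mean(x) + 1

   # engine_kwargs spells out the default numba.jit options described above
   s.rolling(10).apply(
       mean_plus_one,
       raw=True,
       engine="numba",
       engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
   )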
+These keyword arguments will be applied to *both* the passed function (if a standard Python function) +and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, +and their default values are set to ``False``, ``True`` and ``False`` respectively. + +.. note:: + + In terms of performance, **the first time a function is run using the Numba engine will be slow** + as Numba will have some function compilation overhead. However, ``rolling`` objects will cache + the function and subsequent calls will be fast. In general, the Numba engine is performant with + a larger amount of data points (e.g. 1+ million). + +.. code-block:: ipython + + In [1]: data = pd.Series(range(1_000_000)) + + In [2]: roll = data.rolling(10) + + In [3]: def f(x): + ...: return np.sum(x) + 5 + # Run the first time, compilation time will affect performance + In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225 + 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) + # Function is cached and performance will improve + In [5]: %timeit roll.apply(f, engine='numba', raw=True) + 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [6]: %timeit roll.apply(f, engine='cython', raw=True) + 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + .. _stats.rolling_window: Rolling windows @@ -466,6 +513,64 @@ default of the index) in a DataFrame. dft dft.rolling('2s', on='foo').sum() +.. _stats.custom_rolling_window: + +Custom window rolling +~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.0 + +In addition to accepting an integer or offset as a ``window`` argument, ``rolling`` also accepts +a ``BaseIndexer`` subclass that allows a user to define a custom method for calculating window bounds. +The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns +a tuple of two arrays, the first being the starting indices of the windows and second being the +ending indices of the windows. Additionally, ``num_values``, ``min_periods``, ``center``, ``closed`` +and will automatically be passed to ``get_window_bounds`` and the defined method must +always accept these arguments. + +For example, if we have the following ``DataFrame``: + +.. ipython:: python + + use_expanding = [True, False, True, False, True] + use_expanding + df = pd.DataFrame({'values': range(5)}) + df + +and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size +1, we can create the following ``BaseIndexer``: + +.. code-block:: ipython + + In [2]: from pandas.api.indexers import BaseIndexer + ...: + ...: class CustomIndexer(BaseIndexer): + ...: + ...: def get_window_bounds(self, num_values, min_periods, center, closed): + ...: start = np.empty(num_values, dtype=np.int64) + ...: end = np.empty(num_values, dtype=np.int64) + ...: for i in range(num_values): + ...: if self.use_expanding[i]: + ...: start[i] = 0 + ...: end[i] = i + 1 + ...: else: + ...: start[i] = i + ...: end[i] = i + self.window_size + ...: return start, end + ...: + + In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + + In [4]: df.rolling(indexer).sum() + Out[4]: + values + 0 0.0 + 1 1.0 + 2 3.0 + 3 3.0 + 4 10.0 + + .. 
_stats.rolling_window.endpoints: Rolling window endpoints diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index c9d3bc3a28c70..f581d183b9413 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -406,10 +406,10 @@ Levels ****** `Prepending a level to a multiindex -`__ +`__ `Flatten Hierarchical columns -`__ +`__ .. _cookbook.missing_data: @@ -430,13 +430,13 @@ Fill forward a reversed timeseries df.reindex(df.index[::-1]).ffill() `cumsum reset at NaN values -`__ +`__ Replace ******* `Using replace with backrefs -`__ +`__ .. _cookbook.grouping: @@ -446,7 +446,7 @@ Grouping The :ref:`grouping ` docs. `Basic grouping with apply -`__ +`__ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to all the columns @@ -462,7 +462,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()]) `Using get_group -`__ +`__ .. ipython:: python @@ -470,7 +470,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to gb.get_group('cat') `Apply to different items in a group -`__ +`__ .. ipython:: python @@ -486,7 +486,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to expected_df `Expanding apply -`__ +`__ .. ipython:: python @@ -502,7 +502,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to `Replacing some values with mean of the rest of a group -`__ +`__ .. ipython:: python @@ -516,7 +516,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to gb.transform(replace) `Sort groups by aggregated data -`__ +`__ .. ipython:: python @@ -533,7 +533,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to sorted_df `Create multiple aggregated columns -`__ +`__ .. ipython:: python @@ -550,7 +550,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to ts `Create a value counts column and reassign back to the DataFrame -`__ +`__ .. ipython:: python @@ -561,7 +561,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df `Shift groups of the values in a column based on the index -`__ +`__ .. ipython:: python @@ -575,7 +575,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df `Select row with maximum value from each group -`__ +`__ .. ipython:: python @@ -587,7 +587,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df_count `Grouping like Python's itertools.groupby -`__ +`__ .. ipython:: python @@ -599,19 +599,19 @@ Expanding data ************** `Alignment and to-date -`__ +`__ `Rolling Computation window based on values instead of counts -`__ +`__ `Rolling Mean by Time Interval -`__ +`__ Splitting ********* `Splitting a frame -`__ +`__ Create a list of dataframes, split using a delineation based on logic included in rows. @@ -635,7 +635,7 @@ Pivot The :ref:`Pivot ` docs. `Partial sums and subtotals -`__ +`__ .. ipython:: python @@ -649,7 +649,7 @@ The :ref:`Pivot ` docs. table.stack('City') `Frequency table like plyr in R -`__ +`__ .. ipython:: python @@ -675,7 +675,7 @@ The :ref:`Pivot ` docs. 
'Grade': lambda x: sum(x) / len(x)}) `Plot pandas DataFrame with year over year data -`__ +`__ To create year and month cross tabulation: @@ -691,7 +691,7 @@ Apply ***** `Rolling apply to organize - Turning embedded lists into a MultiIndex frame -`__ +`__ .. ipython:: python @@ -707,7 +707,7 @@ Apply df_orgz `Rolling apply with a DataFrame returning a Series -`__ +`__ Rolling Apply to multiple columns where function calculates a Series before a Scalar from the Series is returned @@ -727,7 +727,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc s `Rolling apply with a DataFrame returning a Scalar -`__ +`__ Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price) @@ -753,26 +753,26 @@ Timeseries ---------- `Between times -`__ +`__ `Using indexer between time -`__ +`__ `Constructing a datetime range that excludes weekends and includes only certain times -`__ +`__ `Vectorized Lookup -`__ +`__ `Aggregation and plotting time series `__ Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series. `How to rearrange a Python pandas DataFrame? -`__ +`__ `Dealing with duplicates when reindexing a timeseries to a specified frequency -`__ +`__ Calculate the first day of the month for each entry in a DatetimeIndex @@ -795,7 +795,7 @@ The :ref:`Resample ` docs. `__ `Valid frequency arguments to Grouper -`__ +`__ `Grouping using a MultiIndex `__ @@ -804,15 +804,15 @@ The :ref:`Resample ` docs. `__ `Resampling with custom periods -`__ +`__ `Resample intraday frame without adding new days -`__ +`__ `Resample minute data -`__ +`__ -`Resample with groupby `__ +`Resample with groupby `__ .. _cookbook.merge: @@ -822,7 +822,7 @@ Merge The :ref:`Concat ` docs. The :ref:`Join ` docs. `Append two dataframes with overlapping index (emulate R rbind) -`__ +`__ .. ipython:: python @@ -855,16 +855,16 @@ Depending on df construction, ``ignore_index`` may be needed suffixes=('_L', '_R')) `How to set the index and join -`__ +`__ `KDB like asof join -`__ +`__ `Join with a criteria based on the values -`__ +`__ `Using searchsorted to merge based on values inside a range -`__ +`__ .. _cookbook.plotting: @@ -874,31 +874,31 @@ Plotting The :ref:`Plotting ` docs. `Make Matplotlib look like R -`__ +`__ `Setting x-axis major and minor labels -`__ +`__ `Plotting multiple charts in an ipython notebook -`__ +`__ `Creating a multi-line plot -`__ +`__ `Plotting a heatmap -`__ +`__ `Annotate a time-series plot -`__ +`__ `Annotate a time-series plot #2 -`__ +`__ `Generate Embedded plots in excel files using Pandas, Vincent and xlsxwriter `__ `Boxplot for each quartile of a stratifying variable -`__ +`__ .. ipython:: python @@ -918,7 +918,7 @@ Data In/Out ----------- `Performance comparison of SQL vs HDF5 -`__ +`__ .. _cookbook.csv: @@ -930,25 +930,25 @@ The :ref:`CSV ` docs `read_csv in action `__ `appending to a csv -`__ +`__ `Reading a csv chunk-by-chunk -`__ +`__ `Reading only certain rows of a csv chunk-by-chunk -`__ +`__ `Reading the first few lines of a frame -`__ +`__ Reading a file that is compressed but not by ``gzip/bz2`` (the native compressed formats which ``read_csv`` understands). This example shows a ``WinZipped`` file, but is a general application of opening the file within a context manager and using that handle to read. `See here -`__ +`__ `Inferring dtypes from a file -`__ +`__ `Dealing with bad lines `__ @@ -960,7 +960,7 @@ using that handle to read. 
`__ `Write a multi-row index CSV without writing duplicates -`__ +`__ .. _cookbook.csv.multiple_files: @@ -1069,7 +1069,7 @@ SQL The :ref:`SQL ` docs `Reading from databases with SQL -`__ +`__ .. _cookbook.excel: @@ -1079,7 +1079,7 @@ Excel The :ref:`Excel ` docs `Reading from a filelike handle -`__ +`__ `Modifying formatting in XlsxWriter output `__ @@ -1090,7 +1090,7 @@ HTML **** `Reading HTML tables from a server that cannot handle the default request -header `__ +header `__ .. _cookbook.hdf: @@ -1100,54 +1100,54 @@ HDFStore The :ref:`HDFStores ` docs `Simple queries with a Timestamp Index -`__ +`__ `Managing heterogeneous data using a linked multiple table hierarchy `__ `Merging on-disk tables with millions of rows -`__ +`__ `Avoiding inconsistencies when writing to a store from multiple processes/threads -`__ +`__ De-duplicating a large store by chunks, essentially a recursive reduction operation. Shows a function for taking in data from csv file and creating a store by chunks, with date parsing as well. `See here -`__ +`__ `Creating a store chunk-by-chunk from a csv file -`__ +`__ `Appending to a store, while creating a unique index -`__ +`__ `Large Data work flows -`__ +`__ `Reading in a sequence of files, then providing a global unique index to a store while appending -`__ +`__ `Groupby on a HDFStore with low group density -`__ +`__ `Groupby on a HDFStore with high group density -`__ +`__ `Hierarchical queries on a HDFStore -`__ +`__ `Counting with a HDFStore -`__ +`__ `Troubleshoot HDFStore exceptions -`__ +`__ `Setting min_itemsize with strings -`__ +`__ `Using ptrepack to create a completely-sorted-index on a store -`__ +`__ Storing Attributes to a group node @@ -1229,7 +1229,7 @@ in the frame: The offsets of the structure elements may be different depending on the architecture of the machine on which the file was created. Using a raw binary file format like this for general data storage is not recommended, as - it is not cross platform. We recommended either HDF5 or msgpack, both of + it is not cross platform. We recommended either HDF5 or parquet, both of which are supported by pandas' IO facilities. Computation @@ -1305,7 +1305,7 @@ The :ref:`Timedeltas ` docs. datetime.timedelta(minutes=5) + s `Adding and subtracting deltas and dates -`__ +`__ .. ipython:: python @@ -1322,7 +1322,7 @@ The :ref:`Timedeltas ` docs. df.dtypes `Another example -`__ +`__ Values can be set to NaT using np.nan, similar to datetime diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index b86961a71433b..30b1c0b4eac0d 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -30,6 +30,7 @@ Further information on any specific method can be obtained in the missing_data categorical integer_na + boolean visualization computation groupby diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index cf55ce0c9a6d4..a8cdf4a61073d 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -374,7 +374,7 @@ For getting values with a boolean array: df1.loc['a'] > 0 df1.loc[:, df1.loc['a'] > 0] -For getting a value explicitly (equivalent to deprecated ``df.get_value('a','A')``): +For getting a value explicitly: .. ipython:: python @@ -668,7 +668,7 @@ Current behavior KeyError in the future, you can use .reindex() as an alternative. 
See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike + https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike Out[4]: 1 2.0 diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 97b9c2f95dc50..a45d7a4fa1547 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -15,6 +15,10 @@ Nullable integer data type IntegerArray is currently experimental. Its API or implementation may change without warning. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. In :ref:`missing_data`, we saw that pandas primarily uses ``NaN`` to represent missing data. Because ``NaN`` is a float, this forces an array of integers with @@ -23,14 +27,16 @@ much. But if your integer column is, say, an identifier, casting to float can be problematic. Some integers cannot even be represented as floating point numbers. +Construction +------------ + Pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types ` -implemented within pandas. It is not the default dtype for integers, and will not be inferred; -you must explicitly pass the dtype into :meth:`array` or :class:`Series`: +implemented within pandas. .. ipython:: python - arr = pd.array([1, 2, np.nan], dtype=pd.Int64Dtype()) + arr = pd.array([1, 2, None], dtype=pd.Int64Dtype()) arr Or the string alias ``"Int64"`` (note the capital ``"I"``, to differentiate from @@ -40,6 +46,12 @@ NumPy's ``'int64'`` dtype: pd.array([1, 2, np.nan], dtype="Int64") +All NA-like values are replaced with :attr:`pandas.NA`. + +.. ipython:: python + + pd.array([1, 2, np.nan, None, pd.NA], dtype="Int64") + This array can be stored in a :class:`DataFrame` or :class:`Series` like any NumPy array. @@ -50,24 +62,46 @@ NumPy array. You can also pass the list-like object to the :class:`Series` constructor with the dtype. -.. ipython:: python +.. warning:: - s = pd.Series([1, 2, np.nan], dtype="Int64") - s + Currently :meth:`pandas.array` and :meth:`pandas.Series` use different + rules for dtype inference. :meth:`pandas.array` will infer a nullable- + integer dtype -By default (if you don't specify ``dtype``), NumPy is used, and you'll end -up with a ``float64`` dtype Series: + .. ipython:: python -.. ipython:: python + pd.array([1, None]) + pd.array([1, 2]) + + For backwards-compatibility, :class:`Series` infers these as either + integer or float dtype + + .. ipython:: python + + pd.Series([1, None]) + pd.Series([1, 2]) - pd.Series([1, 2, np.nan]) + We recommend explicitly providing the dtype to avoid confusion. + + .. ipython:: python + + pd.array([1, None], dtype="Int64") + pd.Series([1, None], dtype="Int64") + + In the future, we may provide an option for :class:`Series` to infer a + nullable-integer dtype. + +Operations +---------- Operations involving an integer array will behave similar to NumPy arrays. -Missing values will be propagated, and and the data will be coerced to another +Missing values will be propagated, and the data will be coerced to another dtype if needed. .. ipython:: python + s = pd.Series([1, 2, None], dtype="Int64") + # arithmetic s + 1 @@ -105,3 +139,15 @@ Reduction and groupby operations such as 'sum' work as well. df.sum() df.groupby('B').A.sum() + +Scalar NA Value +--------------- + +:class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar +missing value. 
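As a brief sketch of how that scalar can be detected (it mirrors the slicing example that follows):

.. code-block:: python

   import pandas as pd

   arr = pd.array([1, None], dtype="Int64")

   # the missing slot holds the pd.NA singleton
   arr[1] is pd.NA   # True
   pd.isna(arr[1])   # True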
Slicing a single element that's missing will return +:attr:`pandas.NA` + +.. ipython:: python + + a = pd.array([1, None], dtype="Int64") + a[1] diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 173bcf7537154..55bbf6848820b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3,15 +3,6 @@ .. currentmodule:: pandas -{{ header }} - -.. ipython:: python - :suppress: - - clipdf = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['p', 'q', 'r']}, - index=['x', 'y', 'z']) - - =============================== IO tools (text, CSV, HDF5, ...) =============================== @@ -37,13 +28,14 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` + binary;`ORC Format `__;:ref:`read_orc`; binary;`Msgpack `__;:ref:`read_msgpack`;:ref:`to_msgpack` binary;`Stata `__;:ref:`read_stata`;:ref:`to_stata` binary;`SAS `__;:ref:`read_sas`; binary;`SPSS `__;:ref:`read_spss`; binary;`Python Pickle Format `__;:ref:`read_pickle`;:ref:`to_pickle` SQL;`SQL `__;:ref:`read_sql`;:ref:`to_sql` - SQL;`Google Big Query `__;:ref:`read_gbq`;:ref:`to_gbq` + SQL;`Google BigQuery `__;:ref:`read_gbq`;:ref:`to_gbq` :ref:`Here ` is an informal performance comparison for some of these IO methods. @@ -137,7 +129,8 @@ usecols : list-like or callable, default ``None`` .. ipython:: python - from io import StringIO, BytesIO + import pandas as pd + from io import StringIO data = ('col1,col2,col3\n' 'a,b,1\n' 'a,b,2\n' @@ -360,6 +353,7 @@ columns: .. ipython:: python + import numpy as np data = ('a,b,c,d\n' '1,2,3,4\n' '5,6,7,8\n' @@ -440,7 +434,6 @@ worth trying. :suppress: import os - os.remove('foo.csv') .. _io.categorical: @@ -748,6 +741,7 @@ result in byte strings being decoded to unicode in the result: .. ipython:: python + from io import BytesIO data = (b'word,length\n' b'Tr\xc3\xa4umen,7\n' b'Gr\xc3\xbc\xc3\x9fe,5') @@ -1159,7 +1153,7 @@ To completely override the default values that are recognized as missing, specif .. _io.navaluesconst: The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', -'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. +'n/a', 'NA', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. Let us consider some examples: @@ -1525,7 +1519,7 @@ rows will skip the intervening rows. .. ipython:: python - from pandas.util.testing import makeCustomDataframe as mkdf + from pandas._testing import makeCustomDataframe as mkdf df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv('mi.csv') print(open('mi.csv').read()) @@ -2072,6 +2066,8 @@ The Numpy parameter +++++++++++++++++++ .. note:: + This param has been deprecated as of version 1.0.0 and will raise a ``FutureWarning``. + This supports numeric data only. Index and columns labels may be non-numeric, e.g. strings, dates etc. If ``numpy=True`` is passed to ``read_json`` an attempt will be made to sniff @@ -2094,6 +2090,7 @@ data: %timeit pd.read_json(jsonfloats) .. ipython:: python + :okwarning: %timeit pd.read_json(jsonfloats, numpy=True) @@ -2108,6 +2105,7 @@ The speedup is less noticeable for smaller datasets: %timeit pd.read_json(jsonfloats) .. ipython:: python + :okwarning: %timeit pd.read_json(jsonfloats, numpy=True) @@ -2142,27 +2140,26 @@ into a flat table. .. 
ipython:: python - from pandas.io.json import json_normalize data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, {'name': {'given': 'Mose', 'family': 'Regner'}}, {'id': 2, 'name': 'Faye Raker'}] - json_normalize(data) + pd.json_normalize(data) .. ipython:: python data = [{'state': 'Florida', 'shortname': 'FL', 'info': {'governor': 'Rick Scott'}, - 'counties': [{'name': 'Dade', 'population': 12345}, - {'name': 'Broward', 'population': 40000}, - {'name': 'Palm Beach', 'population': 60000}]}, + 'county': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}]}, {'state': 'Ohio', 'shortname': 'OH', 'info': {'governor': 'John Kasich'}, - 'counties': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}]}] + 'county': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] - json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) + pd.json_normalize(data, 'county', ['state', 'shortname', ['info', 'governor']]) The max_level parameter provides more control over which level to end normalization. With max_level=1 the following snippet normalizes until 1st nesting level of the provided dict. @@ -2175,7 +2172,7 @@ With max_level=1 the following snippet normalizes until 1st nesting level of the 'Name': 'Name001'}}, 'Image': {'a': 'b'} }] - json_normalize(data, max_level=1) + pd.json_normalize(data, max_level=1) .. _io.jsonl: @@ -2636,7 +2633,7 @@ that contain URLs. url_df = pd.DataFrame({ 'name': ['Python', 'Pandas'], - 'url': ['https://www.python.org/', 'http://pandas.pydata.org']}) + 'url': ['https://www.python.org/', 'https://pandas.pydata.org']}) print(url_df.to_html(render_links=True)) .. ipython:: python @@ -3388,87 +3385,19 @@ The default is to 'infer': msgpack ------- -pandas supports the ``msgpack`` format for -object serialization. This is a lightweight portable binary format, similar -to binary JSON, that is highly space efficient, and provides good performance -both on the writing (serialization), and reading (deserialization). - -.. warning:: - - The msgpack format is deprecated as of 0.25 and will be removed in a future version. - It is recommended to use pyarrow for on-the-wire transmission of pandas objects. - -.. warning:: - - :func:`read_msgpack` is only guaranteed backwards compatible back to pandas version 0.20.3 - -.. ipython:: python - :okwarning: - - df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB')) - df.to_msgpack('foo.msg') - pd.read_msgpack('foo.msg') - s = pd.Series(np.random.rand(5), index=pd.date_range('20130101', periods=5)) - -You can pass a list of objects and you will receive them back on deserialization. - -.. ipython:: python - :okwarning: - - pd.to_msgpack('foo.msg', df, 'foo', np.array([1, 2, 3]), s) - pd.read_msgpack('foo.msg') - -You can pass ``iterator=True`` to iterate over the unpacked results: - -.. ipython:: python - :okwarning: - - for o in pd.read_msgpack('foo.msg', iterator=True): - print(o) - -You can pass ``append=True`` to the writer to append to an existing pack: - -.. ipython:: python - :okwarning: - - df.to_msgpack('foo.msg', append=True) - pd.read_msgpack('foo.msg') - -Unlike other io methods, ``to_msgpack`` is available on both a per-object basis, -``df.to_msgpack()`` and using the top-level ``pd.to_msgpack(...)`` where you -can pack arbitrary collections of Python lists, dicts, scalars, while intermixing -pandas objects. - -.. 
ipython:: python - :okwarning: - - pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'}, - {'scalar': 1.}, {'s': s}]}) - pd.read_msgpack('foo2.msg') - -.. ipython:: python - :suppress: - :okexcept: - - os.remove('foo.msg') - os.remove('foo2.msg') - -Read/write API -'''''''''''''' - -Msgpacks can also be read from and written to strings. - -.. ipython:: python - :okwarning: +pandas support for ``msgpack`` has been removed in version 1.0.0. It is recommended to use pyarrow for on-the-wire transmission of pandas objects. - df.to_msgpack() +Example pyarrow usage: -Furthermore you can concatenate the strings to produce a list of the original objects. +.. code-block:: python -.. ipython:: python - :okwarning: + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame({'A': [1, 2, 3]}) + >>> context = pa.default_serialization_context() + >>> df_bytestring = context.serialize(df).to_buffer().to_pybytes() - pd.read_msgpack(df.to_msgpack() + s.to_msgpack()) +For documentation on pyarrow, see `here `__. .. _io.hdf5: @@ -3952,6 +3881,8 @@ specified in the format: ``()``, where float may be signed (and fra store.append('dftd', dftd, data_columns=True) store.select('dftd', "C<'-3.5D'") +.. _io.query_multi: + Query MultiIndex ++++++++++++++++ @@ -4678,14 +4609,12 @@ See the `Full Documentation `__. Write to a feather file. .. ipython:: python - :okwarning: df.to_feather('example.feather') Read from a feather file. .. ipython:: python - :okwarning: result = pd.read_feather('example.feather') result @@ -4723,8 +4652,11 @@ Several caveats. * Index level names, if specified, must be strings. * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. -* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message - on an attempt at serialization. +* Non supported types include ``Interval`` and actual Python object types. These will raise a helpful error message + on an attempt at serialization. ``Period`` type is supported with pyarrow >= 0.16.0. +* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data + type (requiring pyarrow >= 0.16.0, and requiring the extension type to implement the needed protocols, + see the :ref:`extension types documentation `). You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, @@ -4764,7 +4696,6 @@ Write to a parquet file. Read from a parquet file. .. ipython:: python - :okwarning: result = pd.read_parquet('example_fp.parquet', engine='fastparquet') result = pd.read_parquet('example_pa.parquet', engine='pyarrow') @@ -4774,7 +4705,6 @@ Read from a parquet file. Read only certain columns of a parquet file. .. ipython:: python - :okwarning: result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b']) @@ -4797,7 +4727,6 @@ Serializing a ``DataFrame`` to parquet may include the implicit index as one or more columns in the output file. Thus, this code: .. 
ipython:: python - :okwarning: df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) df.to_parquet('test.parquet', engine='pyarrow') @@ -4814,7 +4743,6 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to :func:`~pandas.DataFrame.to_parquet`: .. ipython:: python - :okwarning: df.to_parquet('test.parquet', index=False) @@ -4839,13 +4767,12 @@ Partitioning Parquet files Parquet supports partitioning of data based on the values of one or more columns. .. ipython:: python - :okwarning: df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]}) - df.to_parquet(fname='test', engine='pyarrow', + df.to_parquet(path='test', engine='pyarrow', partition_cols=['a'], compression=None) -The `fname` specifies the parent directory to which data will be saved. +The `path` specifies the parent directory to which data will be saved. The `partition_cols` are the column names by which the dataset will be partitioned. Columns are partitioned in the order they are given. The partition splits are determined by the unique values in the partition columns. @@ -4870,6 +4797,17 @@ The above example creates a partitioned dataset that may look like: except OSError: pass +.. _io.orc: + +ORC +--- + +.. versionadded:: 1.0.0 + +Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization +for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the +ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow `__ library. + .. _io.sql: SQL queries @@ -4896,7 +4834,6 @@ See also some :ref:`cookbook examples ` for some advanced strategi The key functions are: .. autosummary:: - :toctree: ../reference/api/ read_sql_table read_sql_query @@ -5586,7 +5523,7 @@ Performance considerations -------------------------- This is an informal comparison of various IO methods, using pandas -0.20.3. Timings are machine dependent and small differences should be +0.24.2. Timings are machine dependent and small differences should be ignored. .. code-block:: ipython @@ -5607,11 +5544,18 @@ Given the next test set: .. 
code-block:: python - from numpy.random import randn + + + import numpy as np + + import os sz = 1000000 - df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz}) + df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz}) + sz = 1000000 + np.random.seed(42) + df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz}) def test_sql_write(df): if os.path.exists('test.sql'): @@ -5620,151 +5564,149 @@ Given the next test set: df.to_sql(name='test_table', con=sql_db) sql_db.close() - def test_sql_read(): sql_db = sqlite3.connect('test.sql') pd.read_sql_query("select * from test_table", sql_db) sql_db.close() - def test_hdf_fixed_write(df): df.to_hdf('test_fixed.hdf', 'test', mode='w') - def test_hdf_fixed_read(): pd.read_hdf('test_fixed.hdf', 'test') - def test_hdf_fixed_write_compress(df): df.to_hdf('test_fixed_compress.hdf', 'test', mode='w', complib='blosc') - def test_hdf_fixed_read_compress(): pd.read_hdf('test_fixed_compress.hdf', 'test') - def test_hdf_table_write(df): df.to_hdf('test_table.hdf', 'test', mode='w', format='table') - def test_hdf_table_read(): pd.read_hdf('test_table.hdf', 'test') - def test_hdf_table_write_compress(df): df.to_hdf('test_table_compress.hdf', 'test', mode='w', complib='blosc', format='table') - def test_hdf_table_read_compress(): pd.read_hdf('test_table_compress.hdf', 'test') - def test_csv_write(df): df.to_csv('test.csv', mode='w') - def test_csv_read(): pd.read_csv('test.csv', index_col=0) - def test_feather_write(df): df.to_feather('test.feather') - def test_feather_read(): pd.read_feather('test.feather') - def test_pickle_write(df): df.to_pickle('test.pkl') - def test_pickle_read(): pd.read_pickle('test.pkl') - def test_pickle_write_compress(df): df.to_pickle('test.pkl.compress', compression='xz') - def test_pickle_read_compress(): pd.read_pickle('test.pkl.compress', compression='xz') -When writing, the top-three functions in terms of speed are are -``test_pickle_write``, ``test_feather_write`` and ``test_hdf_fixed_write_compress``. + def test_parquet_write(df): + df.to_parquet('test.parquet') + + def test_parquet_read(): + pd.read_parquet('test.parquet') + +When writing, the top-three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``. .. code-block:: ipython - In [14]: %timeit test_sql_write(df) - 2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + In [4]: %timeit test_sql_write(df) + 3.29 s ± 43.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [5]: %timeit test_hdf_fixed_write(df) + 19.4 ms ± 560 µs per loop (mean ± std. dev. of 7 runs, 1 loop each) - In [15]: %timeit test_hdf_fixed_write(df) - 194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + In [6]: %timeit test_hdf_fixed_write_compress(df) + 19.6 ms ± 308 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) - In [26]: %timeit test_hdf_fixed_write_compress(df) - 119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + In [7]: %timeit test_hdf_table_write(df) + 449 ms ± 5.61 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - In [16]: %timeit test_hdf_table_write(df) - 623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + In [8]: %timeit test_hdf_table_write_compress(df) + 448 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - In [27]: %timeit test_hdf_table_write_compress(df) - 563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + In [9]: %timeit test_csv_write(df) + 3.66 s ± 26.2 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each) - In [17]: %timeit test_csv_write(df) - 3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + In [10]: %timeit test_feather_write(df) + 9.75 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - In [30]: %timeit test_feather_write(df) - 103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + In [11]: %timeit test_pickle_write(df) + 30.1 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) - In [31]: %timeit test_pickle_write(df) - 109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + In [12]: %timeit test_pickle_write_compress(df) + 4.29 s ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - In [32]: %timeit test_pickle_write_compress(df) - 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + In [13]: %timeit test_parquet_write(df) + 67.6 ms ± 706 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and ``test_hdf_fixed_read``. + .. code-block:: ipython - In [18]: %timeit test_sql_read() - 1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + In [14]: %timeit test_sql_read() + 1.77 s ± 17.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [15]: %timeit test_hdf_fixed_read() + 19.4 ms ± 436 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [16]: %timeit test_hdf_fixed_read_compress() + 19.5 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) - In [19]: %timeit test_hdf_fixed_read() - 14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + In [17]: %timeit test_hdf_table_read() + 38.6 ms ± 857 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) - In [28]: %timeit test_hdf_fixed_read_compress() - 23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + In [18]: %timeit test_hdf_table_read_compress() + 38.8 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) - In [20]: %timeit test_hdf_table_read() - 35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + In [19]: %timeit test_csv_read() + 452 ms ± 9.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - In [29]: %timeit test_hdf_table_read_compress() - 42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + In [20]: %timeit test_feather_read() + 12.4 ms ± 99.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - In [22]: %timeit test_csv_read() - 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + In [21]: %timeit test_pickle_read() + 18.4 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - In [33]: %timeit test_feather_read() - 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + In [22]: %timeit test_pickle_read_compress() + 915 ms ± 7.48 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - In [34]: %timeit test_pickle_read() - 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + In [23]: %timeit test_parquet_read() + 24.4 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) - In [35]: %timeit test_pickle_read_compress() - 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) +For this test case ``test.pkl.compress``, ``test.parquet`` and ``test.feather`` took the least space on disk. Space on disk (in bytes) .. 
code-block:: none - 34816000 Aug 21 18:00 test.sql - 24009240 Aug 21 18:00 test_fixed.hdf - 7919610 Aug 21 18:00 test_fixed_compress.hdf - 24458892 Aug 21 18:00 test_table.hdf - 8657116 Aug 21 18:00 test_table_compress.hdf - 28520770 Aug 21 18:00 test.csv - 16000248 Aug 21 18:00 test.feather - 16000848 Aug 21 18:00 test.pkl - 7554108 Aug 21 18:00 test.pkl.compress + 29519500 Oct 10 06:45 test.csv + 16000248 Oct 10 06:45 test.feather + 8281983 Oct 10 06:49 test.parquet + 16000857 Oct 10 06:47 test.pkl + 7552144 Oct 10 06:48 test.pkl.compress + 34816000 Oct 10 06:42 test.sql + 24009288 Oct 10 06:43 test_fixed.hdf + 24009288 Oct 10 06:43 test_fixed_compress.hdf + 24458940 Oct 10 06:44 test_table.hdf + 24458940 Oct 10 06:44 test_table_compress.hdf diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 7bedc9515abb2..8fdcd8d281a41 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -881,7 +881,7 @@ The merged result: .. note:: The category dtypes must be *exactly* the same, meaning the same categories and the ordered attribute. - Otherwise the result will coerce to ``object`` dtype. + Otherwise the result will coerce to the categories' dtype. .. note:: diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 6c36a6470f841..abbb6feef6056 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -12,10 +12,10 @@ pandas. .. note:: The choice of using ``NaN`` internally to denote missing data was largely - for simplicity and performance reasons. It differs from the MaskedArray - approach of, for example, :mod:`scikits.timeseries`. We are hopeful that - NumPy will soon be able to provide a native NA type solution (similar to R) - performant enough to be used in pandas. + for simplicity and performance reasons. + Starting from pandas 1.0, some optional data types start experimenting + with a native ``NA`` scalar using a mask-based approach. See + :ref:`here ` for more. See the :ref:`cookbook` for some advanced strategies. @@ -110,7 +110,7 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``. .. _missing.inserting: Inserting missing data ----------------------- +~~~~~~~~~~~~~~~~~~~~~~ You can insert missing values by simply assigning to containers. The actual missing value used will be chosen based on the dtype. @@ -135,9 +135,10 @@ For object containers, pandas will use the value given: s.loc[1] = np.nan s +.. _missing_data.calculations: Calculations with missing data ------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Missing values propagate naturally through arithmetic operations between pandas objects. @@ -189,7 +190,7 @@ The sum of an empty or all-NA Series or column of a DataFrame is 0. pd.Series([np.nan]).sum() - pd.Series([]).sum() + pd.Series([], dtype="float64").sum() The product of an empty or all-NA Series or column of a DataFrame is 1. @@ -197,7 +198,7 @@ The product of an empty or all-NA Series or column of a DataFrame is 1. pd.Series([np.nan]).prod() - pd.Series([]).prod() + pd.Series([], dtype="float64").prod() NA values in GroupBy @@ -771,3 +772,177 @@ the ``dtype="Int64"``. s See :ref:`integer_na` for more. + + +.. _missing_data.NA: + +Experimental ``NA`` scalar to denote missing values +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + Experimental: the behaviour of ``pd.NA`` can still change without warning. + +.. 
versionadded:: 1.0.0 + +Starting from pandas 1.0, an experimental ``pd.NA`` value (singleton) is +available to represent scalar missing values. At this moment, it is used in +the nullable :doc:`integer `, boolean and +:ref:`dedicated string ` data types as the missing value indicator. + +The goal of ``pd.NA`` is to provide a "missing" indicator that can be used +consistently across data types (instead of ``np.nan``, ``None`` or ``pd.NaT`` +depending on the data type). + +For example, when there are missing values in a Series with the nullable integer +dtype, it will use ``pd.NA``: + +.. ipython:: python + + s = pd.Series([1, 2, None], dtype="Int64") + s + s[2] + s[2] is pd.NA + +Currently, pandas does not yet use those data types by default (when creating +a DataFrame or Series, or when reading in data), so you need to specify +the dtype explicitly. + +Propagation in arithmetic and comparison operations +--------------------------------------------------- + +In general, missing values *propagate* in operations involving ``pd.NA``. When +one of the operands is unknown, the outcome of the operation is also unknown. + +For example, ``pd.NA`` propagates in arithmetic operations, similarly to +``np.nan``: + +.. ipython:: python + + pd.NA + 1 + "a" * pd.NA + +There are a few special cases when the result is known, even when one of the +operands is ``NA``. + + +================ ====== +Operation Result +================ ====== +``pd.NA ** 0`` 1 +``1 ** pd.NA`` 1 +``-1 ** pd.NA`` -1 +================ ====== + +In equality and comparison operations, ``pd.NA`` also propagates. This deviates +from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always +return ``False``. + +.. ipython:: python + + pd.NA == 1 + pd.NA == pd.NA + pd.NA < 2.5 + +To check if a value is equal to ``pd.NA``, the :func:`isna` function can be +used: + +.. ipython:: python + + pd.isna(pd.NA) + +Exceptions to this basic propagation rule are *reductions* (such as the +mean or the minimum), where pandas defaults to skipping missing values. See +:ref:`above ` for more. + +Logical operations +------------------ + +For logical operations, ``pd.NA`` follows the rules of the +`three-valued logic `__ (or +*Kleene logic*, similarly to R, SQL and Julia). This logic only +propagates missing values when it is logically required. + +For example, for the logical "or" operation (``|``), if one of the operands +is ``True``, we already know the result will be ``True``, regardless of the +other value (so regardless of whether the missing value is ``True`` or ``False``). +In this case, ``pd.NA`` does not propagate: + +.. ipython:: python + + True | False + True | pd.NA + pd.NA | True + +On the other hand, if one of the operands is ``False``, the result depends +on the value of the other operand. Therefore, in this case ``pd.NA`` +propagates: + +.. ipython:: python + + False | True + False | False + False | pd.NA + +The behaviour of the logical "and" operation (``&``) can be derived using +similar logic (where now ``pd.NA`` will not propagate if one of the operands +is already ``False``): + +.. ipython:: python + + False & True + False & False + False & pd.NA + +.. ipython:: python + + True & True + True & False + True & pd.NA + + +``NA`` in a boolean context +--------------------------- + +Since the actual value of an NA is unknown, it is ambiguous to convert NA +to a boolean value. The following raises an error: + +..
ipython:: python + :okexcept: + + bool(pd.NA) + +This also means that ``pd.NA`` cannot be used in a context where it is +evaluated to a boolean, such as ``if condition: ...`` where ``condition`` can +potentially be ``pd.NA``. In such cases, :func:`isna` can be used to check +for ``pd.NA`` or ``condition`` being ``pd.NA`` can be avoided, for example by +filling missing values beforehand. + +A similar situation occurs when using Series or DataFrame objects in ``if`` +statements, see :ref:`gotchas.truth`. + +NumPy ufuncs +------------ + +:attr:`pandas.NA` implements NumPy's ``__array_ufunc__`` protocol. Most ufuncs +work with ``NA``, and generally return ``NA``: + +.. ipython:: python + + np.log(pd.NA) + np.add(pd.NA, 1) + +.. warning:: + + Currently, ufuncs involving an ndarray and ``NA`` will return an + object-dtype filled with NA values. + + .. ipython:: python + + a = np.array([1, 2, 3]) + np.greater(a, pd.NA) + + The return type here may change to return a different array type + in the future. + +See :ref:`dsintro.numpy_interop` for more on ufuncs. diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 8583a9312b690..b28354cd8b5f2 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -14,7 +14,7 @@ Reshaping by pivoting DataFrame objects .. ipython:: python :suppress: - import pandas.util.testing as tm + import pandas._testing as tm tm.N = 3 def unpivot(frame): @@ -38,7 +38,7 @@ For the curious here is how the above ``DataFrame`` was created: .. code-block:: python - import pandas.util.testing as tm + import pandas._testing as tm tm.N = 3 diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 7b590a3a1fcc8..43bb4966ec5bf 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -26,7 +26,7 @@ Assuming you want or need the expressiveness and power of pandas, let's carry on .. ipython:: python :suppress: - from pandas.util.testing import _make_timeseries + from pandas._testing import _make_timeseries # Make a random in-memory dataset ts = _make_timeseries(freq="30S", seed=0) @@ -93,9 +93,9 @@ Use efficient datatypes ----------------------- The default pandas data types are not the most memory efficient. This is -especially true for high-cardinality text data (columns with relatively few -unique values). By using more efficient data types you can store larger datasets -in memory. +especially true for text data columns with relatively few unique values (commonly +referred to as "low-cardinality" data). By using more efficient data types, you +can store larger datasets in memory. .. ipython:: python @@ -358,6 +358,7 @@ results will fit in memory, so we can safely call ``compute`` without running out of memory. At that point it's just a regular pandas object. .. ipython:: python + :okwarning: @savefig dask_resample.png ddf[['x', 'y']].resample("1D").mean().cumsum().compute().plot() diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index c258a8840b714..8588fac4a18d0 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -15,7 +15,7 @@ can be chosen, including 0) is omitted. The compressed values are not actually s arr = np.random.randn(10) arr[2:-2] = np.nan - ts = pd.Series(pd.SparseArray(arr)) + ts = pd.Series(pd.arrays.SparseArray(arr)) ts Notice the dtype, ``Sparse[float64, nan]``. The ``nan`` means that elements in the @@ -51,7 +51,7 @@ identical to their dense counterparts. 
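To make the memory point behind the sparse examples above concrete, here is a minimal sketch (not part of the diff above; the array size and variable names are illustrative) comparing a dense Series with its sparse counterpart:

.. code-block:: python

   import numpy as np
   import pandas as pd

   arr = np.random.randn(10_000)
   arr[2:-2] = np.nan                      # almost every element is missing

   dense = pd.Series(arr)
   sparse = pd.Series(pd.arrays.SparseArray(arr))

   dense.memory_usage()     # all 10_000 float64 values are stored
   sparse.memory_usage()    # only the four non-NaN values (plus their indices)
   sparse.sparse.density    # fraction of non-fill values, 0.0004 here

A similar ``.sparse`` accessor exists on DataFrames whose columns are sparse.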
SparseArray ----------- -:class:`SparseArray` is a :class:`~pandas.api.extensions.ExtensionArray` +:class:`arrays.SparseArray` is a :class:`~pandas.api.extensions.ExtensionArray` for storing an array of sparse values (see :ref:`basics.dtypes` for more on extension arrays). It is a 1-dimensional ndarray-like object storing only values distinct from the ``fill_value``: @@ -61,7 +61,7 @@ only values distinct from the ``fill_value``: arr = np.random.randn(10) arr[2:5] = np.nan arr[7:8] = np.nan - sparr = pd.SparseArray(arr) + sparr = pd.arrays.SparseArray(arr) sparr A sparse array can be converted to a regular (dense) ndarray with :meth:`numpy.asarray` @@ -144,7 +144,7 @@ to ``SparseArray`` and get a ``SparseArray`` as a result. .. ipython:: python - arr = pd.SparseArray([1., np.nan, np.nan, -2., np.nan]) + arr = pd.arrays.SparseArray([1., np.nan, np.nan, -2., np.nan]) np.abs(arr) @@ -153,7 +153,7 @@ the correct dense result. .. ipython:: python - arr = pd.SparseArray([1., -1, -1, -2., -1], fill_value=-1) + arr = pd.arrays.SparseArray([1., -1, -1, -2., -1], fill_value=-1) np.abs(arr) np.abs(arr).to_dense() @@ -194,7 +194,7 @@ From an array-like, use the regular :class:`Series` or .. ipython:: python # New way - pd.DataFrame({"A": pd.SparseArray([0, 1])}) + pd.DataFrame({"A": pd.arrays.SparseArray([0, 1])}) From a SciPy sparse matrix, use :meth:`DataFrame.sparse.from_spmatrix`, @@ -256,10 +256,10 @@ Instead, you'll need to ensure that the values being assigned are sparse .. ipython:: python - df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) + df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1])}) df['B'] = [0, 0] # remains dense df['B'].dtype - df['B'] = pd.SparseArray([0, 0]) + df['B'] = pd.arrays.SparseArray([0, 0]) df['B'].dtype The ``SparseDataFrame.default_kind`` and ``SparseDataFrame.default_fill_value`` attributes diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 006f928c037bd..02550eab86913 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -6,7 +6,7 @@ "source": [ "# Styling\n", "\n", - "This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/style.ipynb).\n", + "This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/user_guide/style.ipynb).\n", "\n", "You can apply **conditional formatting**, the visual styling of a DataFrame\n", "depending on the data within, by using the ``DataFrame.style`` property.\n", @@ -67,6 +67,7 @@ "df = pd.DataFrame({'A': np.linspace(1, 10, 10)})\n", "df = pd.concat([df, pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))],\n", " axis=1)\n", + "df.iloc[3, 3] = np.nan\n", "df.iloc[0, 2] = np.nan" ] }, @@ -402,6 +403,38 @@ "df.style.format({\"B\": lambda x: \"±{:.2f}\".format(abs(x))})" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can format the text displayed for missing values by `na_rep`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.style.format(\"{:.2%}\", na_rep=\"-\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These formatting techniques can be used in combination with styling." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.style.highlight_max().format(None, na_rep=\"-\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -644,7 +677,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notice that you're able share the styles even though they're data aware. The styles are re-evaluated on the new DataFrame they've been `use`d upon." + "Notice that you're able to share the styles even though they're data aware. The styles are re-evaluated on the new DataFrame they've been `use`d upon." ] }, { @@ -659,6 +692,7 @@ "- precision\n", "- captions\n", "- table-wide styles\n", + "- missing values representation\n", "- hiding the index or columns\n", "\n", "Each of these can be specified in two ways:\n", @@ -800,6 +834,32 @@ "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Missing values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can control the default missing values representation for the entire table through `set_na_rep` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(df.style\n", + " .set_na_rep(\"FAIL\")\n", + " .format(None, na_rep=\"PASS\", subset=[\"D\"])\n", + " .highlight_null(\"yellow\"))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1003,7 +1063,7 @@ "- Provide an API that is pleasing to use interactively and is \"good enough\" for many tasks\n", "- Provide the foundations for dedicated libraries to build on\n", "\n", - "If you build a great library on top of this, let us know and we'll [link](http://pandas.pydata.org/pandas-docs/stable/ecosystem.html) to it.\n", + "If you build a great library on top of this, let us know and we'll [link](https://pandas.pydata.org/pandas-docs/stable/ecosystem.html) to it.\n", "\n", "### Subclassing\n", "\n", diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index d521c745ccfe5..88c86ac212f11 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -13,7 +13,7 @@ Text Data Types .. versionadded:: 1.0.0 -There are two main ways to store text data +There are two ways to store text data in pandas: 1. ``object`` -dtype NumPy array. 2. :class:`StringDtype` extension type. @@ -63,7 +63,52 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s s.astype("string") -Everything that follows in the rest of this document applies equally to +.. _text.differences: + +Behavior differences +^^^^^^^^^^^^^^^^^^^^ + +These are places where the behavior of ``StringDtype`` objects differ from +``object`` dtype + +l. For ``StringDtype``, :ref:`string accessor methods` + that return **numeric** output will always return a nullable integer dtype, + rather than either int or float dtype, depending on the presence of NA values. + Methods returning **boolean** output will return a nullable boolean dtype. + + .. ipython:: python + + s = pd.Series(["a", None, "b"], dtype="string") + s + s.str.count("a") + s.dropna().str.count("a") + + Both outputs are ``Int64`` dtype. Compare that with object-dtype + + .. ipython:: python + + s2 = pd.Series(["a", None, "b"], dtype="object") + s2.str.count("a") + s2.dropna().str.count("a") + + When NA values are present, the output dtype is float64. 
Similarly for + methods returning boolean values. + + .. ipython:: python + + s.str.isdigit() + s.str.match("a") + +2. Some string methods, like :meth:`Series.str.decode` are not available + on ``StringArray`` because ``StringArray`` only holds strings, not + bytes. +3. In comparison operations, :class:`arrays.StringArray` and ``Series`` backed + by a ``StringArray`` will return an object with :class:`BooleanDtype`, + rather than a ``bool`` dtype object. Missing values in a ``StringArray`` + will propagate in comparison operations, rather than always comparing + unequal like :attr:`numpy.nan`. + +Everything else that follows in the rest of this document applies equally to ``string`` and ``object`` dtype. .. _text.string_methods: diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 17b02374050d2..08b2ae0a4a837 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -772,7 +772,6 @@ There are several time/date properties that one can access from ``Timestamp`` or week,"The week ordinal of the year" dayofweek,"The number of the day of the week with Monday=0, Sunday=6" weekday,"The number of the day of the week with Monday=0, Sunday=6" - weekday_name,"The name of the day in a week (ex: Friday)" quarter,"Quarter of the date: Jan-Mar = 1, Apr-Jun = 2, etc." days_in_month,"The number of days in the month of the datetime" is_month_start,"Logical indicating if first day of month (defined by frequency)" @@ -1591,10 +1590,10 @@ labels. s = pd.date_range('2000-01-01', '2000-01-05').to_series() s.iloc[2] = pd.NaT - s.dt.weekday_name + s.dt.day_name() # default: label='left', closed='left' - s.resample('B').last().dt.weekday_name + s.resample('B').last().dt.day_name() Notice how the value for Sunday got pulled back to the previous Friday. To get the behavior where the value for Sunday is pushed to Monday, use @@ -1602,7 +1601,7 @@ labels. .. ipython:: python - s.resample('B', label='right', closed='right').last().dt.weekday_name + s.resample('B', label='right', closed='right').last().dt.day_name() The ``axis`` parameter can be set to 0 or 1 and allows you to resample the specified axis for a ``DataFrame``. diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 0a74d67486715..823e177f3e05e 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -236,7 +236,7 @@ I/O enhancements .. ipython:: python - from pandas.util.testing import makeCustomDataframe as mkdf + from pandas._testing import makeCustomDataframe as mkdf df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv('mi.csv') print(open('mi.csv').read()) @@ -417,6 +417,7 @@ Bug fixes original ``Series`` or ``NaN``. For example, .. ipython:: python + :okwarning: strs = 'go', 'bow', 'joe', 'slow' ds = pd.Series(strs) diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index ab48594ddadab..43c6083fdce8f 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -828,8 +828,7 @@ Experimental Since this is an EXPERIMENTAL LIBRARY, the storage format may not be stable until a future release. - .. ipython:: python - :okwarning: + .. code-block:: python df = pd.DataFrame(np.random.rand(5, 2), columns=list('AB')) df.to_msgpack('foo.msg') @@ -841,8 +840,7 @@ Experimental You can pass ``iterator=True`` to iterator over the unpacked results - .. ipython:: python - :okwarning: + .. 
code-block:: python for o in pd.read_msgpack('foo.msg', iterator=True): print(o) diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 6242c40d44bf8..4f9ab761334e7 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -224,7 +224,7 @@ Enhancements .. code-block:: ipython - In [28]: import pandas.util.testing as tm + In [28]: import pandas._testing as tm In [29]: panel = tm.makePanel(5) diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index c27ada6ef3b58..95e354e425143 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -312,14 +312,13 @@ Timezone handling improvements previously this resulted in ``Exception`` or ``TypeError`` (:issue:`7812`) .. ipython:: python - :okwarning: ts = pd.Timestamp('2014-08-01 09:00', tz='US/Eastern') ts ts.tz_localize(None) - didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', - periods=10, tz='US/Eastern') + didx = pd.date_range(start='2014-08-01 09:00', freq='H', + periods=10, tz='US/Eastern') didx didx.tz_localize(None) @@ -853,7 +852,7 @@ Other notable API changes: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead - See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy + See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy - ``merge``, ``DataFrame.merge``, and ``ordered_merge`` now return the same type as the ``left`` argument (:issue:`7737`). diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index b58eabaed6127..292351c709940 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -172,7 +172,7 @@ Other enhancements: 4 True True True True - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). -- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. +- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`). - ``Timedelta`` now supports arithmetic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`). - Added ``Timedelta.to_timedelta64()`` method to the public API (:issue:`8884`). diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index fc638e35ed88b..855d0b8695bb1 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -528,7 +528,7 @@ Deprecations `seaborn `_ for similar but more refined functionality (:issue:`3445`). The documentation includes some examples how to convert your existing code - from ``rplot`` to seaborn `here `__. + from ``rplot`` to seaborn `here `__. - The ``pandas.sandbox.qtpandas`` interface is deprecated and will be removed in a future version. We refer users to the external package `pandas-qt `_. (:issue:`9615`) diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index a7174c6325f86..d3f96d4185d65 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -1279,7 +1279,7 @@ Bug Fixes - Removed ``millisecond`` property of ``DatetimeIndex``. This would always raise a ``ValueError`` (:issue:`12019`). 
- Bug in ``Series`` constructor with read-only data (:issue:`11502`) -- Removed ``pandas.util.testing.choice()``. Should use ``np.random.choice()``, instead. (:issue:`12386`) +- Removed ``pandas._testing.choice()``. Should use ``np.random.choice()``, instead. (:issue:`12386`) - Bug in ``.loc`` setitem indexer preventing the use of a TZ-aware DatetimeIndex (:issue:`12050`) - Bug in ``.style`` indexes and MultiIndexes not appearing (:issue:`11655`) - Bug in ``to_msgpack`` and ``from_msgpack`` which did not correctly serialize or deserialize ``NaT`` (:issue:`12307`). diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 61a65415f6b57..6eb509a258430 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -707,6 +707,7 @@ A ``Series`` will now correctly promote its dtype for assignment with incompat v .. ipython:: python + :okwarning: s = pd.Series() @@ -1224,6 +1225,7 @@ Previously, sparse data were ``float64`` dtype by default, even if all inputs we As of v0.19.0, sparse data keeps the input dtype, and uses more appropriate ``fill_value`` defaults (``0`` for ``int64`` dtype, ``False`` for ``bool`` dtype). .. ipython:: python + :okwarning: pd.SparseArray([1, 2, 0, 0], dtype=np.int64) pd.SparseArray([True, False, False, False]) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index c7278d5a47ba6..ceb1c7f27231b 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -33,7 +33,7 @@ Check the :ref:`API Changes ` and :ref:`deprecations .. note:: - This is a combined release for 0.20.0 and and 0.20.1. + This is a combined release for 0.20.0 and 0.20.1. Version 0.20.1 contains one additional change for backwards-compatibility with downstream projects using pandas' ``utils`` routines. (:issue:`16250`) .. contents:: What's new in v0.20.0 @@ -1360,7 +1360,7 @@ provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:` .. code-block:: ipython - In [133]: import pandas.util.testing as tm + In [133]: import pandas._testing as tm In [134]: p = tm.makePanel() diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index 34b610e8af0b3..71969c4de6b02 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -20,7 +20,7 @@ Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` function and :meth:`DataFrame.to_parquet` method, see :ref:`here `. - New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying categoricals independent of the data, see :ref:`here `. -- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here `. +- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here `. - Compatibility fixes for pypy, see :ref:`here `. - Additions to the ``drop``, ``reindex`` and ``rename`` API to make them more consistent, see :ref:`here `. - Addition of the new methods ``DataFrame.infer_objects`` (see :ref:`here `) and ``GroupBy.pipe`` (see :ref:`here `). 
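As a brief illustration of the ``DataFrame.infer_objects`` method mentioned in the highlight above, a minimal sketch (the frame is illustrative, not taken from the release notes):

.. code-block:: python

   import pandas as pd

   # the stray string forces the column to object dtype
   df = pd.DataFrame({"A": ["a", 1, 2, 3]})
   df = df.iloc[1:]            # only numeric values remain, dtype is still object

   df.dtypes                   # A    object
   df.infer_objects().dtypes   # A    int64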
@@ -390,7 +390,7 @@ Sum/Prod of all-NaN or empty Series/DataFrames is now consistently NaN The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames no longer depends on -whether `bottleneck `__ is installed, and return value of ``sum`` and ``prod`` on an empty Series has changed (:issue:`9422`, :issue:`15507`). +whether `bottleneck `__ is installed, and return value of ``sum`` and ``prod`` on an empty Series has changed (:issue:`9422`, :issue:`15507`). Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs `. @@ -428,6 +428,7 @@ Note that this also changes the sum of an empty ``Series``. Previously this alwa but for consistency with the all-NaN case, this was changed to return NaN as well: .. ipython:: python + :okwarning: pd.Series([]).sum() @@ -469,7 +470,7 @@ Current behavior KeyError in the future, you can use .reindex() as an alternative. See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike + https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike Out[4]: 1 2.0 @@ -926,7 +927,7 @@ Other API changes - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytables standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) -- Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) +- Removed the ``@slow`` decorator from ``pandas._testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) - Moved definition of ``MergeError`` to the ``pandas.errors`` module. - The signature of :func:`Series.set_axis` and :func:`DataFrame.set_axis` has been changed from ``set_axis(axis, labels)`` to ``set_axis(labels, axis=0)``, for consistency with the rest of the API. The old signature is deprecated and will show a ``FutureWarning`` (:issue:`14636`) - :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`) diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index ea36b35d61740..75949a90d09a6 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -55,6 +55,7 @@ The default sum for empty or all-*NA* ``Series`` is now ``0``. *pandas 0.22.0* .. ipython:: python + :okwarning: pd.Series([]).sum() pd.Series([np.nan]).sum() @@ -67,6 +68,7 @@ pandas 0.20.3 without bottleneck, or pandas 0.21.x), use the ``min_count`` keyword. .. ipython:: python + :okwarning: pd.Series([]).sum(min_count=1) @@ -85,6 +87,7 @@ required for a non-NA sum or product. returning ``1`` instead. .. 
ipython:: python + :okwarning: pd.Series([]).prod() pd.Series([np.nan]).prod() diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index f4c283ea742f7..b9e1b5060d1da 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -648,7 +648,7 @@ provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:` .. code-block:: ipython - In [75]: import pandas.util.testing as tm + In [75]: import pandas._testing as tm In [76]: p = tm.makePanel() diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 42579becd4237..85de0150a5a28 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -353,7 +353,7 @@ Example: mi = pd.MultiIndex.from_product([list('AB'), list('CD'), list('EF')], names=['AB', 'CD', 'EF']) - df = pd.DataFrame([i for i in range(len(mi))], index=mi, columns=['N']) + df = pd.DataFrame(list(range(len(mi))), index=mi, columns=['N']) df df.rename_axis(index={'CD': 'New'}) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index be137eaabd40a..b18d022349001 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -170,7 +170,7 @@ which level to end normalization (:issue:`23843`): The repr now looks like this: -.. ipython:: python +.. code-block:: ipython from pandas.io.json import json_normalize data = [{ @@ -354,6 +354,7 @@ When passed DataFrames whose values are sparse, :func:`concat` will now return a :class:`Series` or :class:`DataFrame` with sparse values, rather than a :class:`SparseDataFrame` (:issue:`25702`). .. ipython:: python + :okwarning: df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) @@ -910,6 +911,7 @@ by a ``Series`` or ``DataFrame`` with sparse values. **New way** .. ipython:: python + :okwarning: df = pd.DataFrame({"A": pd.SparseArray([0, 0, 1, 2])}) df.dtypes diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 2e9524fea89b1..944021ca0fcae 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -9,7 +9,7 @@ including other versions of pandas. I/O and LZMA ~~~~~~~~~~~~ -Some users may unknowingly have an incomplete Python installation lacking the `lzma` module from the standard library. In this case, `import pandas` failed due to an `ImportError` (:issue: `27575`). +Some users may unknowingly have an incomplete Python installation lacking the `lzma` module from the standard library. In this case, `import pandas` failed due to an `ImportError` (:issue:`27575`). Pandas will now warn, rather than raising an `ImportError` if the `lzma` module is not present. Any subsequent attempt to use `lzma` methods will raise a `RuntimeError`. A possible fix for the lack of the `lzma` module is to ensure you have the necessary libraries and then re-install Python. For example, on MacOS installing Python with `pyenv` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like `xz`). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 664325ac063c0..072d1bae2a2b9 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -156,8 +156,7 @@ Other new features New plotting methods ~~~~~~~~~~~~~~~~~~~~ -.. ipython:: python - :suppress: +.. 
code-block:: python import pandas as pd fx = pd.read_pickle('data/fx_prices') @@ -165,7 +164,7 @@ New plotting methods ``Series.plot`` now supports a ``secondary_y`` option: -.. ipython:: python +.. code-block:: python plt.figure() diff --git a/doc/source/whatsnew/v0.8.1.rst b/doc/source/whatsnew/v0.8.1.rst index aaf1778bf637d..1e6b9746c85a5 100644 --- a/doc/source/whatsnew/v0.8.1.rst +++ b/doc/source/whatsnew/v0.8.1.rst @@ -29,7 +29,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved implementation of rolling min and max (thanks to `Bottleneck - `__ !) + `__ !) - Add accelerated ``'median'`` GroupBy option (:issue:`1358`) - Significantly improve the performance of parsing ISO8601-format date strings with ``DatetimeIndex`` or ``to_datetime`` (:issue:`1571`) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst old mode 100644 new mode 100755 index fa1669b1f3343..5f79accc5c679 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1,54 +1,86 @@ -.. _whatsnew_1000: +.. _whatsnew_100: What's new in 1.0.0 (??) ------------------------ -New Deprecation Policy -~~~~~~~~~~~~~~~~~~~~~~ +These are the changes in pandas 1.0.0. See :ref:`release` for a full changelog +including other versions of pandas. -Starting with Pandas 1.0.0, pandas will adopt a version of `SemVer`_. +.. note:: -Historically, pandas has used a "rolling" deprecation policy, with occasional -outright breaking API changes. Where possible, we would deprecate the behavior -we'd like to change, giving an option to adopt the new behavior (via a keyword -or an alternative method), and issuing a warning for users of the old behavior. -Sometimes, a deprecation was not possible, and we would make an outright API -breaking change. + The pandas 1.0 release removed a lot of functionality that was deprecated + in previous releases (see :ref:`below ` + for an overview). It is recommended to first upgrade to pandas 0.25 and to + ensure your code is working without warnings, before upgrading to pandas + 1.0. -We'll continue to *introduce* deprecations in major and minor releases (e.g. -1.0.0, 1.1.0, ...). Those deprecations will be *enforced* in the next major -release. -Note that *behavior changes* and *API breaking changes* are not identical. API -breaking changes will only be released in major versions. If we consider a -behavior to be a bug, and fixing that bug induces a behavior change, we'll -release that change in a minor release. This is a sometimes difficult judgment -call that we'll do our best on. +New Deprecation Policy +~~~~~~~~~~~~~~~~~~~~~~ -This doesn't mean that pandas' pace of development will slow down. In the `2019 -Pandas User Survey`_, about 95% of the respondents said they considered pandas -"stable enough". This indicates there's an appetite for new features, even if it -comes at the cost of break API. The difference is that now API breaking changes -will be accompanied with a bump in the major version number (e.g. pandas 1.5.1 --> 2.0.0). +Starting with Pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to +version releases. Briefly, + +* Deprecations will be introduced in minor releases (e.g. 1.1.0, 1.2.0, 2.1.0, ...) +* Deprecations will be enforced in major releases (e.g. 1.0.0, 2.0.0, 3.0.0, ...) +* API-breaking changes will be made only in major releases (except for experimental features) See :ref:`policies.version` for more. .. _2019 Pandas User Survey: http://dev.pandas.io/pandas-blog/2019-pandas-user-survey.html .. 
_SemVer: https://semver.org +{{ header }} + +.. --------------------------------------------------------------------------- + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_100.NA: + +Experimental ``NA`` scalar to denote missing values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A new ``pd.NA`` value (singleton) is introduced to represent scalar missing +values. Up to now, pandas used several values to represent missing data: ``np.nan`` is used for this for float data, ``np.nan`` or +``None`` for object-dtype data and ``pd.NaT`` for datetime-like data. The +goal of ``pd.NA`` is to provide a "missing" indicator that can be used +consistently across data types. ``pd.NA`` is currently used by the nullable integer and boolean +data types and the new string data type (:issue:`28095`). + .. warning:: - The minimum supported Python version will be bumped to 3.6 in a future release. + Experimental: the behaviour of ``pd.NA`` can still change without warning. -{{ header }} +For example, creating a Series using the nullable integer dtype: -These are the changes in pandas 1.0.0. See :ref:`release` for a full changelog -including other versions of pandas. +.. ipython:: python + s = pd.Series([1, 2, None], dtype="Int64") + s + s[2] + +Compared to ``np.nan``, ``pd.NA`` behaves differently in certain operations. +In addition to arithmetic operations, ``pd.NA`` also propagates as "missing" +or "unknown" in comparison operations: + +.. ipython:: python + + np.nan > 1 + pd.NA > 1 + +For logical operations, ``pd.NA`` follows the rules of the +`three-valued logic `__ (or +*Kleene logic*). For example: + +.. ipython:: python + + pd.NA | True + +For more, see :ref:`NA section ` in the user guide on missing +data. -Enhancements -~~~~~~~~~~~~ .. _whatsnew_100.string: @@ -56,14 +88,14 @@ Dedicated string data type ^^^^^^^^^^^^^^^^^^^^^^^^^^ We've added :class:`StringDtype`, an extension type dedicated to string data. -Previously, strings were typically stored in object-dtype NumPy arrays. +Previously, strings were typically stored in object-dtype NumPy arrays. (:issue:`29975`) .. warning:: ``StringDtype`` is currently considered experimental. The implementation and parts of the API may change without warning. -The text extension type solves several issues with object-dtype NumPy arrays: +The ``'string'`` extension type solves several issues with object-dtype NumPy arrays: 1. You can accidentally store a *mixture* of strings and non-strings in an ``object`` dtype array. A ``StringArray`` can only store strings. @@ -88,18 +120,84 @@ You can use the alias ``"string"`` as well. The usual string accessor methods work. Where appropriate, the return type of the Series or columns of a DataFrame will also have string dtype. +.. ipython:: python + s.str.upper() s.str.split('b', expand=True).dtypes +String accessor methods returning integers will return a value with :class:`Int64Dtype` + +.. ipython:: python + + s.str.count("a") + We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. -.. _whatsnew_1000.enhancements.other: +.. _whatsnew_100.boolean: + +Boolean data type with missing values support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added :class:`BooleanDtype` / :class:`~arrays.BooleanArray`, an extension +type dedicated to boolean data that can hold missing values. The default +``bool`` data type based on a bool-dtype NumPy array, the column can only hold +``True`` or ``False``, and not missing values. 
This new :class:`~arrays.BooleanArray` +can store missing values as well by keeping track of this in a separate mask. +(:issue:`29555`, :issue:`30095`) + +.. ipython:: python + + pd.Series([True, False, None], dtype=pd.BooleanDtype()) + +You can use the alias ``"boolean"`` as well. + +.. ipython:: python + + s = pd.Series([True, False, None], dtype="boolean") + s + +.. _whatsnew_100.numba_rolling_apply: + +Using Numba in ``rolling.apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` that allows the user to execute the +routine using `Numba `__ instead of Cython. Using the Numba engine +can yield significant performance gains if the apply function can operate on numpy arrays and +the data set is larger (1 million rows or greater). For more details, see +:ref:`rolling apply documentation ` (:issue:`28987`) + +.. _whatsnew_100.custom_window: + +Defining custom windows for rolling operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added a :func:`pandas.api.indexers.BaseIndexer` class that allows users to define how +window bounds are created during ``rolling`` operations. Users can define their own ``get_window_bounds`` +method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate the start and end +indices used for each window during the rolling aggregation. For more details and example usage, see +the :ref:`custom window rolling documentation ` + +.. _whatsnew_100.to_markdown: + +Converting to Markdown +^^^^^^^^^^^^^^^^^^^^^^ + +We've added :meth:`~DataFrame.to_markdown` for creating a markdown table (:issue:`11052`) + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=['a', 'a', 'b']) + print(df.to_markdown()) + +.. _whatsnew_100.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`) +- Added the ``na_value`` argument to :meth:`Series.to_numpy`, :meth:`Index.to_numpy` and :meth:`DataFrame.to_numpy` to control the value used for missing data (:issue:`30322`) - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`) - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) - The :ref:`integer dtype ` with support for missing values and the @@ -110,8 +208,29 @@ Other enhancements (depending on the presence of missing data) or object dtype column. (:issue:`28368`) - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`) - :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`) +- Implemented :meth:`pandas.core.window.Window.var` and :meth:`pandas.core.window.Window.std` functions (:issue:`26597`) - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) +- :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) +- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`) +- Roundtripping DataFrames with nullable integer, string and period data types to parquet + (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine + now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`). 
+- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) +- :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`) +- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`) +- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`) +- DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`) +- :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`) +- :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`) +- :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) +- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`) +- :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`) +- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`) +- Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) +- :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) +- :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) + Build Changes ^^^^^^^^^^^^^ @@ -121,43 +240,50 @@ cythonized files in the source distribution uploaded to PyPI (:issue:`28341`, :i a built distribution (wheel) or via conda, this shouldn't have any effect on you. If you're building pandas from source, you should no longer need to install Cython into your build environment before calling ``pip install pandas``. -.. _whatsnew_1000.api_breaking: +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_1000.api_breaking.MultiIndex._names: +.. _whatsnew_100.api_breaking.MultiIndex._names: -``MultiIndex.levels`` do not hold level names any longer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Avoid using names from ``MultiIndex.levels`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- A :class:`MultiIndex` previously stored the level names as attributes of each of its - :attr:`MultiIndex.levels`. From Pandas 1.0, the names are only accessed through - :attr:`MultiIndex.names` (which was also possible previously). This is done in order to - make :attr:`MultiIndex.levels` more similar to :attr:`CategoricalIndex.categories` (:issue:`27242`:). +As part of a larger refactor to :class:`MultiIndex` the level names are now +stored separately from the levels (:issue:`27242`). We recommend using +:attr:`MultiIndex.names` to access the names, and :meth:`Index.set_names` +to update the names. -*pandas 0.25.x* +For backwards compatibility, you can still *access* the names via the levels. -.. code-block:: ipython +.. 
ipython:: python - In [1]: mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) - Out[2]: mi - MultiIndex([(1, 'a'), - (1, 'b'), - (2, 'a'), - (2, 'b')], - names=['x', 'y']) - Out[3]: mi.levels[0].name - 'x' + mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) + mi.levels[0].name -*pandas 1.0.0* +However, it is no longer possible to *update* the names of the ``MultiIndex`` +via the level. .. ipython:: python + :okexcept: - mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) - mi.levels[0].name + mi.levels[0].name = "new name" + mi.names -- :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) +To update, use ``MultiIndex.set_names``, which returns a new ``MultiIndex``. + +.. ipython:: python + + mi2 = mi.set_names("new name", level=0) + mi2.names + +New repr for :class:`~pandas.arrays.IntervalArray` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :class:`pandas.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) *pandas 0.25.x* @@ -169,19 +295,365 @@ Backwards incompatible API changes closed='right', dtype='interval[int64]') - *pandas 1.0.0* .. ipython:: python pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) -.. _whatsnew_1000.api.other: +``DataFrame.rename`` now only accepts one positional argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :meth:`DataFrame.rename` would previously accept positional arguments that would lead + to ambiguous or undefined behavior. From pandas 1.0, only the very first argument, which + maps labels to their new names along the default axis, is allowed to be passed by position + (:issue:`29136`). + +*pandas 0.25.x* + +.. code-block:: ipython + + In [1]: df = pd.DataFrame([[1]]) + In [2]: df.rename({0: 1}, {0: 2}) + FutureWarning: ...Use named arguments to resolve ambiguity... + Out[2]: + 2 + 1 1 + +*pandas 1.0.0* + +.. ipython:: python + :okexcept: + + df.rename({0: 1}, {0: 2}) + +Note that errors will now be raised when conflicting or potentially ambiguous arguments are provided. + +*pandas 0.25.x* + +.. code-block:: ipython + + In [1]: df.rename({0: 1}, index={0: 2}) + Out[1]: + 0 + 1 1 + + In [2]: df.rename(mapper={0: 1}, index={0: 2}) + Out[2]: + 0 + 2 1 + +*pandas 1.0.0* + +.. ipython:: python + :okexcept: + + df.rename({0: 1}, index={0: 2}) + df.rename(mapper={0: 1}, index={0: 2}) + +You can still change the axis along which the first positional argument is applied by +supplying the ``axis`` keyword argument. + +.. ipython:: python + + df.rename({0: 1}) + df.rename({0: 1}, axis=1) + +If you would like to update both the index and column labels, be sure to use the respective +keywords. + +.. ipython:: python + + df.rename(index={0: 1}, columns={0: 2}) + +Extended verbose info output for :class:`~pandas.DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :meth:`DataFrame.info` now shows line numbers for the columns summary (:issue:`17304`) + +*pandas 0.25.x* + +.. code-block:: python + + >>> df = pd.DataFrame({"int_col": [1, 2, 3], + ... "text_col": ["a", "b", "c"], + ... "float_col": [0.0, 0.1, 0.2]}) + >>> df.info(verbose=True) + + RangeIndex: 3 entries, 0 to 2 + Data columns (total 3 columns): + int_col 3 non-null int64 + text_col 3 non-null object + float_col 3 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 152.0+ bytes + +*pandas 1.0.0* + +.. 
ipython:: python + + df = pd.DataFrame({"int_col": [1, 2, 3], + "text_col": ["a", "b", "c"], + "float_col": [0.0, 0.1, 0.2]}) + df.info(verbose=True) + +:meth:`pandas.array` inference changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`pandas.array` now infers pandas' new extension types in several cases (:issue:`29791`): + +1. String data (including missing values) now returns a :class:`arrays.StringArray`. +2. Integer data (including missing values) now returns a :class:`arrays.IntegerArray`. +3. Boolean data (including missing values) now returns the new :class:`arrays.BooleanArray` + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.array(["a", None]) + + ['a', None] + Length: 2, dtype: object + + >>> pd.array([1, None]) + + [1, None] + Length: 2, dtype: object + + +*pandas 1.0.0* + +.. ipython:: python + + pd.array(["a", None]) + pd.array([1, None]) + +As a reminder, you can specify the ``dtype`` to disable all inference. + +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` rather than +:attr:`numpy.nan` as its missing value marker (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a[2] + nan + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a + a[2] + +This has a few API-breaking consequences. + +**Converting to a NumPy ndarray** + +When converting to a NumPy array missing values will be ``pd.NA``, which cannot +be converted to a float. So calling ``np.asarray(integer_array, dtype="float")`` +will now raise. + +*pandas 0.25.x* + +.. code-block:: python + + >>> np.asarray(a, dtype="float") + array([ 1., 2., nan]) + +*pandas 1.0.0* + +.. ipython:: python + :okexcept: + + np.asarray(a, dtype="float") + +Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead. + +.. ipython:: python + + a.to_numpy(dtype="float", na_value=np.nan) + +**value_counts returns a nullable integer dtype** + +:meth:`Series.value_counts` with a nullable integer dtype now returns a nullable +integer dtype for the values. + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + dtype('int64') + +*pandas 1.0.0* + +.. ipython:: python + + pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + +See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA` +and :attr:`numpy.nan`. + +:class:`arrays.IntegerArray` comparisons return :class:`arrays.BooleanArray` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Comparison operations on a :class:`arrays.IntegerArray` now returns a +:class:`arrays.BooleanArray` rather than a NumPy array (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a > 1 + array([False, True, False]) + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a > 1 + +Note that missing values now propagate, rather than always comparing unequal +like :attr:`numpy.nan`. See :ref:`missing_data.NA` for more. 
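To show how such a propagating mask behaves once it is used for filtering, a small sketch (the data is illustrative; it assumes the usual ``Series.fillna`` API on a boolean-dtype mask):

.. code-block:: python

   import pandas as pd

   s = pd.Series([1, 2, None], dtype="Int64")
   mask = s > 1               # BooleanDtype mask, <NA> where s is missing
   mask

   # resolve the <NA> entries before boolean indexing,
   # here treating "unknown" as "do not select"
   s[mask.fillna(False)]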
+
+By default :meth:`Categorical.min` now returns the minimum instead of np.nan
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When :class:`Categorical` contains ``np.nan``,
+:meth:`Categorical.min` no longer returns ``np.nan`` by default (skipna=True) (:issue:`25303`)
+
+*pandas 0.25.x*
+
+.. code-block:: ipython
+
+   In [1]: pd.Categorical([1, 2, np.nan], ordered=True).min()
+   Out[1]: nan
+
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   pd.Categorical([1, 2, np.nan], ordered=True).min()
+
+
+Default dtype of empty :class:`pandas.Series`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Initialising an empty :class:`pandas.Series` without specifying a dtype now raises a ``DeprecationWarning``
+(:issue:`17261`). The default dtype will change from ``float64`` to ``object`` in future releases so that it is
+consistent with the behaviour of :class:`DataFrame` and :class:`Index`.
+
+*pandas 1.0.0*
+
+.. code-block:: ipython
+
+   In [1]: pd.Series()
+   Out[1]:
+   DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
+   Series([], dtype: float64)
+
+.. _whatsnew_100.api_breaking.python:
+
+Increased minimum version for Python
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Pandas 1.0.0 supports Python 3.6.1 and higher (:issue:`29212`).
+
+.. _whatsnew_100.api_breaking.deps:
+
+Increased minimum versions for dependencies
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Some minimum supported versions of dependencies were updated (:issue:`29766`, :issue:`29723`).
+If installed, we now require:
+
++-----------------+-----------------+----------+---------+
+| Package         | Minimum Version | Required | Changed |
++=================+=================+==========+=========+
+| numpy           | 1.13.3          |    X     |         |
++-----------------+-----------------+----------+---------+
+| pytz            | 2015.4          |    X     |         |
++-----------------+-----------------+----------+---------+
+| python-dateutil | 2.6.1           |    X     |         |
++-----------------+-----------------+----------+---------+
+| bottleneck      | 1.2.1           |          |         |
++-----------------+-----------------+----------+---------+
+| numexpr         | 2.6.2           |          |         |
++-----------------+-----------------+----------+---------+
+| pytest (dev)    | 4.0.2           |          |         |
++-----------------+-----------------+----------+---------+
+
+For optional libraries the general recommendation is to use the latest version.
+The following table lists the lowest version per library that is currently being tested throughout the development of pandas.
+Optional libraries below the lowest tested version may still work, but are not considered supported.
+
++-----------------+-----------------+---------+
+| Package         | Minimum Version | Changed |
++=================+=================+=========+
+| beautifulsoup4  | 4.6.0           |         |
++-----------------+-----------------+---------+
+| fastparquet     | 0.3.2           |    X    |
++-----------------+-----------------+---------+
+| gcsfs           | 0.2.2           |         |
++-----------------+-----------------+---------+
+| lxml            | 3.8.0           |         |
++-----------------+-----------------+---------+
+| matplotlib      | 2.2.2           |         |
++-----------------+-----------------+---------+
+| numba           | 0.46.0          |    X    |
++-----------------+-----------------+---------+
+| openpyxl        | 2.5.7           |    X    |
++-----------------+-----------------+---------+
+| pyarrow         | 0.13.0          |    X    |
++-----------------+-----------------+---------+
+| pymysql         | 0.7.1           |         |
++-----------------+-----------------+---------+
+| pytables        | 3.4.2           |         |
++-----------------+-----------------+---------+
+| s3fs            | 0.3.0           |    X    |
++-----------------+-----------------+---------+
+| scipy           | 0.19.0          |         |
++-----------------+-----------------+---------+
+| sqlalchemy      | 1.1.4           |         |
++-----------------+-----------------+---------+
+| xarray          | 0.8.2           |         |
++-----------------+-----------------+---------+
+| xlrd            | 1.1.0           |         |
++-----------------+-----------------+---------+
+| xlsxwriter      | 0.9.8           |         |
++-----------------+-----------------+---------+
+| xlwt            | 1.2.0           |         |
++-----------------+-----------------+---------+
+
+See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
+
+.. _whatsnew_100.api.other:

Other API changes
^^^^^^^^^^^^^^^^^

-- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`)
+- Bumped the minimum supported version of ``s3fs`` from 0.0.8 to 0.3.0 (:issue:`28616`)
+- :class:`core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`)
- :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`)
- :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`)
- In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``).
@@ -190,49 +662,103 @@ Other API changes
- Changed the default configuration value for ``options.matplotlib.register_converters`` from ``True`` to ``"auto"`` (:issue:`18720`).
  Now, pandas custom formatters will only be applied to plots created by pandas, through :meth:`~DataFrame.plot`.
  Previously, pandas' formatters would be applied to all plots created *after* a :meth:`~DataFrame.plot`.
-  See :ref:`units registration <whatsnew_1000.matplotlib_units>` for more.
--
+  See :ref:`units registration <whatsnew_100.matplotlib_units>` for more.
+- :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter.
+  Supplying anything other than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`)
+- When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`)
+- :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`).
+- Added ``<NA>`` to the list of default NA values for :meth:`read_csv` (:issue:`30821`)

-.. _whatsnew_1000.api.documentation:
+.. _whatsnew_100.api.documentation:

Documentation Improvements
^^^^^^^^^^^^^^^^^^^^^^^^^^

- Added new section on :ref:`scale` (:issue:`28315`).
-- Added sub-section Query MultiIndex in IO tools user guide (:issue:`28791`)
+- Added sub-section on :ref:`io.query_multi` for HDF5 datasets (:issue:`28791`).

-.. _whatsnew_1000.deprecations:
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_100.deprecations:

Deprecations
~~~~~~~~~~~~

+- :meth:`Series.item` and :meth:`Index.item` have been un-deprecated (:issue:`29250`)
- ``Index.set_value`` has been deprecated. For a given index ``idx``, array ``arr``, value in ``idx`` of ``idx_val`` and a new value of ``val``, ``idx.set_value(arr, idx_val, val)`` is equivalent to ``arr[idx.get_loc(idx_val)] = val``, which should be used instead (:issue:`28621`).
--
+- :func:`is_extension_type` is deprecated; :func:`is_extension_array_dtype` should be used instead (:issue:`29457`)
+- :func:`eval` keyword argument "truediv" is deprecated and will be removed in a future version (:issue:`29812`)
+- :meth:`DateOffset.isAnchored` and :meth:`DateOffset.onOffset` are deprecated and will be removed in a future version, use :meth:`DateOffset.is_anchored` and :meth:`DateOffset.is_on_offset` instead (:issue:`30340`)
+- ``pandas.tseries.frequencies.get_offset`` is deprecated and will be removed in a future version, use ``pandas.tseries.frequencies.to_offset`` instead (:issue:`4205`)
+- :meth:`Categorical.take_nd` and :meth:`CategoricalIndex.take_nd` are deprecated, use :meth:`Categorical.take` and :meth:`CategoricalIndex.take` instead (:issue:`27745`)
+- The parameter ``numeric_only`` of :meth:`Categorical.min` and :meth:`Categorical.max` is deprecated and replaced with ``skipna`` (:issue:`25303`)
+- The parameter ``label`` in :func:`lreshape` has been deprecated and will be removed in a future version (:issue:`29742`)
+- ``pandas.core.index`` has been deprecated and will be removed in a future version, the public classes are available in the top-level namespace (:issue:`19711`)
+- :func:`pandas.json_normalize` is now exposed in the top-level namespace.
+  Usage of ``json_normalize`` as ``pandas.io.json.json_normalize`` is now deprecated and
+  it is recommended to use ``json_normalize`` as :func:`pandas.json_normalize` instead (:issue:`27586`).
+- The ``numpy`` argument of :meth:`pandas.read_json` is deprecated (:issue:`28512`).
+- :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_feather`, and :meth:`DataFrame.to_parquet` argument "fname" is deprecated, use "path" instead (:issue:`23574`)
+- The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`)
+- The ``pandas.util.testing`` module has been deprecated. Use the public API in ``pandas.testing`` documented at :ref:`api.general.testing` (:issue:`16232`).
+- ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`)
+- The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`)
+- Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`)
+
+**Selecting Columns from a Grouped DataFrame**
+
+When selecting columns from a :class:`DataFrameGroupBy` object, passing individual keys (or a tuple of keys) inside single brackets is deprecated;
+a list of items should be used instead (:issue:`23566`). For example:
+
+.. 
code-block:: ipython + + df = pd.DataFrame({ + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": np.random.randn(8), + "C": np.random.randn(8), + }) + g = df.groupby('A') + + # single key, returns SeriesGroupBy + g['B'] + + # tuple of single key, returns SeriesGroupBy + g[('B',)] + + # tuple of multiple keys, returns DataFrameGroupBy, raises FutureWarning + g[('B', 'C')] -.. _whatsnew_1000.prior_deprecations: + # multiple keys passed directly, returns DataFrameGroupBy, raises FutureWarning + # (implicitly converts the passed strings into a single tuple) + g['B', 'C'] + # proper way, returns DataFrameGroupBy + g[['B', 'C']] -Removed SparseSeries and SparseDataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.prior_deprecations: + +Removal of prior version deprecations/changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Removed SparseSeries and SparseDataFrame** ``SparseSeries``, ``SparseDataFrame`` and the ``DataFrame.to_sparse`` method have been removed (:issue:`28425`). We recommend using a ``Series`` or ``DataFrame`` with sparse values instead. See :ref:`sparse.migration` for help with migrating existing code. -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _whatsnew_1000.matplotlib_units: +.. _whatsnew_100.matplotlib_units: **Matplotlib unit registration** Previously, pandas would register converters with matplotlib as a side effect of importing pandas (:issue:`18720`). This changed the output of plots made via matplotlib plots after pandas was imported, even if you were using -matplotlib directly rather than rather than :meth:`~DataFrame.plot`. +matplotlib directly rather than :meth:`~DataFrame.plot`. To use pandas formatters with a matplotlib plot, specify @@ -247,31 +773,142 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. **Other removals** -- Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`) -- Changed the the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. 
It now defaults to False (:issue:`27600`)
+- Removed the previously deprecated keyword "index" from :func:`read_stata`, :class:`StataReader`, and :meth:`StataReader.read`, use "index_col" instead (:issue:`17328`)
+- Removed ``StataReader.data`` method, use :meth:`StataReader.read` instead (:issue:`9493`)
+- Removed ``pandas.plotting._matplotlib.tsplot``, use :meth:`Series.plot` instead (:issue:`19980`)
+- ``pandas.tseries.converter.register`` has been moved to :func:`pandas.plotting.register_matplotlib_converters` (:issue:`18307`)
+- :meth:`Series.plot` no longer accepts positional arguments, pass keyword arguments instead (:issue:`30003`)
+- :meth:`DataFrame.hist` and :meth:`Series.hist` no longer allow ``figsize="default"``, specify figure size by passing a tuple instead (:issue:`30003`)
+- Floordiv of integer-dtyped array by :class:`Timedelta` now raises ``TypeError`` (:issue:`21036`)
+- :class:`TimedeltaIndex` and :class:`DatetimeIndex` no longer accept non-nanosecond dtype strings like "timedelta64" or "datetime64", use "timedelta64[ns]" and "datetime64[ns]" instead (:issue:`24806`)
+- Changed the default "skipna" argument in :func:`pandas.api.types.infer_dtype` from ``False`` to ``True`` (:issue:`24050`)
+- Removed ``Series.ix`` and ``DataFrame.ix`` (:issue:`26438`)
+- Removed ``Index.summary`` (:issue:`18217`)
+- Removed the previously deprecated keyword "fastpath" from the :class:`Index` constructor (:issue:`23110`)
+- Removed ``Series.get_value``, ``Series.set_value``, ``DataFrame.get_value``, ``DataFrame.set_value`` (:issue:`17739`)
+- Removed ``Series.compound`` and ``DataFrame.compound`` (:issue:`26405`)
+- Changed the default "inplace" argument in :meth:`DataFrame.set_index` and :meth:`Series.set_axis` from ``None`` to ``False`` (:issue:`27600`)
+- Removed ``Series.cat.categorical``, ``Series.cat.index``, ``Series.cat.name`` (:issue:`24751`)
+- Removed the previously deprecated keyword "box" from :func:`to_datetime` and :func:`to_timedelta`; in addition these now always return :class:`DatetimeIndex`, :class:`TimedeltaIndex`, :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`24486`)
+- :func:`to_timedelta`, :class:`Timedelta`, and :class:`TimedeltaIndex` no longer allow "M", "y", or "Y" for the "unit" argument (:issue:`23264`)
+- Removed the previously deprecated keyword "time_rule" from (non-public) ``offsets.generate_range``, which has been moved to :func:`core.arrays._ranges.generate_range` (:issue:`24157`)
+- :meth:`DataFrame.loc` or :meth:`Series.loc` with listlike indexers and missing labels will no longer reindex (:issue:`17295`)
+- :meth:`DataFrame.to_excel` and :meth:`Series.to_excel` with non-existent columns will no longer reindex (:issue:`17295`)
+- Removed the previously deprecated keyword "join_axes" from :func:`concat`; use ``reindex_like`` on the result instead (:issue:`22318`)
+- Removed the previously deprecated keyword "by" from :meth:`DataFrame.sort_index`, use :meth:`DataFrame.sort_values` instead (:issue:`10726`)
+- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`core.groupby.DataFrameGroupBy.aggregate`, :meth:`core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`18529`)
+- Passing ``datetime64`` data to :class:`TimedeltaIndex` or ``timedelta64`` data to ``DatetimeIndex`` now raises ``TypeError`` (:issue:`23539`, :issue:`23937`)
+- Passing ``int64`` values to :class:`DatetimeIndex` and a timezone now interprets the values as nanosecond timestamps
in UTC, not wall times in the given timezone (:issue:`24559`) +- A tuple passed to :meth:`DataFrame.groupby` is now exclusively treated as a single key (:issue:`18314`) +- Removed ``Index.contains``, use ``key in index`` instead (:issue:`30103`) +- Addition and subtraction of ``int`` or integer-arrays is no longer allowed in :class:`Timestamp`, :class:`DatetimeIndex`, :class:`TimedeltaIndex`, use ``obj + n * obj.freq`` instead of ``obj + n`` (:issue:`22535`) +- Removed ``Series.ptp`` (:issue:`21614`) +- Removed ``Series.from_array`` (:issue:`18258`) +- Removed ``DataFrame.from_items`` (:issue:`18458`) +- Removed ``DataFrame.as_matrix``, ``Series.as_matrix`` (:issue:`18458`) +- Removed ``Series.asobject`` (:issue:`18477`) +- Removed ``DataFrame.as_blocks``, ``Series.as_blocks``, ``DataFrame.blocks``, ``Series.blocks`` (:issue:`17656`) - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) -- Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) -- Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) +- :meth:`Series.where` with ``Categorical`` dtype (or :meth:`DataFrame.where` with ``Categorical`` column) no longer allows setting new categories (:issue:`24114`) +- Removed the previously deprecated keywords "start", "end", and "periods" from the :class:`DatetimeIndex`, :class:`TimedeltaIndex`, and :class:`PeriodIndex` constructors; use :func:`date_range`, :func:`timedelta_range`, and :func:`period_range` instead (:issue:`23919`) +- Removed the previously deprecated keyword "verify_integrity" from the :class:`DatetimeIndex` and :class:`TimedeltaIndex` constructors (:issue:`23919`) +- Removed the previously deprecated keyword "fastpath" from ``pandas.core.internals.blocks.make_block`` (:issue:`19265`) +- Removed the previously deprecated keyword "dtype" from :meth:`Block.make_block_same_class` (:issue:`19434`) +- Removed ``ExtensionArray._formatting_values``. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) +- Removed ``MultiIndex.to_hierarchical`` (:issue:`21613`) +- Removed ``MultiIndex.labels``, use :attr:`MultiIndex.codes` instead (:issue:`23752`) +- Removed the previously deprecated keyword "labels" from the :class:`MultiIndex` constructor, use "codes" instead (:issue:`23752`) +- Removed ``MultiIndex.set_labels``, use :meth:`MultiIndex.set_codes` instead (:issue:`23752`) +- Removed the previously deprecated keyword "labels" from :meth:`MultiIndex.set_codes`, :meth:`MultiIndex.copy`, :meth:`MultiIndex.drop`, use "codes" instead (:issue:`23752`) +- Removed support for legacy HDF5 formats (:issue:`29787`) +- Passing a dtype alias (e.g. 
'datetime64[ns, UTC]') to :class:`DatetimeTZDtype` is no longer allowed, use :meth:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`) +- Removed the previously deprecated keyword "skip_footer" from :func:`read_excel`; use "skipfooter" instead (:issue:`18836`) +- :func:`read_excel` no longer allows an integer value for the parameter ``usecols``, instead pass a list of integers from 0 to ``usecols`` inclusive (:issue:`23635`) +- Removed the previously deprecated keyword "convert_datetime64" from :meth:`DataFrame.to_records` (:issue:`18902`) +- Removed ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) +- Changed the default "keep_tz" argument in :meth:`DatetimeIndex.to_series` from ``None`` to ``True`` (:issue:`23739`) +- Removed ``api.types.is_period`` and ``api.types.is_datetimetz`` (:issue:`23917`) - Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`) -- Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`) -- Removed the previously deprecated ``assert_raises_regex`` function in ``pandas.util.testing`` (:issue:`29174`) +- Removed ``pandas.tseries.plotting.tsplot`` (:issue:`18627`) +- Removed the previously deprecated keywords "reduce" and "broadcast" from :meth:`DataFrame.apply` (:issue:`18577`) +- Removed the previously deprecated ``assert_raises_regex`` function in ``pandas._testing`` (:issue:`29174`) +- Removed the previously deprecated ``FrozenNDArray`` class in ``pandas.core.indexes.frozen`` (:issue:`29335`) +- Removed the previously deprecated keyword "nthreads" from :func:`read_feather`, use "use_threads" instead (:issue:`23053`) +- Removed ``Index.is_lexsorted_for_tuple`` (:issue:`29305`) +- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`core.groupby.DataFrameGroupBy.aggregate`, :meth:`core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`29608`) +- Removed ``Series.valid``; use :meth:`Series.dropna` instead (:issue:`18800`) +- Removed ``DataFrame.is_copy``, ``Series.is_copy`` (:issue:`18812`) +- Removed ``DataFrame.get_ftype_counts``, ``Series.get_ftype_counts`` (:issue:`18243`) +- Removed ``DataFrame.ftypes``, ``Series.ftypes``, ``Series.ftype`` (:issue:`26744`) +- Removed ``Index.get_duplicates``, use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`) +- Removed ``Series.clip_upper``, ``Series.clip_lower``, ``DataFrame.clip_upper``, ``DataFrame.clip_lower`` (:issue:`24203`) +- Removed the ability to alter :attr:`DatetimeIndex.freq`, :attr:`TimedeltaIndex.freq`, or :attr:`PeriodIndex.freq` (:issue:`20772`) +- Removed ``DatetimeIndex.offset`` (:issue:`20730`) +- Removed ``DatetimeIndex.asobject``, ``TimedeltaIndex.asobject``, ``PeriodIndex.asobject``, use ``astype(object)`` instead (:issue:`29801`) +- Removed the previously deprecated keyword "order" from :func:`factorize` (:issue:`19751`) +- Removed the previously deprecated keyword "encoding" from :func:`read_stata` and :meth:`DataFrame.to_stata` (:issue:`21400`) +- Changed the default "sort" argument in :func:`concat` from ``None`` to ``False`` (:issue:`20613`) +- Removed the previously deprecated keyword "raise_conflict" from :meth:`DataFrame.update`, use "errors" instead (:issue:`23585`) +- Removed the previously deprecated keyword "n" from :meth:`DatetimeIndex.shift`, :meth:`TimedeltaIndex.shift`, 
:meth:`PeriodIndex.shift`, use "periods" instead (:issue:`22458`)
+- Removed the previously deprecated keywords "how", "fill_method", and "limit" from :meth:`DataFrame.resample` (:issue:`30139`)
+- Passing an integer to :meth:`Series.fillna` or :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype now raises ``TypeError`` (:issue:`24694`)
+- Passing multiple axes to :meth:`DataFrame.dropna` is no longer supported (:issue:`20995`)
+- Removed ``Series.nonzero``, use ``to_numpy().nonzero()`` instead (:issue:`24048`)
+- Passing floating dtype ``codes`` to :meth:`Categorical.from_codes` is no longer supported, pass ``codes.astype(np.int64)`` instead (:issue:`21775`)
+- Removed the previously deprecated keyword "pat" from :meth:`Series.str.partition` and :meth:`Series.str.rpartition`, use "sep" instead (:issue:`23767`)
+- Removed ``Series.put`` (:issue:`27106`)
+- Removed ``Series.real``, ``Series.imag`` (:issue:`27106`)
+- Removed ``Series.to_dense``, ``DataFrame.to_dense`` (:issue:`26684`)
+- Removed ``Index.dtype_str``, use ``str(index.dtype)`` instead (:issue:`27106`)
+- :meth:`Categorical.ravel` returns a :class:`Categorical` instead of a ``ndarray`` (:issue:`27199`)
+- The 'outer' method on NumPy ufuncs, e.g. ``np.subtract.outer``, operating on :class:`Series` objects is no longer supported, and will raise ``NotImplementedError`` (:issue:`27198`)
+- Removed ``Series.get_dtype_counts`` and ``DataFrame.get_dtype_counts`` (:issue:`27145`)
+- Changed the default "fill_value" argument in :meth:`Categorical.take` from ``True`` to ``False`` (:issue:`20841`)
+- Changed the default value for the ``raw`` argument in ``Series.rolling().apply()``, ``DataFrame.rolling().apply()``, ``Series.expanding().apply()``, and ``DataFrame.expanding().apply()`` from ``None`` to ``False`` (:issue:`20584`)
+- Removed deprecated behavior of :meth:`Series.argmin` and :meth:`Series.argmax`, use :meth:`Series.idxmin` and :meth:`Series.idxmax` for the old behavior (:issue:`16955`)
+- Passing a tz-aware ``datetime.datetime`` or :class:`Timestamp` into the :class:`Timestamp` constructor with the ``tz`` argument now raises a ``ValueError`` (:issue:`23621`)
+- Removed ``Series.base``, ``Index.base``, ``Categorical.base``, ``Series.flags``, ``Index.flags``, ``PeriodArray.flags``, ``Series.strides``, ``Index.strides``, ``Series.itemsize``, ``Index.itemsize``, ``Series.data``, ``Index.data`` (:issue:`20721`)
+- Changed :meth:`Timedelta.resolution` to match the behavior of the standard library ``datetime.timedelta.resolution``; for the old behavior, use :meth:`Timedelta.resolution_string` (:issue:`26839`)
+- Removed ``Timestamp.weekday_name``, ``DatetimeIndex.weekday_name``, and ``Series.dt.weekday_name`` (:issue:`18164`)
+- Removed the previously deprecated keyword "errors" in :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` (:issue:`22644`)
+- Changed the default "ordered" argument in :class:`CategoricalDtype` from ``None`` to ``False`` (:issue:`26336`)
+- :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` now require "labels" as the first argument and "axis" as an optional named parameter (:issue:`30089`)
+- Removed ``to_msgpack``, ``read_msgpack``, ``DataFrame.to_msgpack``, ``Series.to_msgpack`` (:issue:`27103`)
+- Removed ``Series.compress`` (:issue:`21930`)
+- Removed the previously deprecated keyword "fill_value" from :meth:`Categorical.fillna`, use "value" instead (:issue:`19269`)
+- Removed the previously deprecated keyword "data" from 
:func:`andrews_curves`, use "frame" instead (:issue:`6956`) +- Removed the previously deprecated keyword "data" from :func:`parallel_coordinates`, use "frame" instead (:issue:`6956`) +- Removed the previously deprecated keyword "colors" from :func:`parallel_coordinates`, use "color" instead (:issue:`6956`) +- Removed the previously deprecated keywords "verbose" and "private_key" from :func:`read_gbq` (:issue:`30200`) +- Calling ``np.array`` and ``np.asarray`` on tz-aware :class:`Series` and :class:`DatetimeIndex` will now return an object array of tz-aware :class:`Timestamp` (:issue:`24596`) - -.. _whatsnew_1000.performance: +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.performance: Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :class:`DataFrame` arithmetic and comparison operations with scalars (:issue:`24990`, :issue:`29853`) - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`) -- Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`) +- Performance improvement in :attr:`MultiIndex.is_monotonic` (:issue:`27495`) - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`) +- Performance improvement when initializing a :class:`DataFrame` using a ``range`` (:issue:`30171`) - Performance improvement in :meth:`DataFrame.corr` when ``method`` is ``"spearman"`` (:issue:`28139`) - Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) - Performance improvement in :meth:`DataFrame.select_dtypes` by using vectorization instead of iterating over a loop (:issue:`28317`) - Performance improvement in :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` (:issue:`28795`) +- Performance improvement when comparing a :class:`Categorical` with a scalar and the scalar is not found in the categories (:issue:`29750`) +- Performance improvement when checking if values in a :class:`Categorical` are equal, equal or larger or larger than a given scalar. + The improvement is not present if checking if the :class:`Categorical` is less than or less than or equal than the scalar (:issue:`29820`) +- Performance improvement in :meth:`Index.equals` and :meth:`MultiIndex.equals` (:issue:`29134`) +- Performance improvement in :func:`~pandas.api.types.infer_dtype` when ``skipna`` is ``True`` (:issue:`28814`) + +.. --------------------------------------------------------------------------- -.. _whatsnew_1000.bug_fixes: +.. 
_whatsnew_100.bug_fixes: Bug fixes ~~~~~~~~~ @@ -280,7 +917,7 @@ Bug fixes Categorical ^^^^^^^^^^^ -- Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) +- Added test to assert the :func:`fillna` raises the correct ``ValueError`` message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) - :meth:`DataFrame.reindex` with a :class:`CategoricalIndex` would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`) - Bug in :meth:`Categorical.astype` not allowing for casting to extension dtypes (:issue:`28668`) @@ -288,6 +925,16 @@ Categorical - :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`) - Added test to assert roundtripping to parquet with :func:`DataFrame.to_parquet` or :func:`read_parquet` will preserve Categorical dtypes for string types (:issue:`27955`) - Changed the error message in :meth:`Categorical.remove_categories` to always show the invalid removals as a set (:issue:`28669`) +- Using date accessors on a categorical dtyped :class:`Series` of datetimes was not returning an object of the + same type as if one used the :meth:`.str.` / :meth:`.dt.` on a :class:`Series` of that type. E.g. when accessing :meth:`Series.dt.tz_localize` on a + :class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue:`27952`) +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`) +- Bug where calling :meth:`Categorical.min` or :meth:`Categorical.max` on an empty Categorical would raise a numpy exception (:issue:`30227`) +- The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) + * :meth:`core.groupby.SeriesGroupBy.count` + * :meth:`core.groupby.SeriesGroupBy.size` + * :meth:`core.groupby.SeriesGroupBy.nunique` + * :meth:`core.groupby.SeriesGroupBy.nth` Datetimelike @@ -296,18 +943,35 @@ Datetimelike - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) - Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) - Bug in :func:`to_datetime` where passing arrays of malformed ``str`` with errors="coerce" could incorrectly lead to raising ``ValueError`` (:issue:`28299`) -- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) +- Bug in :meth:`core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) - Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`) - Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`) - Bug in :class:`Series` and :class:`DataFrame` with integer dtype failing to raise ``TypeError`` when adding or subtracting a ``np.datetime64`` object (:issue:`28080`) +- Bug in :meth:`Series.astype`, :meth:`Index.astype`, and :meth:`DataFrame.astype` failing to handle ``NaT`` when casting to an 
integer dtype (:issue:`28492`) - Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`) - Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype `'timedelta64[ns]'` (:issue:`28049`) -- Bug in :func:`pandas.core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) +- Bug in :func:`core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) +- Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`) +- Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`) +- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`) +- Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`) +- Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`) +- Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`) +- Bug in :meth:`Series.item` with ``datetime64`` or ``timedelta64`` dtype, :meth:`DatetimeIndex.item`, and :meth:`TimedeltaIndex.item` returning an integer instead of a :class:`Timestamp` or :class:`Timedelta` (:issue:`30175`) +- Bug in :class:`DatetimeIndex` addition when adding a non-optimized :class:`DateOffset` incorrectly dropping timezone information (:issue:`30336`) +- Bug in :meth:`DataFrame.drop` where attempting to drop non-existent values from a DatetimeIndex would yield a confusing error message (:issue:`30399`) +- Bug in :meth:`DataFrame.append` would remove the timezone-awareness of new data (:issue:`30238`) +- Bug in :meth:`Series.cummin` and :meth:`Series.cummax` with timezone-aware dtype incorrectly dropping its timezone (:issue:`15553`) +- Bug in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` where inplace addition and subtraction did not actually operate inplace (:issue:`24115`) +- Bug in :func:`pandas.to_datetime` when called with ``Series`` storing ``IntegerArray`` raising ``TypeError`` instead of returning ``Series`` (:issue:`30050`) +- Bug in :func:`date_range` with custom business hours as ``freq`` and given number of ``periods`` (:issue:`30593`) +- Bug in :class:`PeriodIndex` comparisons with incorrectly casting integers to :class:`Period` objects, inconsistent with the :class:`Period` comparison behavior (:issue:`30722`) +- Bug in :meth:`DatetimeIndex.insert` raising a ``ValueError`` instead of a ``TypeError`` when trying to insert a timezone-aware :class:`Timestamp` into a timezone-naive :class:`DatetimeIndex`, or vice-versa (:issue:`30806`) Timedelta ^^^^^^^^^ - +- Bug in subtracting a :class:`TimedeltaIndex` or :class:`TimedeltaArray` from a ``np.datetime64`` object (:issue:`29558`) - - @@ -321,10 +985,19 @@ Timezones Numeric ^^^^^^^ - Bug in :meth:`DataFrame.quantile` with zero-column 
:class:`DataFrame` incorrectly raising (:issue:`23925`)
-- :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth: `DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`)
+- :class:`DataFrame` flex inequality comparison methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth:`DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`)
- Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`)
- Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`)
--
+- Bug in :meth:`Series.var` not computing the right value for a nullable integer dtype series because the ``ddof`` argument was not passed through (:issue:`29128`)
+- Improved error message when using ``frac > 1`` and ``replace=False`` (:issue:`27451`)
+- Bug in numeric indexes that made it possible to instantiate an :class:`Int64Index`, :class:`UInt64Index`, or :class:`Float64Index` with an invalid dtype (e.g. datetime-like) (:issue:`29539`)
+- Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`)
+- Bug in :class:`NumericIndex` construction that caused indexing to fail when integers in the ``np.uint64`` range were used (:issue:`28023`)
+- Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`)
+- Bug in :meth:`Series.interpolate` where using ``method='index'`` with an unsorted index would return incorrect results (:issue:`21037`)
+- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
+- Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`)
+- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrectly casting to object-dtype (:issue:`19296`)

Conversion
^^^^^^^^^^
@@ -335,15 +1008,18 @@ Conversion

Strings
^^^^^^^
--
+- Calling :meth:`Series.str.isalnum` (and other "ismethods") on an empty ``Series`` would return an ``object`` dtype instead of ``bool`` (:issue:`29624`)
-

Interval
^^^^^^^^
--
--
+- Bug in :meth:`IntervalIndex.get_indexer` where a :class:`Categorical` or :class:`CategoricalIndex` ``target`` would incorrectly raise a ``TypeError`` (:issue:`30063`)
+- Bug in ``pandas.core.dtypes.cast.infer_dtype_from_scalar`` where passing ``pandas_dtype=True`` did not infer :class:`IntervalDtype` (:issue:`30337`)
+- Bug in :class:`Series` constructor where constructing a ``Series`` from a ``list`` of :class:`Interval` objects resulted in ``object`` dtype instead of :class:`IntervalDtype` (:issue:`23563`)
+- Bug in :class:`IntervalDtype` where the ``kind`` attribute was incorrectly set as ``None`` instead of ``"O"`` (:issue:`30568`)
+- Bug in :class:`IntervalIndex`, :class:`~arrays.IntervalArray`, and :class:`Series` with interval data where equality comparisons were incorrect (:issue:`24112`)

Indexing
^^^^^^^^
@@ -354,8 +1030,12 @@ Indexing
- Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`)
- Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`)
- :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`)
-- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`)
--
+- Bug where indexing with ``.loc`` did not work when the index was a :class:`CategoricalIndex` with non-string categories (:issue:`17569`, :issue:`30225`)
+- :meth:`Index.get_indexer_non_unique` could fail with ``TypeError`` in some cases, such as when searching for ints in a string index (:issue:`28257`)
+- Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`)
+- :meth:`MultiIndex.get_loc` could not find missing values when the input included missing values (:issue:`19132`)
+- Bug in :meth:`Series.__setitem__` incorrectly assigning values with a boolean indexer when the length of the new data matches the number of ``True`` values and the new data is not a ``Series`` or an ``np.array`` (:issue:`30567`)
+- Bug in indexing with a :class:`PeriodIndex` incorrectly accepting integers representing years, use e.g. ``ser.loc["2007"]`` instead of ``ser.loc[2007]`` (:issue:`30763`)

Missing
^^^^^^^
@@ -366,8 +1046,8 @@ Missing

MultiIndex
^^^^^^^^^^

-- Constructior for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`)
--
+- Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`)
+- Series and MultiIndex ``.drop`` with ``MultiIndex`` now raise an exception if the labels are not in the given level (:issue:`8594`)
-

I/O
@@ -386,54 +1066,86 @@ I/O
- Bug in :meth:`DataFrame.read_excel` with ``engine='ods'`` when ``sheet_name`` argument references a non-existent sheet (:issue:`27676`)
- Bug in :meth:`pandas.io.formats.style.Styler` formatting for floating values not displaying decimals correctly (:issue:`13257`)
- Bug in :meth:`DataFrame.to_html` when using ``formatters=`` and ``max_cols`` together. 
(:issue:`25955`) +- Bug in :meth:`Styler.background_gradient` not able to work with dtype ``Int64`` (:issue:`28869`) +- Bug in :meth:`DataFrame.to_clipboard` which did not work reliably in ipython (:issue:`22707`) +- Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) +- Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) +- :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`) +- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`) +- :func:`read_excel` now accepts binary data (:issue:`15914`) +- Bug in :meth:`read_csv` in which encoding handling was limited to just the string `utf-16` for the C engine (:issue:`24130`) Plotting ^^^^^^^^ - Bug in :meth:`Series.plot` not able to plot boolean values (:issue:`23719`) -- - Bug in :meth:`DataFrame.plot` not able to plot when no rows (:issue:`27758`) - Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`) - Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`) - Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` produce wrong xlim in x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`) -- Bug where :meth:`DataFrame.boxplot` would not accept a `color` parameter like `DataFrame.plot.box` (:issue:`26214`) +- Bug where :meth:`DataFrame.boxplot` would not accept a ``color`` parameter like :meth:`DataFrame.plot.box` (:issue:`26214`) - Bug in the ``xticks`` argument being ignored for :meth:`DataFrame.plot.bar` (:issue:`14119`) - :func:`set_option` now validates that the plot backend provided to ``'plotting.backend'`` implements the backend when the option is set, rather than when a plot is created (:issue:`28163`) +- :meth:`DataFrame.plot` now allow a ``backend`` keyword argument to allow changing between backends in one session (:issue:`28619`). +- Bug in color validation incorrectly raising for non-color styles (:issue:`29122`). +- Allow :meth:`DataFrame.plot.scatter` to plot ``objects`` and ``datetime`` type data (:issue:`18755`, :issue:`30391`) +- Bug in :meth:`DataFrame.hist`, ``xrot=0`` does not work with ``by`` and subplots (:issue:`30288`). Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`) - Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`) -- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) -- Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue: `15584`). -- Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue: `19248`). 
+- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty :class:`Series` or :class:`DataFrame` (:issue:`28427`)
+- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`)
+- Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue:`15584`).
+- Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue:`19248`).
- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
-- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
+- Bug in :meth:`core.groupby.DataFrameGroupBy.agg` not able to use lambda function with named aggregation (:issue:`27519`)
- Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`)
+- Removed the error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby`. Previously an error was raised if the same function was applied to the same column; now this is allowed if the newly assigned names are different (:issue:`28426`)
+- :meth:`core.groupby.SeriesGroupBy.value_counts` can now handle the case when the :class:`Grouper` produces empty groups (:issue:`28479`)
+- Bug in :meth:`core.window.rolling.Rolling.quantile` ignoring ``interpolation`` keyword argument when used within a groupby (:issue:`28779`)
+- Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`)
+- Bug in :meth:`core.groupby.DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`)
+- Bug in :meth:`DataFrame.groupby` when using ``axis=1`` and having a single-level column index (:issue:`30208`)
+- Bug in :meth:`DataFrame.groupby` when using ``nunique`` on ``axis=1`` (:issue:`30253`)
+- Bug in :meth:`GroupBy.quantile` with multiple list-like ``q`` values and integer column names (:issue:`30289`)
+- Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causing ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`)

Reshaping
^^^^^^^^^

- Bug in :meth:`DataFrame.apply` that caused incorrect output with empty :class:`DataFrame` (:issue:`28202`, :issue:`21959`)
-- Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue: `28301`)
+- Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue:`28301`)
+- Bug in :meth:`pivot_table` not returning correct type ``float`` when ``margins=True`` and ``aggfunc='mean'`` (:issue:`24893`)
- Bug :func:`merge_asof` could not use :class:`datetime.timedelta` for ``tolerance`` kwarg (:issue:`28098`)
- Bug in :func:`merge`, did not append suffixes correctly with MultiIndex (:issue:`28518`)
- :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`)
- Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`).
- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`)
+- Bug in :meth:`Index.join` that caused infinite recursion error for mismatched ``MultiIndex`` name orders. (:issue:`25760`, :issue:`28956`)
+- Bug in :meth:`Series.pct_change` where supplying an anchored frequency would throw a ``ValueError`` (:issue:`28664`)
+- Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`)
+- Bug in :meth:`DataFrame.replace` that caused a non-numeric replacer's dtype to not be respected (:issue:`26632`)
+- Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`)
+- Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`)
+- Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`)
+- Improved error message and docstring in :func:`cut` and :func:`qcut` when ``labels=True`` (:issue:`13318`)
+- Bug in missing ``fill_value`` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`)

Sparse
^^^^^^

- Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`)
--
+- Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`)
-

ExtensionArray
^^^^^^^^^^^^^^

- Bug in :class:`arrays.PandasArray` when setting a scalar string (:issue:`28118`, :issue:`28150`).
--
+- Bug where nullable integers could not be compared to strings (:issue:`28930`)
+- Bug where :class:`DataFrame` constructor raised ``ValueError`` with list-like data and ``dtype`` specified (:issue:`30280`)


Other
@@ -444,11 +1156,24 @@ Other
- Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
- :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
- Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
+- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used to refer to otherwise invalid identifiers, such as names that start with a digit, are Python keywords, or contain single-character operators (:issue:`27017`)
+- Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`)
+- Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
- Fix :class:`AbstractHolidayCalendar` to return correct results for years after 2030 (now goes up to 2200) (:issue:`27790`)
-
-
-.. _whatsnew_1000.contributors:
+- Fixed :class:`~arrays.IntegerArray` returning ``inf`` rather than ``NaN`` for operations dividing by ``0`` (:issue:`27398`)
+- Fixed ``pow`` operations for :class:`~arrays.IntegerArray` when the other value is ``0`` or ``1`` (:issue:`29997`)
+- Bug in :meth:`Series.count` raising if ``use_inf_as_na`` is enabled (:issue:`29478`)
+- Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:`29069`)
+- Bug in :class:`DataFrame` constructor when passing a 2D ``ndarray`` and an extension dtype (:issue:`12513`)
+- Bug in :meth:`DataFrame.to_csv` when supplied a series with a ``dtype="string"`` and a ``na_rep``, the ``na_rep`` was being truncated to 2 characters. 
(:issue:`29975`) +- Bug where :meth:`DataFrame.itertuples` would incorrectly determine whether or not namedtuples could be used for dataframes of 255 columns (:issue:`28282`) +- Handle nested NumPy ``object`` arrays in :func:`testing.assert_series_equal` for ExtensionArray implementations (:issue:`30841`) +- Bug in :class:`Index` constructor incorrectly allowing 2-dimensional input arrays (:issue:`13601`, :issue:`27125`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.contributors: Contributors ~~~~~~~~~~~~ diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index 1a5ab99b5a94f..fdc5a6b283ba8 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -113,13 +113,13 @@ def build_string(revision_range, heading="Contributors"): components["authors"] = "* " + "\n* ".join(components["authors"]) tpl = textwrap.dedent( - """\ - {heading} - {uline} + f"""\ + {components['heading']} + {components['uline']} - {author_message} - {authors}""" - ).format(**components) + {components['author_message']} + {components['authors']}""" + ) return tpl diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 1a064f71792e9..d9ba2bb2cfb07 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -27,7 +27,7 @@ def run(self): except git.GitCommandError as exc: return [ self.state.document.reporter.warning( - "Cannot find contributors for range '{}': {}".format(range_, exc), + f"Cannot find contributors for range {repr(range_)}: {exc}", line=self.lineno, ) ] diff --git a/environment.yml b/environment.yml index 163bd08b93c9e..e244350a0bea0 100644 --- a/environment.yml +++ b/environment.yml @@ -1,11 +1,10 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: # required - numpy>=1.15 - - python=3 + - python=3.7 - python-dateutil>=2.6.1 - pytz @@ -16,13 +15,13 @@ dependencies: - cython>=0.29.13 # code checks - - black + - black=19.10b0 - cpplint - flake8 - - flake8-comprehensions # used by flake8, linting of unnecessary comprehensions + - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - isort # check that imports are in the right order - - mypy + - mypy=0.730 - pycodestyle # used by flake8 # documentation @@ -34,7 +33,8 @@ dependencies: - nbconvert>=5.4.1 - nbsphinx - pandoc - # Dask and its dependencies + + # Dask and its dependencies (that dont install with dask) - dask-core - toolz>=0.7.3 - fsspec>=0.5.1 @@ -52,10 +52,12 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 - - pytest>=4.0.2 + - pytest>=5.0.1 - pytest-cov - - pytest-mock - - pytest-xdist + - pytest-xdist>=1.21 + - pytest-asyncio + + # downstream tests - seaborn - statsmodels @@ -69,29 +71,37 @@ dependencies: - blosc - bottleneck>=1.2.1 - ipykernel - - ipython>=5.6.0 + - ipython>=7.11.1 - jinja2 # pandas.Styler - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 - scipy>=1.1 + - numba>=0.46.0 # optional for io - - beautifulsoup4>=4.6.0 # pandas.read_html - - fastparquet>=0.2.1 # pandas.read_parquet, DataFrame.to_parquet - - html5lib # pandas.read_html - - lxml # pandas.read_html - - openpyxl # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - pyarrow>=0.13.1 # pandas.read_paquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather + # --------------- + # pd.read_html + - beautifulsoup4>=4.6.0 + - html5lib + - 
lxml + + # pd.read_excel, DataFrame.to_excel, pd.ExcelWriter, pd.ExcelFile + - openpyxl<=3.0.1 + - xlrd + - xlsxwriter + - xlwt + - odfpy + + - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet + - pyarrow>=0.13.1 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather + - python-snappy # required by pyarrow + - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf - - python-snappy # required by pyarrow - s3fs # pandas.read_csv... when using 's3://...' path - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray - - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - odfpy # pandas.read_excel - pyreadstat # pandas.read_spss + - tabulate>=0.8.3 # DataFrame.to_markdown - pip: - git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master diff --git a/pandas/__init__.py b/pandas/__init__.py index 5d163e411c0ac..491bcb21f245d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -10,7 +10,7 @@ try: __import__(dependency) except ImportError as e: - missing_dependencies.append("{0}: {1}".format(dependency, str(e))) + missing_dependencies.append(f"{dependency}: {e}") if missing_dependencies: raise ImportError( @@ -24,6 +24,7 @@ _np_version_under1p15, _np_version_under1p16, _np_version_under1p17, + _np_version_under1p18, ) try: @@ -32,14 +33,12 @@ # hack but overkill to use re module = str(e).replace("cannot import name ", "") raise ImportError( - "C extension: {0} not built. If you want to import " + f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " "'python setup.py build_ext --inplace --force' to build " - "the C extensions first.".format(module) + "the C extensions first." ) -from datetime import datetime - from pandas._config import ( get_option, set_option, @@ -67,7 +66,9 @@ IntervalDtype, DatetimeTZDtype, StringDtype, + BooleanDtype, # missing + NA, isna, isnull, notna, @@ -102,7 +103,6 @@ to_datetime, to_timedelta, # misc - np, Grouper, factorize, unique, @@ -115,7 +115,7 @@ DataFrame, ) -from pandas.core.arrays.sparse import SparseArray, SparseDtype +from pandas.core.arrays.sparse import SparseDtype from pandas.tseries.api import infer_freq from pandas.tseries import offsets @@ -138,6 +138,7 @@ qcut, ) +import pandas.api from pandas.util._print_versions import show_versions from pandas.io.api import ( @@ -145,9 +146,6 @@ ExcelFile, ExcelWriter, read_excel, - # packers - read_msgpack, - to_msgpack, # parsers read_csv, read_fwf, @@ -165,6 +163,7 @@ # misc read_clipboard, read_parquet, + read_orc, read_feather, read_gbq, read_html, @@ -174,6 +173,8 @@ read_spss, ) +from pandas.io.json import _json_normalize as json_normalize + from pandas.util._tester import test import pandas.testing import pandas.arrays @@ -186,7 +187,6 @@ __git_version__ = v.get("full-revisionid") del get_versions, v - # GH 27101 # TODO: remove Panel compat in 1.0 if pandas.compat.PY37: @@ -208,18 +208,58 @@ class Panel: pass return Panel + + elif name == "datetime": + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. 
" + "Import from datetime module instead.", + FutureWarning, + stacklevel=2, + ) + + from datetime import datetime as dt + + return dt + + elif name == "np": + + warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. " + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + import numpy as np + + return np + elif name in {"SparseSeries", "SparseDataFrame"}: warnings.warn( - "The {} class is removed from pandas. Accessing it from " + f"The {name} class is removed from pandas. Accessing it from " "the top-level namespace will also be removed in the next " - "version".format(name), + "version", FutureWarning, stacklevel=2, ) return type(name, (), {}) - raise AttributeError("module 'pandas' has no attribute '{}'".format(name)) + elif name == "SparseArray": + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=2, + ) + from pandas.core.arrays.sparse import SparseArray as _SparseArray + + return _SparseArray + + raise AttributeError(f"module 'pandas' has no attribute '{name}'") else: @@ -233,6 +273,96 @@ class SparseDataFrame: class SparseSeries: pass + class __numpy: + def __init__(self): + import numpy as np + import warnings + + self.np = np + self.warnings = warnings + + def __getattr__(self, item): + self.warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. " + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + + try: + return getattr(self.np, item) + except AttributeError: + raise AttributeError(f"module numpy has no attribute {item}") + + np = __numpy() + + class __Datetime(type): + + from datetime import datetime as dt + + datetime = dt + + def __getattr__(cls, item): + cls.emit_warning() + + try: + return getattr(cls.datetime, item) + except AttributeError: + raise AttributeError(f"module datetime has no attribute {item}") + + def __instancecheck__(cls, other): + return isinstance(other, cls.datetime) + + class __DatetimeSub(metaclass=__Datetime): + def emit_warning(dummy=0): + import warnings + + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime instead.", + FutureWarning, + stacklevel=3, + ) + + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from datetime import datetime as dt + + return dt(*args, **kwargs) + + datetime = __DatetimeSub + + class __SparseArray(type): + + from pandas.core.arrays.sparse import SparseArray as sa + + SparseArray = sa + + def __instancecheck__(cls, other): + return isinstance(other, cls.SparseArray) + + class __SparseArraySub(metaclass=__SparseArray): + def emit_warning(dummy=0): + import warnings + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=3, + ) + + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from pandas.core.arrays.sparse import SparseArray as sa + + return sa(*args, **kwargs) + + SparseArray = __SparseArraySub + # module level doc-string __doc__ = """ @@ -272,6 +402,5 @@ class SparseSeries: Excel files, databases, and saving/loading data from the ultrafast HDF5 format. 
- Time series-specific functionality: date range generation and frequency - conversion, moving window statistics, moving window linear regressions, - date shifting and lagging, etc. + conversion, moving window statistics, date shifting and lagging. """ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 890db5b41907e..0a3009f74492f 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -51,23 +51,23 @@ from collections import namedtuple from contextlib import contextmanager import re -from typing import Dict, List +from typing import Any, Dict, Iterable, List import warnings DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver") RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb") -# holds deprecated option metdata -_deprecated_options = {} # type: Dict[str, DeprecatedOption] +# holds deprecated option metadata +_deprecated_options: Dict[str, DeprecatedOption] = {} -# holds registered option metdata -_registered_options = {} # type: Dict[str, RegisteredOption] +# holds registered option metadata +_registered_options: Dict[str, RegisteredOption] = {} # holds the current values for registered options -_global_config = {} # type: Dict[str, str] +_global_config: Dict[str, Any] = {} # keys which have a special meaning -_reserved_keys = ["all"] # type: List[str] +_reserved_keys: List[str] = ["all"] class OptionError(AttributeError, KeyError): @@ -85,7 +85,7 @@ def _get_single_key(pat, silent): if len(keys) == 0: if not silent: _warn_if_deprecated(pat) - raise OptionError("No such keys(s): {pat!r}".format(pat=pat)) + raise OptionError(f"No such keys(s): {repr(pat)}") if len(keys) > 1: raise OptionError("Pattern matched multiple keys") key = keys[0] @@ -116,8 +116,8 @@ def _set_option(*args, **kwargs): silent = kwargs.pop("silent", False) if kwargs: - msg = '_set_option() got an unexpected keyword argument "{kwarg}"' - raise TypeError(msg.format(list(kwargs.keys())[0])) + kwarg = list(kwargs.keys())[0] + raise TypeError(f'_set_option() got an unexpected keyword argument "{kwarg}"') for k, v in zip(args[::2], args[1::2]): key = _get_single_key(k, silent) @@ -197,7 +197,7 @@ def __setattr__(self, key, val): else: raise OptionError("You can only set the value of existing options") - def __getattr__(self, key): + def __getattr__(self, key: str): prefix = object.__getattribute__(self, "prefix") if prefix: prefix += "." 
@@ -412,7 +412,7 @@ def __exit__(self, *args): _set_option(pat, val, silent=True) -def register_option(key, defval, doc="", validator=None, cb=None): +def register_option(key: str, defval: object, doc="", validator=None, cb=None): """Register an option in the package-wide pandas config object Parameters @@ -441,11 +441,9 @@ def register_option(key, defval, doc="", validator=None, cb=None): key = key.lower() if key in _registered_options: - msg = "Option '{key}' has already been registered" - raise OptionError(msg.format(key=key)) + raise OptionError(f"Option '{key}' has already been registered") if key in _reserved_keys: - msg = "Option '{key}' is a reserved key" - raise OptionError(msg.format(key=key)) + raise OptionError(f"Option '{key}' is a reserved key") # the default value should be legal if validator: @@ -455,13 +453,16 @@ def register_option(key, defval, doc="", validator=None, cb=None): path = key.split(".") for k in path: - if not bool(re.match("^" + tokenize.Name + "$", k)): - raise ValueError("{k} is not a valid identifier".format(k=k)) + # NOTE: tokenize.Name is not a public constant + # error: Module has no attribute "Name" [attr-defined] + if not re.match("^" + tokenize.Name + "$", k): # type: ignore + raise ValueError(f"{k} is not a valid identifier") if keyword.iskeyword(k): - raise ValueError("{k} is a python keyword".format(k=k)) + raise ValueError(f"{k} is a python keyword") cursor = _global_config msg = "Path prefix to option '{option}' is already an option" + for i, p in enumerate(path[:-1]): if not isinstance(cursor, dict): raise OptionError(msg.format(option=".".join(path[:i]))) @@ -522,8 +523,7 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None): key = key.lower() if key in _deprecated_options: - msg = "Option '{key}' has already been defined as deprecated." - raise OptionError(msg.format(key=key)) + raise OptionError(f"Option '{key}' has already been defined as deprecated.") _deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver) @@ -621,11 +621,11 @@ def _warn_if_deprecated(key): print(d.msg) warnings.warn(d.msg, FutureWarning) else: - msg = "'{key}' is deprecated".format(key=key) + msg = f"'{key}' is deprecated" if d.removal_ver: - msg += " and will be removed in {version}".format(version=d.removal_ver) + msg += f" and will be removed in {d.removal_ver}" if d.rkey: - msg += ", please use '{rkey}' instead.".format(rkey=d.rkey) + msg += f", please use '{d.rkey}' instead." else: msg += ", please refrain from using it." @@ -640,7 +640,7 @@ def _build_option_description(k): o = _get_registered_option(k) d = _get_deprecated_option(k) - s = "{k} ".format(k=k) + s = f"{k} " if o.doc: s += "\n".join(o.doc.strip().split("\n")) @@ -648,13 +648,12 @@ def _build_option_description(k): s += "No description available." if o: - s += "\n [default: {default}] [currently: {current}]".format( - default=o.defval, current=_get_option(k, True) - ) + s += f"\n [default: {o.defval}] [currently: {_get_option(k, True)}]" if d: + rkey = d.rkey if d.rkey else "" s += "\n (Deprecated" - s += ", use `{rkey}` instead.".format(rkey=d.rkey if d.rkey else "") + s += f", use `{rkey}` instead." 
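register_option, as reworked above, still validates every dotted segment of a key against the non-public tokenize.Name pattern (hence the type: ignore) and against keyword.iskeyword. The same check can be written without the private constant by using str.isidentifier; a small sketch, not the code the diff adds:

import keyword

def validate_option_key(key: str) -> None:
    # "display.max_rows" is fine; "display.1col" and "display.class" are rejected.
    for part in key.lower().split("."):
        if not part.isidentifier():
            raise ValueError(f"{part} is not a valid identifier")
        if keyword.iskeyword(part):
            raise ValueError(f"{part} is a python keyword")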
s += ")" return s @@ -666,7 +665,7 @@ def pp_options_list(keys, width=80, _print=False): from textwrap import wrap from itertools import groupby - def pp(name, ks): + def pp(name: str, ks: Iterable[str]) -> List[str]: pfx = "- " + name + ".[" if name else "" ls = wrap( ", ".join(ks), @@ -679,7 +678,7 @@ def pp(name, ks): ls[-1] = ls[-1] + "]" return ls - ls = [] + ls: List[str] = [] singles = [x for x in sorted(keys) if x.find(".") < 0] if singles: ls += pp("", singles) @@ -731,7 +730,7 @@ def config_prefix(prefix): def wrap(func): def inner(key, *args, **kwds): - pkey = "{prefix}.{key}".format(prefix=prefix, key=key) + pkey = f"{prefix}.{key}" return func(pkey, *args, **kwds) return inner @@ -768,8 +767,7 @@ def is_type_factory(_type): def inner(x): if type(x) != _type: - msg = "Value must have type '{typ!s}'" - raise ValueError(msg.format(typ=_type)) + raise ValueError(f"Value must have type '{_type}'") return inner @@ -792,12 +790,11 @@ def is_instance_factory(_type): _type = tuple(_type) type_repr = "|".join(map(str, _type)) else: - type_repr = "'{typ}'".format(typ=_type) + type_repr = f"'{_type}'" def inner(x): if not isinstance(x, _type): - msg = "Value must be an instance of {type_repr}" - raise ValueError(msg.format(type_repr=type_repr)) + raise ValueError(f"Value must be an instance of {type_repr}") return inner @@ -813,10 +810,10 @@ def inner(x): if not any(c(x) for c in callables): uvals = [str(lval) for lval in legal_values] pp_values = "|".join(uvals) - msg = "Value must be one of {pp_values}" + msg = f"Value must be one of {pp_values}" if len(callables): msg += " or a callable" - raise ValueError(msg.format(pp_values=pp_values)) + raise ValueError(msg) return inner diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index ba60b1e003004..dd1d4948aa6e3 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -161,6 +161,6 @@ def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_gette if prefix is None: return _valid_locales(out_locales, normalize) - pattern = re.compile("{prefix}.*".format(prefix=prefix)) + pattern = re.compile(f"{prefix}.*") found = pattern.findall("\n".join(out_locales)) return _valid_locales(found, normalize) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 30c9af645da22..7a2fc9dc7845a 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -49,8 +49,9 @@ cdef inline bint are_diff(object left, object right): class Infinity: - """ provide a positive Infinity comparison method for ranking """ - + """ + Provide a positive Infinity comparison method for ranking. + """ __lt__ = lambda self, other: False __le__ = lambda self, other: isinstance(other, Infinity) __eq__ = lambda self, other: isinstance(other, Infinity) @@ -61,8 +62,9 @@ class Infinity: class NegInfinity: - """ provide a negative Infinity comparison method for ranking """ - + """ + Provide a negative Infinity comparison method for ranking. 
+ """ __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and not missing.checknull(other)) __le__ = lambda self, other: not missing.checknull(other) @@ -84,8 +86,8 @@ cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr): Returns ------- - result : ndarray[int64_t] - result is sorted + ndarray[int64_t] + An ordered ndarray[int64_t] """ cdef: Py_ssize_t i, n = len(arr) @@ -150,20 +152,27 @@ def is_lexsorted(list_of_arrays: list) -> bint: @cython.wraparound(False) def groupsort_indexer(const int64_t[:] index, Py_ssize_t ngroups): """ - compute a 1-d indexer that is an ordering of the passed index, - ordered by the groups. This is a reverse of the label - factorization process. + Compute a 1-d indexer. + + The indexer is an ordering of the passed index, + ordered by the groups. Parameters ---------- index: int64 ndarray - mappings from group -> position + Mappings from group -> position. ngroups: int64 - number of groups + Number of groups. - return a tuple of (1-d indexer ordered by groups, group counts) - """ + Returns + ------- + tuple + 1-d indexer ordered by groups, group counts. + Notes + ----- + This is a reverse of the label factorization process. + """ cdef: Py_ssize_t i, loc, label, n ndarray[int64_t] counts, where, result @@ -379,13 +388,39 @@ ctypedef fused algos_t: uint8_t +def _validate_limit(nobs: int, limit=None) -> int: + """ + Check that the `limit` argument is a positive integer. + + Parameters + ---------- + nobs : int + limit : object + + Returns + ------- + int + The limit. + """ + if limit is None: + lim = nobs + else: + if not util.is_integer_object(limit): + raise ValueError('Limit must be an integer') + if limit < 1: + raise ValueError('Limit must be greater than 0') + lim = limit + + return lim + + @cython.boundscheck(False) @cython.wraparound(False) def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t, ndim=1] indexer - algos_t cur, next + algos_t cur, next_val int lim, fill_count = 0 nleft = len(old) @@ -393,14 +428,7 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): indexer = np.empty(nright, dtype=np.int64) indexer[:] = -1 - if limit is None: - lim = nright - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit + lim = _validate_limit(nright, limit) if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: return indexer @@ -426,9 +454,9 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): j += 1 break - next = old[i + 1] + next_val = old[i + 1] - while j < nright and cur <= new[j] < next: + while j < nright and cur <= new[j] < next_val: if new[j] == cur: indexer[j] = i elif fill_count < lim: @@ -438,16 +466,14 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): fill_count = 0 i += 1 - cur = next + cur = next_val return indexer @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace(algos_t[:] values, - const uint8_t[:] mask, - limit=None): +def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): cdef: Py_ssize_t i, N algos_t val @@ -459,14 +485,7 @@ def pad_inplace(algos_t[:] values, if N == 0: return - if limit is None: - lim = N - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit + lim = _validate_limit(N, limit) val = values[0] for i in range(N): @@ 
-482,9 +501,7 @@ def pad_inplace(algos_t[:] values, @cython.boundscheck(False) @cython.wraparound(False) -def pad_2d_inplace(algos_t[:, :] values, - const uint8_t[:, :] mask, - limit=None): +def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): cdef: Py_ssize_t i, j, N, K algos_t val @@ -496,14 +513,7 @@ def pad_2d_inplace(algos_t[:, :] values, if N == 0: return - if limit is None: - lim = N - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit + lim = _validate_limit(N, limit) for j in range(K): fill_count = 0 @@ -559,14 +569,7 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): indexer = np.empty(nright, dtype=np.int64) indexer[:] = -1 - if limit is None: - lim = nright - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit + lim = _validate_limit(nright, limit) if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: return indexer @@ -612,9 +615,7 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace(algos_t[:] values, - const uint8_t[:] mask, - limit=None): +def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): cdef: Py_ssize_t i, N algos_t val @@ -626,14 +627,7 @@ def backfill_inplace(algos_t[:] values, if N == 0: return - if limit is None: - lim = N - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit + lim = _validate_limit(N, limit) val = values[N - 1] for i in range(N - 1, -1, -1): @@ -663,14 +657,7 @@ def backfill_2d_inplace(algos_t[:, :] values, if N == 0: return - if limit is None: - lim = N - else: - if not util.is_integer_object(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - lim = limit + lim = _validate_limit(N, limit) for j in range(K): fill_count = 0 @@ -692,7 +679,10 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): """ Returns ------- - is_monotonic_inc, is_monotonic_dec, is_unique + tuple + is_monotonic_inc : bool + is_monotonic_dec : bool + is_unique : bool """ cdef: Py_ssize_t i, n @@ -787,9 +777,8 @@ ctypedef fused rank_t: def rank_1d(rank_t[:] in_arr, ties_method='average', ascending=True, na_option='keep', pct=False): """ - Fast NaN-friendly version of scipy.stats.rankdata + Fast NaN-friendly version of ``scipy.stats.rankdata``. """ - cdef: Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 @@ -1008,9 +997,8 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', ascending=True, na_option='keep', pct=False): """ - Fast NaN-friendly version of scipy.stats.rankdata + Fast NaN-friendly version of ``scipy.stats.rankdata``. 
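Every fill routine above (pad, backfill and their 1-d/2-d in-place variants) now funnels its limit argument through the single _validate_limit helper instead of repeating the same four-branch check. In plain Python the forward-fill loop that consumes it looks roughly like this; a sketch of the semantics, not the fused-type Cython kernels:

import numpy as np

def validate_limit(nobs: int, limit=None) -> int:
    if limit is None:
        return nobs
    if not isinstance(limit, (int, np.integer)):
        raise ValueError("Limit must be an integer")
    if limit < 1:
        raise ValueError("Limit must be greater than 0")
    return limit

def pad_inplace(values: np.ndarray, mask: np.ndarray, limit=None) -> None:
    # Forward-fill masked positions in place, filling at most `limit`
    # consecutive gaps after the last observed value.
    if len(values) == 0:
        return
    lim = validate_limit(len(values), limit)
    val, fill_count = values[0], 0
    for i in range(len(values)):
        if mask[i]:
            if fill_count >= lim:
                continue
            fill_count += 1
            values[i] = val
        else:
            fill_count = 0
            val = values[i]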
""" - cdef: Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 @@ -1173,6 +1161,77 @@ def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', return ranks +ctypedef fused diff_t: + float64_t + float32_t + int8_t + int16_t + int32_t + int64_t + +ctypedef fused out_t: + float32_t + float64_t + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d(ndarray[diff_t, ndim=2] arr, + ndarray[out_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy, start, stop + bint f_contig = arr.flags.f_contiguous + + # Disable for unsupported dtype combinations, + # see https://github.com/cython/cython/issues/2646 + if (out_t is float32_t + and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): + raise NotImplementedError + elif (out_t is float64_t + and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): + raise NotImplementedError + else: + # We put this inside an indented else block to avoid cython build + # warnings about unreachable code + sx, sy = (arr).shape + with nogil: + if f_contig: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + # generated from template include "algos_common_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index c3b0a84067f92..5bfc594602dd8 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -4,75 +4,6 @@ Template for each `dtype` helper function using 1-d template WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -ctypedef fused diff_t: - float64_t - float32_t - int8_t - int16_t - int32_t - int64_t - -ctypedef fused out_t: - float32_t - float64_t - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d(ndarray[diff_t, ndim=2] arr, - ndarray[out_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - # Disable for unsupported dtype combinations, - # see https://github.com/cython/cython/issues/2646 - if (out_t is float32_t - and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): - raise NotImplementedError - elif (out_t is float64_t - and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): - raise NotImplementedError - else: - # We put this inside an indented else block to avoid cython build - # warnings about unreachable code - sx, sy = (arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in 
range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - # ---------------------------------------------------------------------- # ensure_dtype # ---------------------------------------------------------------------- diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 9dbae8170cbd0..995fabbedcb5d 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -10,69 +10,119 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# c_type_in, c_type_out, preval, postval +# c_type_in, c_type_out dtypes = [ - ('uint8_t', 'uint8_t', '', ''), - ('uint8_t', 'object', 'True if ', ' > 0 else False'), - ('int8_t', 'int8_t', '', ''), - ('int8_t', 'int32_t', '', ''), - ('int8_t', 'int64_t', '', ''), - ('int8_t', 'float64_t', '', ''), - ('int16_t', 'int16_t', '', ''), - ('int16_t', 'int32_t', '', ''), - ('int16_t', 'int64_t', '', ''), - ('int16_t', 'float64_t', '', ''), - ('int32_t', 'int32_t', '', ''), - ('int32_t', 'int64_t', '', ''), - ('int32_t', 'float64_t', '', ''), - ('int64_t', 'int64_t', '', ''), - ('int64_t', 'float64_t', '', ''), - ('float32_t', 'float32_t', '', ''), - ('float32_t', 'float64_t', '', ''), - ('float64_t', 'float64_t', '', ''), - ('object', 'object', '', ''), + ('uint8_t', 'uint8_t'), + ('uint8_t', 'object'), + ('int8_t', 'int8_t'), + ('int8_t', 'int32_t'), + ('int8_t', 'int64_t'), + ('int8_t', 'float64_t'), + ('int16_t', 'int16_t'), + ('int16_t', 'int32_t'), + ('int16_t', 'int64_t'), + ('int16_t', 'float64_t'), + ('int32_t', 'int32_t'), + ('int32_t', 'int64_t'), + ('int32_t', 'float64_t'), + ('int64_t', 'int64_t'), + ('int64_t', 'float64_t'), + ('float32_t', 'float32_t'), + ('float32_t', 'float64_t'), + ('float64_t', 'float64_t'), + ('object', 'object'), ] def get_dispatch(dtypes): - inner_take_1d_template = """ + for (c_type_in, c_type_out) in dtypes: + + def get_name(dtype_name): + if dtype_name == "object": + return "object" + if dtype_name == "uint8_t": + return "bool" + return dtype_name[:-2] + + name = get_name(c_type_in) + dest = get_name(c_type_out) + + args = dict(name=name, dest=dest, c_type_in=c_type_in, + c_type_out=c_type_out) + + yield (name, dest, c_type_in, c_type_out) + +}} + + +{{for name, dest, c_type_in, c_type_out in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if c_type_in != "object"}} +def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values, +{{else}} +def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, +{{endif}} + const int64_t[:] indexer, + {{c_type_out}}[:] out, + fill_value=np.nan): + cdef: Py_ssize_t i, n, idx - %(c_type_out)s fv + {{c_type_out}} fv n = indexer.shape[0] fv = fill_value - %(nogil_str)s - %(tab)sfor i in range(n): - %(tab)s idx = indexer[i] - %(tab)s if idx == -1: - %(tab)s out[i] = fv - %(tab)s else: - %(tab)s out[i] = %(preval)svalues[idx]%(postval)s -""" + {{if c_type_out != "object"}} + with nogil: + {{else}} + if True: + {{endif}} + for i in range(n): + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + {{if c_type_in == "uint8_t" and c_type_out == 
"object"}} + out[i] = True if values[idx] > 0 else False + {{else}} + out[i] = values[idx] + {{endif}} + - inner_take_2d_axis0_template = """\ +@cython.wraparound(False) +@cython.boundscheck(False) +{{if c_type_in != "object"}} +def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, +{{else}} +def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, +{{endif}} + ndarray[int64_t] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): cdef: Py_ssize_t i, j, k, n, idx - %(c_type_out)s fv + {{c_type_out}} fv n = len(indexer) k = values.shape[1] fv = fill_value - IF %(can_copy)s: + IF {{True if c_type_in == c_type_out != "object" else False}}: cdef: - %(c_type_out)s *v - %(c_type_out)s *o + const {{c_type_out}} *v + {{c_type_out}} *o - #GH3130 + # GH#3130 if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(%(c_type_out)s) and - sizeof(%(c_type_out)s) * n >= 256): + values.strides[1] == sizeof({{c_type_out}}) and + sizeof({{c_type_out}}) * n >= 256): for i in range(n): idx = indexer[i] @@ -82,7 +132,7 @@ def get_dispatch(dtypes): else: v = &values[idx, 0] o = &out[i, 0] - memmove(o, v, (sizeof(%(c_type_out)s) * k)) + memmove(o, v, (sizeof({{c_type_out}}) * k)) return for i in range(n): @@ -92,13 +142,27 @@ def get_dispatch(dtypes): out[i, j] = fv else: for j in range(k): - out[i, j] = %(preval)svalues[idx, j]%(postval)s -""" + {{if c_type_in == "uint8_t" and c_type_out == "object"}} + out[i, j] = True if values[idx, j] > 0 else False + {{else}} + out[i, j] = values[idx, j] + {{endif}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +{{if c_type_in != "object"}} +def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values, +{{else}} +def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, +{{endif}} + ndarray[int64_t] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): - inner_take_2d_axis1_template = """\ cdef: Py_ssize_t i, j, k, n, idx - %(c_type_out)s fv + {{c_type_out}} fv n = len(values) k = len(indexer) @@ -114,132 +178,11 @@ def get_dispatch(dtypes): if idx == -1: out[i, j] = fv else: - out[i, j] = %(preval)svalues[i, idx]%(postval)s -""" - - for (c_type_in, c_type_out, preval, postval) in dtypes: - - can_copy = c_type_in == c_type_out != "object" - nogil = c_type_out != "object" - if nogil: - nogil_str = "with nogil:" - tab = ' ' - else: - nogil_str = '' - tab = '' - - def get_name(dtype_name): - if dtype_name == "object": - return "object" - if dtype_name == "uint8_t": - return "bool" - return dtype_name[:-2] - - name = get_name(c_type_in) - dest = get_name(c_type_out) - - args = dict(name=name, dest=dest, c_type_in=c_type_in, - c_type_out=c_type_out, preval=preval, postval=postval, - can_copy=can_copy, nogil_str=nogil_str, tab=tab) - - inner_take_1d = inner_take_1d_template % args - inner_take_2d_axis0 = inner_take_2d_axis0_template % args - inner_take_2d_axis1 = inner_take_2d_axis1_template % args - - yield (name, dest, c_type_in, c_type_out, preval, postval, - inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1) - -}} - - -{{for name, dest, c_type_in, c_type_out, preval, postval, - inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1 - in get_dispatch(dtypes)}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values, - const int64_t[:] indexer, - {{c_type_out}}[:] out, - fill_value=np.nan): - - -{{inner_take_1d}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def 
take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, - const int64_t[:] indexer, - {{c_type_out}}[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_{{name}}_{{dest}}_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. -{{inner_take_1d}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, - const int64_t[:] indexer, - {{c_type_out}}[:, :] out, - fill_value=np.nan): -{{inner_take_2d_axis0}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, - ndarray[int64_t] indexer, - {{c_type_out}}[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_{{name}}_{{dest}}_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. -{{inner_take_2d_axis0}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, - const int64_t[:] indexer, - {{c_type_out}}[:, :] out, - fill_value=np.nan): -{{inner_take_2d_axis1}} - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, - ndarray[int64_t] indexer, - {{c_type_out}}[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_{{name}}_{{dest}}_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
-{{inner_take_2d_axis1}} + {{if c_type_in == "uint8_t" and c_type_out == "object"}} + out[i, j] = True if values[i, idx] > 0 else False + {{else}} + out[i, j] = values[i, idx] + {{endif}} @cython.wraparound(False) @@ -268,7 +211,11 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, if idx1[j] == -1: out[i, j] = fv else: - out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}} + {{if c_type_in == "uint8_t" and c_type_out == "object"}} + out[i, j] = True if values[idx, idx1[j]] > 0 else False + {{else}} + out[i, j] = values[idx, idx1[j]] + {{endif}} {{endfor}} diff --git a/pandas/_libs/groupby.pxd b/pandas/_libs/groupby.pxd deleted file mode 100644 index 70ad8a62871e9..0000000000000 --- a/pandas/_libs/groupby.pxd +++ /dev/null @@ -1,6 +0,0 @@ -cdef enum InterpolationEnumType: - INTERPOLATION_LINEAR, - INTERPOLATION_LOWER, - INTERPOLATION_HIGHER, - INTERPOLATION_NEAREST, - INTERPOLATION_MIDPOINT diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 49a335218db96..abb8a6d388d26 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -27,6 +27,13 @@ _int64_max = np.iinfo(np.int64).max cdef float64_t NaN = np.NaN +cdef enum InterpolationEnumType: + INTERPOLATION_LINEAR, + INTERPOLATION_LOWER, + INTERPOLATION_HIGHER, + INTERPOLATION_NEAREST, + INTERPOLATION_MIDPOINT + cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef: @@ -168,7 +175,6 @@ def group_cumprod_float64(float64_t[:, :] out, ----- This method modifies the `out` parameter, rather than returning an object. """ - cdef: Py_ssize_t i, j, N, K, size float64_t val @@ -226,7 +232,6 @@ def group_cumsum(numeric[:, :] out, ----- This method modifies the `out` parameter, rather than returning an object. """ - cdef: Py_ssize_t i, j, N, K, size numeric val @@ -746,8 +751,7 @@ def group_quantile(ndarray[float64_t] out, assert values.shape[0] == N if not (0 <= q <= 1): - raise ValueError("'q' must be between 0 and 1. Got" - " '{}' instead".format(q)) + raise ValueError(f"'q' must be between 0 and 1. Got '{q}' instead") inter_methods = { 'linear': INTERPOLATION_LINEAR, @@ -787,7 +791,7 @@ def group_quantile(ndarray[float64_t] out, out[i] = NaN else: # Calculate where to retrieve the desired value - # Casting to int will intentionaly truncate result + # Casting to int will intentionally truncate result idx = grp_start + (q * (non_na_sz - 1)) val = values[sort_arr[idx]] @@ -1398,7 +1402,6 @@ def group_cummin(groupby_t[:, :] out, ----- This method modifies the `out` parameter, rather than returning an object. """ - cdef: Py_ssize_t i, j, N, K, size groupby_t val, mval @@ -1459,7 +1462,6 @@ def group_cummax(groupby_t[:, :] out, ----- This method modifies the `out` parameter, rather than returning an object. """ - cdef: Py_ssize_t i, j, N, K, size groupby_t val, mval diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 6b27b2204e75e..5298d8c5ed34e 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -25,13 +25,17 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): Returns ------- - 1-d uint64 ndarray of hashes + 1-d uint64 ndarray of hashes. + + Raises + ------ + TypeError + If the array contains mixed types. Notes ----- - allowed values must be strings, or nulls - mixed array types will raise TypeError - + Allowed values must be strings, or nulls + mixed array types will raise TypeError. 
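The reworked take templates above drop the old string-substituted inner bodies: each generated take_1d_<src>_<dst> now spells out the same loop directly, copying values[idx] (converted to True/False for the uint8-to-object case) and writing fill_value wherever the indexer holds -1. Stripped of the tempita conditionals and typed memoryviews, the 1-d case reduces to this sketch:

import numpy as np

def take_1d(values: np.ndarray, indexer: np.ndarray, fill_value=np.nan) -> np.ndarray:
    # -1 in the indexer means "missing": emit fill_value instead of a lookup.
    out = np.empty(len(indexer), dtype=object)
    for i, idx in enumerate(indexer):
        out[i] = fill_value if idx == -1 else values[idx]
    return out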
""" cdef: Py_ssize_t i, l, n @@ -48,7 +52,7 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): kb = k if len(k) != 16: raise ValueError("key should be a 16-byte string encoded, " - "got {key} (len {klen})".format(key=k, klen=len(k))) + f"got {k} (len {len(k)})") n = len(arr) @@ -66,16 +70,21 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): # null, stringify and encode data = str(val).encode(encoding) + elif isinstance(val, tuple): + # GH#28969 we could have a tuple, but need to ensure that + # the tuple entries are themselves hashable before converting + # to str + hash(val) + data = str(val).encode(encoding) else: - raise TypeError("{val} of type {typ} is not a valid type " - "for hashing, must be string or null" - .format(val=val, typ=type(val))) + raise TypeError(f"{val} of type {type(val)} is not a valid type " + "for hashing, must be string or null") l = len(data) lens[i] = l cdata = data - # keep the references alive thru the end of the + # keep the references alive through the end of the # function datas.append(data) vecs[i] = cdata diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 51ec4ba43159c..0499eabf708af 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -36,8 +36,8 @@ cdef class PyObjectHashTable(HashTable): cdef class StringHashTable(HashTable): cdef kh_str_t *table - cpdef get_item(self, object val) - cpdef set_item(self, object key, Py_ssize_t val) + cpdef get_item(self, str val) + cpdef set_item(self, str key, Py_ssize_t val) cdef struct Int64VectorData: int64_t *data diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 8179822b9e10c..59ba1705d2dbb 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,7 +1,7 @@ cimport cython from cpython.ref cimport PyObject, Py_INCREF -from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free +from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.stdlib cimport malloc, free @@ -142,7 +142,7 @@ cdef class Int64Factorizer: @cython.boundscheck(False) def unique_label_indices(const int64_t[:] labels): """ - indices of the first occurrences of the unique labels + Indices of the first occurrences of the unique labels *excluding* -1. 
equivalent to: np.unique(labels, return_index=True)[1] """ diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c39d6d60d4ea5..7d57c67e70b58 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -100,7 +100,7 @@ cdef class {{name}}Vector: PyMem_Free(self.data) self.data = NULL - def __len__(self): + def __len__(self) -> int: return self.data.n cpdef to_array(self): @@ -168,7 +168,7 @@ cdef class StringVector: PyMem_Free(self.data) self.data = NULL - def __len__(self): + def __len__(self) -> int: return self.data.n def to_array(self): @@ -212,7 +212,7 @@ cdef class ObjectVector: self.ao = np.empty(_INIT_VEC_CAP, dtype=object) self.data = self.ao.data - def __len__(self): + def __len__(self) -> int: return self.n cdef inline append(self, object obj): @@ -270,7 +270,7 @@ cdef class {{name}}HashTable(HashTable): size_hint = min(size_hint, _SIZE_HINT_LIMIT) kh_resize_{{dtype}}(self.table, size_hint) - def __len__(self): + def __len__(self) -> int: return self.table.size def __dealloc__(self): @@ -599,7 +599,7 @@ cdef class StringHashTable(HashTable): sizeof(Py_ssize_t) + # vals sizeof(uint32_t)) # flags - cpdef get_item(self, object val): + cpdef get_item(self, str val): cdef: khiter_t k const char *v @@ -611,16 +611,16 @@ cdef class StringHashTable(HashTable): else: raise KeyError(val) - cpdef set_item(self, object key, Py_ssize_t val): + cpdef set_item(self, str key, Py_ssize_t val): cdef: khiter_t k int ret = 0 const char *v - v = get_c_string(val) + v = get_c_string(key) k = kh_put_str(self.table, v, &ret) - self.table.keys[k] = key + self.table.keys[k] = v if kh_exist_str(self.table, k): self.table.vals[k] = val else: @@ -784,7 +784,7 @@ cdef class StringHashTable(HashTable): labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. 
- v = get_c_string(val) + v = get_c_string(val) vecs[i] = v # compute @@ -897,7 +897,7 @@ cdef class PyObjectHashTable(HashTable): kh_destroy_pymap(self.table) self.table = NULL - def __len__(self): + def __len__(self) -> int: return self.table.size def __contains__(self, object key): diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index c4284ae403e5c..f8f3858b803a5 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -144,13 +144,13 @@ def duplicated_{{dtype}}({{c_type}}[:] values, object keep='first'): if keep == 'last': {{if dtype == 'object'}} for i in range(n - 1, -1, -1): - # equivalent: range(n)[::-1], which cython doesnt like in nogil + # equivalent: range(n)[::-1], which cython doesn't like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{else}} with nogil: for i in range(n - 1, -1, -1): - # equivalent: range(n)[::-1], which cython doesnt like in nogil + # equivalent: range(n)[::-1], which cython doesn't like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{endif}} diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index cc114b48a5b53..ac8172146d351 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -52,7 +52,7 @@ cpdef get_value_at(ndarray arr, object loc, object tz=None): # Don't populate hash tables in monotonic indexes larger than this -_SIZE_CUTOFF = 1000000 +_SIZE_CUTOFF = 1_000_000 cdef class IndexEngine: @@ -79,6 +79,8 @@ cdef class IndexEngine: cpdef get_value(self, ndarray arr, object key, object tz=None): """ + Parameters + ---------- arr : 1-dimensional ndarray """ cdef: @@ -93,6 +95,8 @@ cdef class IndexEngine: cpdef set_value(self, ndarray arr, object key, object value): """ + Parameters + ---------- arr : 1-dimensional ndarray """ cdef: @@ -109,7 +113,7 @@ cdef class IndexEngine: Py_ssize_t loc if is_definitely_invalid_key(val): - raise TypeError("'{val}' is an invalid key".format(val=val)) + raise TypeError(f"'{val}' is an invalid key") if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: @@ -141,8 +145,12 @@ cdef class IndexEngine: if self.is_monotonic_increasing: values = self._get_index_values() - left = values.searchsorted(val, side='left') - right = values.searchsorted(val, side='right') + try: + left = values.searchsorted(val, side='left') + right = values.searchsorted(val, side='right') + except TypeError: + # e.g. 
GH#29189 get_loc(None) with a Float64Index + raise KeyError(val) diff = right - left if diff == 0: @@ -279,11 +287,12 @@ cdef class IndexEngine: return self.mapping.lookup(values) def get_indexer_non_unique(self, targets): - """ return an indexer suitable for takng from a non unique index - return the labels in the same order ast the target - and a missing indexer into the targets (which correspond - to the -1 indices in the results """ - + """ + Return an indexer suitable for taking from a non unique index + return the labels in the same order ast the target + and a missing indexer into the targets (which correspond + to the -1 indices in the results + """ cdef: ndarray values, x ndarray[int64_t] result, missing @@ -298,8 +307,8 @@ cdef class IndexEngine: stargets = set(targets) n = len(values) n_t = len(targets) - if n > 10000: - n_alloc = 10000 + if n > 10_000: + n_alloc = 10_000 else: n_alloc = n @@ -341,7 +350,7 @@ cdef class IndexEngine: # realloc if needed if count >= n_alloc: - n_alloc += 10000 + n_alloc += 10_000 result = np.resize(result, n_alloc) result[count] = j @@ -351,7 +360,7 @@ cdef class IndexEngine: else: if count >= n_alloc: - n_alloc += 10000 + n_alloc += 10_000 result = np.resize(result, n_alloc) result[count] = -1 count += 1 @@ -389,7 +398,7 @@ cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: cdef class ObjectEngine(IndexEngine): """ - Index Engine for use with object-dtype Index, namely the base class Index + Index Engine for use with object-dtype Index, namely the base class Index. """ cdef _make_hash_table(self, Py_ssize_t n): return _hash.PyObjectHashTable(n) @@ -556,8 +565,8 @@ cpdef convert_scalar(ndarray arr, object value): pass elif value is None or value != value: return np.datetime64("NaT", "ns") - raise ValueError("cannot set a Timestamp with a non-timestamp {typ}" - .format(typ=type(value).__name__)) + raise ValueError("cannot set a Timestamp with a non-timestamp " + f"{type(value).__name__}") elif arr.descr.type_num == NPY_TIMEDELTA: if util.is_array(value): @@ -573,17 +582,17 @@ cpdef convert_scalar(ndarray arr, object value): pass elif value is None or value != value: return np.timedelta64("NaT", "ns") - raise ValueError("cannot set a Timedelta with a non-timedelta {typ}" - .format(typ=type(value).__name__)) + raise ValueError("cannot set a Timedelta with a non-timedelta " + f"{type(value).__name__}") if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and not issubclass(arr.dtype.type, np.bool_)): if util.is_bool_object(value): - raise ValueError('Cannot assign bool to float/integer series') + raise ValueError("Cannot assign bool to float/integer series") if issubclass(arr.dtype.type, (np.integer, np.bool_)): if util.is_float_object(value) and value != value: - raise ValueError('Cannot assign nan to integer series') + raise ValueError("Cannot assign nan to integer series") return value @@ -621,13 +630,12 @@ cdef class BaseMultiIndexCodesEngine: Parameters ---------- levels : list-like of numpy arrays - Levels of the MultiIndex + Levels of the MultiIndex. labels : list-like of numpy arrays of integer dtype - Labels of the MultiIndex + Labels of the MultiIndex. offsets : numpy array of uint64 dtype - Pre-calculated offsets, one for each level of the index + Pre-calculated offsets, one for each level of the index. 
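Two behavioural details sit in the IndexEngine hunks above: lookups on a monotonic index that make searchsorted raise TypeError (GH#29189, e.g. get_loc(None) on a Float64Index) are now reported as KeyError, and the non-unique indexer grows its buffer in steps of 10_000, written with the new underscore literal. The monotonic lookup path itself is just a left/right searchsorted pair; roughly, and assuming a sorted 1-d array:

import numpy as np

def monotonic_get_loc(values: np.ndarray, val):
    # Sketch of the monotonic-increasing branch of IndexEngine's lookup.
    try:
        left = values.searchsorted(val, side="left")
        right = values.searchsorted(val, side="right")
    except TypeError:
        # e.g. val is None against a float index: treat as not found
        raise KeyError(val)
    if right == left:
        raise KeyError(val)
    if right - left == 1:
        return int(left)               # unique match: integer position
    return slice(left, right)          # run of duplicates: slice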
""" - self.levels = levels self.offsets = offsets @@ -660,7 +668,6 @@ cdef class BaseMultiIndexCodesEngine: int_keys : 1-dimensional array of dtype uint64 or object Integers representing one combination each """ - level_codes = [lev.get_indexer(codes) + 1 for lev, codes in zip(self.levels, zip(*target))] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) @@ -677,7 +684,7 @@ cdef class BaseMultiIndexCodesEngine: # Index._get_fill_indexer), sort (integer representations of) keys: order = np.argsort(lab_ints) lab_ints = lab_ints[order] - indexer = (getattr(self._base, 'get_{}_indexer'.format(method)) + indexer = (getattr(self._base, f'get_{method}_indexer') (self, lab_ints, limit=limit)) indexer = indexer[order] else: @@ -687,7 +694,7 @@ cdef class BaseMultiIndexCodesEngine: def get_loc(self, object key): if is_definitely_invalid_key(key): - raise TypeError("'{key}' is an invalid key".format(key=key)) + raise TypeError(f"'{key}' is an invalid key") if not isinstance(key, tuple): raise KeyError(key) try: diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index 308e914b7b5b7..01f4fb060d982 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -11,14 +11,13 @@ cdef class _NDFrameIndexerBase: self._ndim = None @property - def ndim(self): + def ndim(self) -> int: # Delay `ndim` instantiation until required as reading it # from `obj` isn't entirely cheap. ndim = self._ndim if ndim is None: ndim = self._ndim = self.obj.ndim if ndim > 2: - msg = ("NDFrameIndexer does not support NDFrame objects with" - " ndim > 2") - raise ValueError(msg) + raise ValueError("NDFrameIndexer does not support " + "NDFrame objects with ndim > 2") return ndim diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index ff143fea892ae..8bbbc6db94842 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,7 +1,7 @@ import cython from cython import Py_ssize_t -from cpython.object cimport PyObject +from cpython.slice cimport PySlice_GetIndicesEx cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX @@ -9,13 +9,6 @@ cdef extern from "Python.h": import numpy as np from numpy cimport int64_t -cdef extern from "compat_helper.h": - cdef int slice_get_indices(PyObject* s, Py_ssize_t length, - Py_ssize_t *start, Py_ssize_t *stop, - Py_ssize_t *step, - Py_ssize_t *slicelength) except -1 - - from pandas._libs.algos import ensure_int64 @@ -53,7 +46,7 @@ cdef class BlockPlacement: self._as_array = arr self._has_array = True - def __str__(self): + def __str__(self) -> str: cdef: slice s = self._ensure_has_slice() if s is not None: @@ -61,12 +54,12 @@ cdef class BlockPlacement: else: v = self._as_array - return '%s(%r)' % (self.__class__.__name__, v) + return f'{type(self).__name__}({v})' - def __repr__(self): + def __repr__(self) -> str: return str(self) - def __len__(self): + def __len__(self) -> int: cdef: slice s = self._ensure_has_slice() if s is not None: @@ -85,7 +78,7 @@ cdef class BlockPlacement: return iter(self._as_array) @property - def as_slice(self): + def as_slice(self) -> slice: cdef: slice s = self._ensure_has_slice() if s is None: @@ -103,7 +96,7 @@ cdef class BlockPlacement: return self._as_array def isin(self, arr): - from pandas.core.index import Int64Index + from pandas.core.indexes.api import Int64Index return Int64Index(self.as_array, copy=False).isin(arr) @property @@ -118,7 +111,7 @@ cdef class BlockPlacement: return self._as_array @property - def is_slice_like(self): + def is_slice_like(self) -> bool: cdef: slice s = 
self._ensure_has_slice() return s is not None @@ -250,7 +243,6 @@ cpdef Py_ssize_t slice_len( - if ``s.step < 0``, ``s.start`` is not ``None`` Otherwise, the result is unreliable. - """ cdef: Py_ssize_t start, stop, step, length @@ -258,8 +250,8 @@ cpdef Py_ssize_t slice_len( if slc is None: raise TypeError("slc must be slice") - slice_get_indices(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) return length @@ -270,7 +262,6 @@ cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): If `objlen` is not specified, slice must be bounded, otherwise the result will be wrong. - """ cdef: Py_ssize_t start, stop, step, length @@ -278,8 +269,8 @@ cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): if slc is None: raise TypeError("slc should be a slice") - slice_get_indices(slc, objlen, - &start, &stop, &step, &length) + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) return start, stop, step, length @@ -372,7 +363,6 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): Returns ------- iter : iterator of (int, slice or array) - """ # There's blkno in this function's name because it's used in block & # blockno handling. @@ -441,20 +431,18 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): yield blkno, result -def get_blkno_placements(blknos, group=True): +def get_blkno_placements(blknos, group: bool = True): """ - Parameters ---------- blknos : array of int64 - group : bool + group : bool, default True Returns ------- iterator yield (BlockPlacement, blkno) """ - blknos = ensure_int64(blknos) for blkno, indexer in get_blkno_indexers(blknos, group): diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 1a712d0c4efa8..1166768472449 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -41,8 +41,7 @@ cdef class IntervalMixin: Returns ------- bool - ``True`` if the Interval is closed on the left-side, else - ``False``. + True if the Interval is closed on the left-side. """ return self.closed in ('left', 'both') @@ -56,8 +55,7 @@ cdef class IntervalMixin: Returns ------- bool - ``True`` if the Interval is closed on the left-side, else - ``False``. + True if the Interval is closed on the left-side. """ return self.closed in ('right', 'both') @@ -71,8 +69,7 @@ cdef class IntervalMixin: Returns ------- bool - ``True`` if the Interval is closed on the left-side, else - ``False``. + True if the Interval is closed on the left-side. """ return not self.closed_left @@ -86,15 +83,14 @@ cdef class IntervalMixin: Returns ------- bool - ``True`` if the Interval is closed on the left-side, else - ``False``. + True if the Interval is closed on the left-side. """ return not self.closed_right @property def mid(self): """ - Return the midpoint of the Interval + Return the midpoint of the Interval. """ try: return 0.5 * (self.left + self.right) @@ -104,7 +100,9 @@ cdef class IntervalMixin: @property def length(self): - """Return the length of the Interval""" + """ + Return the length of the Interval. + """ return self.right - self.left @property @@ -177,8 +175,8 @@ cdef class IntervalMixin: When `other` is not closed exactly the same as self. """ if self.closed != other.closed: - msg = "'{}.closed' is '{}', expected '{}'." 
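internals.pyx above swaps the vendored compat_helper slice_get_indices for CPython's own PySlice_GetIndicesEx, which resolves a slice against a container length into concrete start, stop, step and the number of selected items. At the Python level the same computation is slice.indices(); a short sketch of slice_len built on it:

def slice_len(slc: slice, objlen: int = 2**63 - 1) -> int:
    # How many items `slc` selects from a sequence of length `objlen`.
    if slc is None:
        raise TypeError("slc must be slice")
    start, stop, step = slc.indices(objlen)
    return len(range(start, stop, step))

slice_len(slice(None), 10)        # 10
slice_len(slice(2, None, 3), 10)  # 3 (positions 2, 5, 8)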
- raise ValueError(msg.format(name, other.closed, self.closed)) + raise ValueError(f"'{name}.closed' is {repr(other.closed)}, " + f"expected {repr(self.closed)}.") cdef _interval_like(other): @@ -283,15 +281,19 @@ cdef class Interval(IntervalMixin): _typ = "interval" cdef readonly object left - """Left bound for the interval""" + """ + Left bound for the interval. + """ cdef readonly object right - """Right bound for the interval""" + """ + Right bound for the interval. + """ cdef readonly str closed """ Whether the interval is closed on the left-side, right-side, both or - neither + neither. """ def __init__(self, left, right, str closed='right'): @@ -302,17 +304,14 @@ cdef class Interval(IntervalMixin): self._validate_endpoint(right) if closed not in _VALID_CLOSED: - msg = "invalid option for 'closed': {closed}".format(closed=closed) - raise ValueError(msg) + raise ValueError(f"invalid option for 'closed': {closed}") if not left <= right: - raise ValueError('left side of interval must be <= right side') + raise ValueError("left side of interval must be <= right side") if (isinstance(left, Timestamp) and not tz_compare(left.tzinfo, right.tzinfo)): # GH 18538 - msg = ("left and right must have the same time zone, got " - "'{left_tz}' and '{right_tz}'") - raise ValueError(msg.format(left_tz=left.tzinfo, - right_tz=right.tzinfo)) + raise ValueError("left and right must have the same time zone, got " + f"{repr(left.tzinfo)}' and {repr(right.tzinfo)}") self.left = left self.right = right self.closed = closed @@ -321,16 +320,15 @@ cdef class Interval(IntervalMixin): # GH 23013 if not (is_integer_object(endpoint) or is_float_object(endpoint) or isinstance(endpoint, (Timestamp, Timedelta))): - msg = ("Only numeric, Timestamp and Timedelta endpoints " - "are allowed when constructing an Interval.") - raise ValueError(msg) + raise ValueError("Only numeric, Timestamp and Timedelta endpoints " + "are allowed when constructing an Interval.") def __hash__(self): return hash((self.left, self.right, self.closed)) - def __contains__(self, key): + def __contains__(self, key) -> bool: if _interval_like(key): - raise TypeError('__contains__ not defined for two intervals') + raise TypeError("__contains__ not defined for two intervals") return ((self.left < key if self.open_left else self.left <= key) and (key < self.right if self.open_right else key <= self.right)) @@ -353,8 +351,7 @@ cdef class Interval(IntervalMixin): name = type(self).__name__ other = type(other).__name__ op_str = {Py_LT: '<', Py_LE: '<=', Py_GT: '>', Py_GE: '>='}[op] - raise TypeError('unorderable types: {name}() {op} {other}()' - .format(name=name, op=op_str, other=other)) + raise TypeError(f"unorderable types: {name}() {op_str} {other}()") def __reduce__(self): args = (self.left, self.right, self.closed) @@ -371,21 +368,19 @@ cdef class Interval(IntervalMixin): return left, right - def __repr__(self): + def __repr__(self) -> str: left, right = self._repr_base() name = type(self).__name__ - repr_str = '{name}({left!r}, {right!r}, closed={closed!r})'.format( - name=name, left=left, right=right, closed=self.closed) + repr_str = f'{name}({repr(left)}, {repr(right)}, closed={repr(self.closed)})' return repr_str - def __str__(self): + def __str__(self) -> str: left, right = self._repr_base() start_symbol = '[' if self.closed_left else '(' end_symbol = ']' if self.closed_right else ')' - return '{start}{left}, {right}{end}'.format( - start=start_symbol, left=left, right=right, end=end_symbol) + return f'{start_symbol}{left}, 
{right}{end_symbol}' def __add__(self, y): if isinstance(y, numbers.Number): @@ -435,12 +430,12 @@ cdef class Interval(IntervalMixin): Parameters ---------- other : Interval - The interval to check against for an overlap. + Interval to check against for an overlap. Returns ------- bool - ``True`` if the two intervals overlap, else ``False``. + True if the two intervals overlap. See Also -------- @@ -471,8 +466,8 @@ cdef class Interval(IntervalMixin): False """ if not isinstance(other, Interval): - msg = '`other` must be an Interval, got {other}' - raise TypeError(msg.format(other=type(other).__name__)) + raise TypeError("`other` must be an Interval, " + f"got {type(other).__name__}") # equality is okay if both endpoints are closed (overlap at a point) op1 = le if (self.closed_left and other.closed_right) else lt @@ -492,20 +487,19 @@ def intervals_to_interval_bounds(ndarray intervals, Parameters ---------- intervals : ndarray - object array of Intervals / nulls + Object array of Intervals / nulls. - validate_closed: boolean, default True - boolean indicating if all intervals must be closed on the same side. + validate_closed: bool, default True + Boolean indicating if all intervals must be closed on the same side. Mismatching closed will raise if True, else return None for closed. Returns ------- - tuples (left: ndarray object array, - right: ndarray object array, - closed: str) - + tuple of tuples + left : (ndarray, object, array) + right : (ndarray, object, array) + closed: str """ - cdef: object closed = None, interval int64_t n = len(intervals) @@ -523,8 +517,8 @@ def intervals_to_interval_bounds(ndarray intervals, continue if not isinstance(interval, Interval): - raise TypeError("type {typ} with value {iv} is not an interval" - .format(typ=type(interval), iv=interval)) + raise TypeError(f"type {type(interval)} with value " + f"{interval} is not an interval") left[i] = interval.left right[i] = interval.right @@ -534,8 +528,7 @@ def intervals_to_interval_bounds(ndarray intervals, elif closed != interval.closed: closed = None if validate_closed: - msg = 'intervals must all be closed on the same side' - raise ValueError(msg) + raise ValueError("intervals must all be closed on the same side") return left, right, closed diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 6e3be19f2b73e..d09413bfa5210 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -6,12 +6,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in from pandas._libs.algos import is_monotonic -ctypedef fused scalar_t: - float64_t - float32_t +ctypedef fused int_scalar_t: int64_t - int32_t + float64_t + +ctypedef fused uint_scalar_t: uint64_t + float64_t + +ctypedef fused scalar_t: + int_scalar_t + uint_scalar_t # ---------------------------------------------------------------------- # IntervalTree @@ -114,43 +119,6 @@ cdef class IntervalTree(IntervalMixin): sort_order = np.lexsort(values) return is_monotonic(sort_order, False)[0] - def get_loc(self, scalar_t key): - """Return all positions corresponding to intervals that overlap with - the given scalar key - """ - result = Int64Vector() - self.root.query(result, key) - if not result.data.n: - raise KeyError(key) - return result.to_array().astype('intp') - - def _get_partial_overlap(self, key_left, key_right, side): - """Return all positions corresponding to intervals with the given side - falling between the left and right bounds of an interval query - """ - if side == 'left': - 
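The Interval hunks above mostly tighten messages and docstrings; the overlap rule itself is unchanged: two intervals overlap when each starts before the other ends, and a shared endpoint counts only when both touching sides are closed. A standalone sketch over plain (left, right, closed) triples, which are stand-ins for real Interval objects:

from operator import le, lt

def overlaps(a, b) -> bool:
    # closed is one of "left", "right", "both", "neither"
    def closed_left(iv):
        return iv[2] in ("left", "both")

    def closed_right(iv):
        return iv[2] in ("right", "both")

    op1 = le if (closed_left(a) and closed_right(b)) else lt
    op2 = le if (closed_left(b) and closed_right(a)) else lt
    return op1(a[0], b[1]) and op2(b[0], a[1])

overlaps((0, 2, "right"), (1, 3, "right"))  # True, the interiors intersect
overlaps((0, 1, "right"), (1, 2, "right"))  # False, (0, 1] and (1, 2] merely touch
overlaps((0, 1, "both"), (1, 2, "both"))    # True, both are closed at the shared point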
values = self.left - sorter = self.left_sorter - else: - values = self.right - sorter = self.right_sorter - key = [key_left, key_right] - i, j = values.searchsorted(key, sorter=sorter) - return sorter[i:j] - - def get_loc_interval(self, key_left, key_right): - """Lookup the intervals enclosed in the given interval bounds - - The given interval is presumed to have closed bounds. - """ - import pandas as pd - left_overlap = self._get_partial_overlap(key_left, key_right, 'left') - right_overlap = self._get_partial_overlap(key_left, key_right, 'right') - enclosing = self.get_loc(0.5 * (key_left + key_right)) - combined = np.concatenate([left_overlap, right_overlap, enclosing]) - uniques = pd.unique(combined) - return uniques.astype('intp') - def get_indexer(self, scalar_t[:] target): """Return the positions corresponding to unique intervals that overlap with the given array of scalar targets. @@ -165,7 +133,12 @@ cdef class IntervalTree(IntervalMixin): result = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + if result.data.n == old_len: result.append(-1) elif result.data.n > old_len + 1: @@ -187,7 +160,12 @@ cdef class IntervalTree(IntervalMixin): missing = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + if result.data.n == old_len: result.append(-1) missing.append(i) @@ -195,7 +173,7 @@ cdef class IntervalTree(IntervalMixin): return (result.to_array().astype('intp'), missing.to_array().astype('intp')) - def __repr__(self): + def __repr__(self) -> str: return (''.format( dtype=self.dtype, closed=self.closed, @@ -231,7 +209,7 @@ cdef sort_values_and_indices(all_values, all_indices, subset): {{py: nodes = [] -for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']: +for dtype in ['float64', 'int64', 'uint64']: for closed, cmp_left, cmp_right in [ ('left', '<=', '<'), ('right', '<', '<='), @@ -239,19 +217,26 @@ for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']: ('neither', '<', '<')]: cmp_left_converse = '<' if cmp_left == '<=' else '<=' cmp_right_converse = '<' if cmp_right == '<=' else '<=' + if dtype.startswith('int'): + fused_prefix = 'int_' + elif dtype.startswith('uint'): + fused_prefix = 'uint_' + elif dtype.startswith('float'): + fused_prefix = '' nodes.append((dtype, dtype.title(), closed, closed.title(), cmp_left, cmp_right, cmp_left_converse, - cmp_right_converse)) + cmp_right_converse, + fused_prefix)) }} NODE_CLASSES = {} {{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right, - cmp_left_converse, cmp_right_converse in nodes}} + cmp_left_converse, cmp_right_converse, fused_prefix in nodes}} cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: """Non-terminal node for an IntervalTree @@ -354,7 +339,7 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: @cython.wraparound(False) @cython.boundscheck(False) @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar_t point): + cpdef query(self, Int64Vector result, {{fused_prefix}}scalar_t point): """Recursively query this node and its sub-nodes for intervals that overlap with the query point. 
""" @@ -394,7 +379,7 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: else: result.extend(self.center_left_indices) - def __repr__(self): + def __repr__(self) -> str: if self.is_leaf_node: return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: ' '%s elements (terminal)>' % self.n_elements) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 11c56f784d378..093c53790cd35 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -29,13 +29,14 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) - # First pass, determine size of result set, do not use the NA group - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] + with nogil: + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] - if rc > 0 and lc > 0: - count += lc * rc + if rc > 0 and lc > 0: + count += lc * rc # exclude the NA group left_pos = left_count[0] @@ -44,19 +45,20 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, left_indexer = np.empty(count, dtype=np.int64) right_indexer = np.empty(count, dtype=np.int64) - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] - - if rc > 0 and lc > 0: - for j in range(lc): - offset = position + j * rc - for k in range(rc): - left_indexer[offset + k] = left_pos + j - right_indexer[offset + k] = right_pos + k - position += lc * rc - left_pos += lc - right_pos += rc + with nogil: + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc > 0 and lc > 0: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc return (_get_result_indexer(left_sorter, left_indexer), _get_result_indexer(right_sorter, right_indexer)) @@ -79,12 +81,13 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) - # First pass, determine size of result set, do not use the NA group - for i in range(1, max_groups + 1): - if right_count[i] > 0: - count += left_count[i] * right_count[i] - else: - count += left_count[i] + with nogil: + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + if right_count[i] > 0: + count += left_count[i] * right_count[i] + else: + count += left_count[i] # exclude the NA group left_pos = left_count[0] @@ -93,24 +96,25 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, left_indexer = np.empty(count, dtype=np.int64) right_indexer = np.empty(count, dtype=np.int64) - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] + with nogil: + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] - if rc == 0: - for j in range(lc): - left_indexer[position + j] = left_pos + j - right_indexer[position + j] = -1 - position += lc - else: - for j in range(lc): - offset = position + j * rc - for k in range(rc): - left_indexer[offset + k] = left_pos + j - right_indexer[offset + k] = right_pos + k - position += lc * rc - left_pos += lc - right_pos += rc + if rc == 0: + for j in range(lc): + left_indexer[position + j] = left_pos + j + 
right_indexer[position + j] = -1 + position += lc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc left_indexer = _get_result_indexer(left_sorter, left_indexer) right_indexer = _get_result_indexer(right_sorter, right_indexer) @@ -149,15 +153,16 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) - # First pass, determine size of result set, do not use the NA group - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] + with nogil: + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] - if rc > 0 and lc > 0: - count += lc * rc - else: - count += lc + rc + if rc > 0 and lc > 0: + count += lc * rc + else: + count += lc + rc # exclude the NA group left_pos = left_count[0] @@ -166,29 +171,30 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, left_indexer = np.empty(count, dtype=np.int64) right_indexer = np.empty(count, dtype=np.int64) - for i in range(1, max_groups + 1): - lc = left_count[i] - rc = right_count[i] - - if rc == 0: - for j in range(lc): - left_indexer[position + j] = left_pos + j - right_indexer[position + j] = -1 - position += lc - elif lc == 0: - for j in range(rc): - left_indexer[position + j] = -1 - right_indexer[position + j] = right_pos + j - position += rc - else: - for j in range(lc): - offset = position + j * rc - for k in range(rc): - left_indexer[offset + k] = left_pos + j - right_indexer[offset + k] = right_pos + k - position += lc * rc - left_pos += lc - right_pos += rc + with nogil: + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc == 0: + for j in range(lc): + left_indexer[position + j] = left_pos + j + right_indexer[position + j] = -1 + position += lc + elif lc == 0: + for j in range(rc): + left_indexer[position + j] = -1 + right_indexer[position + j] = right_pos + j + position += rc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc return (_get_result_indexer(left_sorter, left_indexer), _get_result_indexer(right_sorter, right_indexer)) @@ -294,7 +300,7 @@ def left_join_indexer_unique(join_t[:] left, join_t[:] right): @cython.boundscheck(False) def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges + Two-pass algorithm for monotonic indexes. Handles many-to-one merges. """ cdef: Py_ssize_t i, j, k, nright, nleft, count @@ -397,7 +403,7 @@ def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @cython.boundscheck(False) def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges + Two-pass algorithm for monotonic indexes. Handles many-to-one merges. 
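For orientation, a hedged sketch of how the low-level join routines above are driven (internal API, illustrative only, not part of the patch): the inputs are factorized group labels plus the number of groups, and the outputs are positional indexers pairing matching rows.

>>> import numpy as np
>>> from pandas._libs.join import inner_join
>>> left = np.array([0, 0, 1], dtype=np.int64)   # group labels for the left keys
>>> right = np.array([0, 1, 1], dtype=np.int64)  # group labels for the right keys
>>> left_idx, right_idx = inner_join(left, right, 2)
>>> # row left_idx[i] of the left side matches row right_idx[i] of the right side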
""" cdef: Py_ssize_t i, j, k, nright, nleft, count diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index eddc0beae7b8b..719db5c03f07f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -4,7 +4,6 @@ from fractions import Fraction from numbers import Number import sys -import warnings import cython from cython import Py_ssize_t @@ -20,7 +19,7 @@ PyDateTime_IMPORT import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, PyArray_GETITEM, +from numpy cimport (ndarray, PyArray_Check, PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, flatiter, NPY_OBJECT, int64_t, float32_t, float64_t, @@ -58,7 +57,7 @@ from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare from pandas._libs.missing cimport ( - checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period + checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period, C_NA ) @@ -73,7 +72,9 @@ cdef: def values_from_object(obj: object): - """ return my values or the object if we are say an ndarray """ + """ + Return my values or the object if we are say an ndarray. + """ func: object if getattr(obj, '_typ', '') == 'dataframe': @@ -89,8 +90,11 @@ def values_from_object(obj: object): @cython.wraparound(False) @cython.boundscheck(False) def memory_usage_of_objects(arr: object[:]) -> int64_t: - """ return the memory usage of an object array in bytes, - does not include the actual bytes of the pointers """ + """ + Return the memory usage of an object array in bytes. + + Does not include the actual bytes of the pointers + """ i: Py_ssize_t n: Py_ssize_t size: int64_t @@ -107,8 +111,6 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t: def is_scalar(val: object) -> bool: """ - Return True if given value is scalar. - Parameters ---------- val : object @@ -125,12 +127,12 @@ def is_scalar(val: object) -> bool: - Interval - DateOffset - Fraction - - Number + - Number. Returns ------- bool - Return True if given object is scalar, False otherwise + Return True if given object is scalar. Examples -------- @@ -161,6 +163,7 @@ def is_scalar(val: object) -> bool: or PyTime_Check(val) # We differ from numpy, which claims that None is not scalar; # see np.isscalar + or val is C_NA or val is None or isinstance(val, (Fraction, Number)) or util.is_period_object(val) @@ -179,7 +182,7 @@ def item_from_zerodim(val: object) -> object: Returns ------- - result : object + object Examples -------- @@ -191,7 +194,6 @@ def item_from_zerodim(val: object) -> object: 1 >>> item_from_zerodim(np.array([1])) array([1]) - """ if cnp.PyArray_IsZeroDim(val): return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val) @@ -207,13 +209,13 @@ def fast_unique_multiple(list arrays, sort: bool=True): Parameters ---------- list : array-like - A list of array-like objects - sort : boolean - Whether or not to sort the resulting unique list + List of array-like objects. + sort : bool + Whether or not to sort the resulting unique list. Returns ------- - unique_list : list of unique values + list of unique values """ cdef: ndarray[object] buf @@ -278,13 +280,13 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): Parameters ---------- gen : generator object - A generator of lists from which the unique list is created - sort : boolean - Whether or not to sort the resulting unique list + Generator of lists from which the unique list is created. + sort : bool + Whether or not to sort the resulting unique list. 
Returns ------- - unique_list : list of unique values + list of unique values """ cdef: list buf @@ -337,7 +339,7 @@ def dicts_to_array(dicts: list, columns: list): def fast_zip(list ndarrays): """ - For zipping multiple ndarrays into an ndarray of tuples + For zipping multiple ndarrays into an ndarray of tuples. """ cdef: Py_ssize_t i, j, k, n @@ -366,7 +368,7 @@ def fast_zip(list ndarrays): arr = ndarrays[j] it = PyArray_IterNew(arr) if len(arr) != n: - raise ValueError('all arrays must be same length') + raise ValueError("all arrays must be same length") for i in range(n): val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) @@ -386,9 +388,7 @@ def get_reverse_indexer(const int64_t[:] indexer, Py_ssize_t length): indexer_inv[indexer[x]] = x .. note:: If indexer is not unique, only first occurrence is accounted. - """ - cdef: Py_ssize_t i, n = len(indexer) ndarray[int64_t] rev_indexer @@ -509,8 +509,10 @@ def maybe_booleans_to_slice(ndarray[uint8_t] mask): @cython.wraparound(False) @cython.boundscheck(False) def array_equivalent_object(left: object[:], right: object[:]) -> bool: - """ perform an element by element comparion on 1-d object arrays - taking into account nan positions """ + """ + Perform an element by element comparison on 1-d object arrays + taking into account nan positions. + """ cdef: Py_ssize_t i, n = left.shape[0] object x, y @@ -522,8 +524,11 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: # we are either not equal or both nan # I think None == None will be true here try: - if not (PyObject_RichCompareBool(x, y, Py_EQ) or - (x is None or is_nan(x)) and (y is None or is_nan(y))): + if PyArray_Check(x) and PyArray_Check(y): + if not array_equivalent_object(x, y): + return False + elif not (PyObject_RichCompareBool(x, y, Py_EQ) or + (x is None or is_nan(x)) and (y is None or is_nan(y))): return False except TypeError as err: # Avoid raising TypeError on tzawareness mismatch @@ -573,7 +578,7 @@ def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: Returns ------- - casted_arr : ndarray + ndarray A new array with the input array's elements casted. """ cdef: @@ -596,7 +601,7 @@ def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: @cython.boundscheck(False) def clean_index_list(obj: list): """ - Utility used in pandas.core.index.ensure_index + Utility used in ``pandas.core.indexes.api.ensure_index``. """ cdef: Py_ssize_t i, n = len(obj) @@ -615,7 +620,7 @@ def clean_index_list(obj: list): # don't force numpy coerce with nan's inferred = infer_dtype(obj, skipna=False) - if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']: + if inferred in ['string', 'bytes', 'mixed', 'mixed-integer']: return np.asarray(obj, dtype=object), 0 elif inferred in ['integer']: # TODO: we infer an integer but it *could* be a uint64 @@ -638,7 +643,7 @@ def clean_index_list(obj: list): def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner, object closed='left', bint hasnans=0): """ - Int64 (datetime64) version of generic python version in groupby.py + Int64 (datetime64) version of generic python version in ``groupby.py``. 
""" cdef: Py_ssize_t lenidx, lenbin, i, j, bc, vc @@ -698,10 +703,9 @@ def generate_bins_dt64(ndarray[int64_t] values, const int64_t[:] binner, @cython.boundscheck(False) @cython.wraparound(False) -def get_level_sorter(const int64_t[:] label, - const int64_t[:] starts): +def get_level_sorter(const int64_t[:] label, const int64_t[:] starts): """ - argsort for a single level of a multi-index, keeping the order of higher + Argsort for a single level of a multi-index, keeping the order of higher levels unchanged. `starts` points to starts of same-key indices w.r.t to leading levels; equivalent to: np.hstack([label[starts[i]:starts[i+1]].argsort(kind='mergesort') @@ -829,18 +833,38 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, # core.common import for fast inference checks def is_float(obj: object) -> bool: + """ + Returns + ------- + bool + """ return util.is_float_object(obj) def is_integer(obj: object) -> bool: + """ + Returns + ------- + bool + """ return util.is_integer_object(obj) def is_bool(obj: object) -> bool: + """ + Returns + ------- + bool + """ return util.is_bool_object(obj) def is_complex(obj: object) -> bool: + """ + Returns + ------- + bool + """ return util.is_complex_object(obj) @@ -853,11 +877,17 @@ cpdef bint is_interval(object obj): def is_period(val: object) -> bool: - """ Return a boolean if this is a Period object """ + """ + Return a boolean if this is a Period object. + + Returns + ------- + bool + """ return util.is_period_object(val) -def is_list_like(obj: object, allow_sets: bool = True): +def is_list_like(obj: object, allow_sets: bool = True) -> bool: """ Check if the object is list-like. @@ -868,15 +898,16 @@ def is_list_like(obj: object, allow_sets: bool = True): Parameters ---------- - obj : The object to check - allow_sets : boolean, default True - If this parameter is False, sets will not be considered list-like + obj : object + Object to check. + allow_sets : bool, default True + If this parameter is False, sets will not be considered list-like. .. versionadded:: 0.24.0 Returns ------- - is_list_like : bool + bool Whether `obj` has list-like properties. Examples @@ -971,6 +1002,7 @@ cdef class Seen: bint nat_ # seen nat bint bool_ # seen_bool bint null_ # seen_null + bint nan_ # seen_np.nan bint uint_ # seen_uint (unsigned integer) bint sint_ # seen_sint (signed integer) bint float_ # seen_float @@ -995,6 +1027,7 @@ cdef class Seen: self.nat_ = 0 self.bool_ = 0 self.null_ = 0 + self.nan_ = 0 self.uint_ = 0 self.sint_ = 0 self.float_ = 0 @@ -1025,15 +1058,16 @@ cdef class Seen: Returns ------- - return_values : bool + bool Whether or not we should return the original input array to avoid data truncation. Raises ------ - ValueError : uint64 elements were detected, and at least one of the - two conflict cases was also detected. However, we are - trying to force conversion to a numeric dtype. + ValueError + uint64 elements were detected, and at least one of the + two conflict cases was also detected. However, we are + trying to force conversion to a numeric dtype. """ return (self.uint_ and (self.null_ or self.sint_) and not self.coerce_numeric) @@ -1082,7 +1116,9 @@ cdef class Seen: cdef _try_infer_map(v): - """ if its in our map, just return the dtype """ + """ + If its in our map, just return the dtype. 
+ """ cdef: object attr, val for attr in ['name', 'kind', 'base']: @@ -1092,7 +1128,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(value: object, skipna: object=None) -> str: +def infer_dtype(value: object, skipna: bool = True) -> str: """ Efficiently infer the type of a passed val, or list-like array of values. Return a string describing the type. @@ -1100,18 +1136,18 @@ def infer_dtype(value: object, skipna: object=None) -> str: Parameters ---------- value : scalar, list, ndarray, or pandas type - skipna : bool, default False + skipna : bool, default True Ignore NaN values when inferring the type. .. versionadded:: 0.21.0 Returns ------- - string describing the common type of the input data. + str + Describing the common type of the input data. Results can include: - string - - unicode - bytes - floating - integer @@ -1132,7 +1168,8 @@ def infer_dtype(value: object, skipna: object=None) -> str: Raises ------ - TypeError if ndarray-like but cannot infer the dtype + TypeError + If ndarray-like but cannot infer the dtype Notes ----- @@ -1198,12 +1235,6 @@ def infer_dtype(value: object, skipna: object=None) -> str: bint seen_pdnat = False bint seen_val = False - if skipna is None: - msg = ('A future version of pandas will default to `skipna=True`. To ' - 'silence this warning, pass `skipna=True|False` explicitly.') - warnings.warn(msg, FutureWarning, stacklevel=2) - skipna = False - if util.is_array(value): values = value elif hasattr(value, 'dtype'): @@ -1219,8 +1250,7 @@ def infer_dtype(value: object, skipna: object=None) -> str: return value # its ndarray like but we can't handle - raise ValueError("cannot infer type for {typ}" - .format(typ=type(value))) + raise ValueError(f"cannot infer type for {type(value)}") else: if not isinstance(value, list): @@ -1232,9 +1262,6 @@ def infer_dtype(value: object, skipna: object=None) -> str: # make contiguous values = values.ravel() - if skipna: - values = values[~isnaobj(values)] - val = _try_infer_map(values) if val is not None: return val @@ -1242,6 +1269,9 @@ def infer_dtype(value: object, skipna: object=None) -> str: if values.dtype != np.object_: values = values.astype('O') + if skipna: + values = values[~isnaobj(values)] + n = len(values) if n == 0: return 'empty' @@ -1262,90 +1292,90 @@ def infer_dtype(value: object, skipna: object=None) -> str: # if all values are nan/NaT if seen_val is False and seen_pdnat is True: - return 'datetime' + return "datetime" # float/object nan is handled in latter logic if util.is_datetime64_object(val): if is_datetime64_array(values): - return 'datetime64' + return "datetime64" elif is_timedelta(val): if is_timedelta_or_timedelta64_array(values): - return 'timedelta' + return "timedelta" elif util.is_integer_object(val): # ordering matters here; this check must come after the is_timedelta # check otherwise numpy timedelta64 objects would come through here if is_integer_array(values): - return 'integer' + return "integer" elif is_integer_float_array(values): if is_integer_na_array(values): - return 'integer-na' + return "integer-na" else: - return 'mixed-integer-float' - return 'mixed-integer' + return "mixed-integer-float" + return "mixed-integer" elif PyDateTime_Check(val): if is_datetime_array(values): - return 'datetime' + return "datetime" elif PyDate_Check(val): if is_date_array(values, skipna=skipna): - return 'date' + return "date" elif PyTime_Check(val): if is_time_array(values, skipna=skipna): - return 'time' + return "time" elif is_decimal(val): - return 'decimal' + return "decimal" elif 
is_complex(val): - return 'complex' + return "complex" elif util.is_float_object(val): if is_float_array(values): - return 'floating' + return "floating" elif is_integer_float_array(values): if is_integer_na_array(values): - return 'integer-na' + return "integer-na" else: - return 'mixed-integer-float' + return "mixed-integer-float" elif util.is_bool_object(val): if is_bool_array(values, skipna=skipna): - return 'boolean' + return "boolean" elif isinstance(val, str): if is_string_array(values, skipna=skipna): - return 'string' + return "string" elif isinstance(val, bytes): if is_bytes_array(values, skipna=skipna): - return 'bytes' + return "bytes" elif util.is_period_object(val): if is_period_array(values): - return 'period' + return "period" elif is_interval(val): if is_interval_array(values): - return 'interval' + return "interval" for i in range(n): val = values[i] if (util.is_integer_object(val) and not util.is_timedelta64_object(val) and not util.is_datetime64_object(val)): - return 'mixed-integer' + return "mixed-integer" - return 'mixed' + return "mixed" def infer_datetimelike_array(arr: object) -> object: """ - infer if we have a datetime or timedelta array + Infer if we have a datetime or timedelta array. - date: we have *only* date and maybe strings, nulls - datetime: we have *only* datetimes and maybe strings, nulls - timedelta: we have *only* timedeltas and maybe strings, nulls @@ -1360,9 +1390,8 @@ def infer_datetimelike_array(arr: object) -> object: Returns ------- - string: {datetime, timedelta, date, nat, mixed} + str: {datetime, timedelta, date, nat, mixed} """ - cdef: Py_ssize_t i, n = len(arr) bint seen_timedelta = 0, seen_date = 0, seen_datetime = 0 @@ -1405,16 +1434,16 @@ def infer_datetimelike_array(arr: object) -> object: # timedelta, or timedelta64 seen_timedelta = 1 else: - return 'mixed' + return "mixed" if seen_date and not (seen_datetime or seen_timedelta): - return 'date' + return "date" elif seen_datetime and not seen_timedelta: - return 'datetime' + return "datetime" elif seen_timedelta and not seen_datetime: - return 'timedelta' + return "timedelta" elif seen_nat: - return 'nat' + return "nat" # short-circuit by trying to # actually convert these strings @@ -1422,8 +1451,8 @@ def infer_datetimelike_array(arr: object) -> object: # convert *every* string array if len(objs): try: - array_to_datetime(objs, errors='raise') - return 'datetime' + array_to_datetime(objs, errors="raise") + return "datetime" except (ValueError, TypeError): pass @@ -1497,12 +1526,11 @@ cdef class Validator: return self.is_valid(value) or self.is_valid_null(value) cdef bint is_value_typed(self, object value) except -1: - raise NotImplementedError( - '{typ} child class must define is_value_typed' - .format(typ=type(self).__name__)) + raise NotImplementedError(f"{type(self).__name__} child class " + "must define is_value_typed") cdef bint is_valid_null(self, object value) except -1: - return value is None or util.is_nan(value) + return value is None or value is C_NA or util.is_nan(value) cdef bint is_array_typed(self) except -1: return False @@ -1635,9 +1663,8 @@ cdef class TemporalValidator(Validator): return self.is_value_typed(value) or self.is_valid_null(value) cdef bint is_valid_null(self, object value) except -1: - raise NotImplementedError( - '{typ} child class must define is_valid_null' - .format(typ=type(self).__name__)) + raise NotImplementedError(f"{type(self).__name__} child class " + "must define is_valid_null") cdef inline bint is_valid_skipna(self, object value) except -1: 
cdef: @@ -1677,6 +1704,7 @@ cpdef bint is_datetime64_array(ndarray values): return validator.validate(values) +# TODO: only non-here use is in test def is_datetime_with_singletz_array(values: ndarray) -> bool: """ Check values have the same tzinfo attribute. @@ -1720,8 +1748,11 @@ cdef class AnyTimedeltaValidator(TimedeltaValidator): return is_timedelta(value) +# TODO: only non-here use is in test cpdef bint is_timedelta_or_timedelta64_array(ndarray values): - """ infer with timedeltas and/or nat/none """ + """ + Infer with timedeltas and/or nat/none. + """ cdef: AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values), skipna=True) @@ -1806,9 +1837,8 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, Returns ------- - numeric_array : array of converted object values to numerical ones + Array of converted object values to numerical ones. """ - if len(values) == 0: return np.array([], dtype='i8') @@ -1878,7 +1908,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, seen.saw_null() floats[i] = complexes[i] = NaN else: - raise ValueError('Empty string encountered') + raise ValueError("Empty string encountered") elif util.is_complex_object(val): complexes[i] = val seen.complex_ = True @@ -1922,10 +1952,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, seen.float_ = seen.float_ or (seen.uint_ and seen.sint_) else: seen.float_ = True - except (TypeError, ValueError) as e: + except (TypeError, ValueError) as err: if not seen.coerce_numeric: - raise type(e)(str(e) + " at position {pos}".format(pos=i)) - elif "uint64" in str(e): # Exception from check functions. + raise type(err)(f"{err} at position {i}") + elif "uint64" in str(err): # Exception from check functions. raise seen.saw_null() @@ -1954,9 +1984,34 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, @cython.wraparound(False) def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint safe=0, bint convert_datetime=0, - bint convert_timedelta=0): + bint convert_timedelta=0, + bint convert_to_nullable_integer=0): """ Type inference function-- convert object array to proper dtype + + Parameters + ---------- + values : ndarray + Array of object elements to convert. + try_float : bool, default False + If an array-like object contains only float or NaN values is + encountered, whether to convert and return an array of float dtype. + safe : bool, default False + Whether to upcast numeric type (e.g. int cast to float). If set to + True, no upcasting will be performed. + convert_datetime : bool, default False + If an array-like object contains only datetime values or NaT is + encountered, whether to convert and return an array of M8[ns] dtype. + convert_timedelta : bool, default False + If an array-like object contains only timedelta values or NaT is + encountered, whether to convert and return an array of m8[ns] dtype. + convert_to_nullable_integer : bool, default False + If an array-like object contains only interger values (and NaN) is + encountered, whether to convert and return an IntegerArray. + + Returns + ------- + Array of converted object values to more specific dtypes if applicable. 
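The inference changes above can be exercised from Python; a hedged sketch assuming the pandas 1.0 behaviour (the results shown are illustrative, not part of the patch):

>>> import numpy as np
>>> from pandas.api.types import infer_dtype
>>> infer_dtype(['a', np.nan])             # skipna now defaults to True
'string'
>>> infer_dtype(['a', np.nan], skipna=False)
'mixed'
>>> from pandas._libs import lib
>>> out = lib.maybe_convert_objects(np.array([1, 2, None], dtype=object),
...                                 convert_to_nullable_integer=1)
>>> # out is an IntegerArray [1, 2, <NA>] with dtype Int64 instead of a float64 ndarray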
""" cdef: Py_ssize_t i, n @@ -1978,6 +2033,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, ints = np.empty(n, dtype='i8') uints = np.empty(n, dtype='u8') bools = np.empty(n, dtype=np.uint8) + mask = np.full(n, False) if convert_datetime: datetimes = np.empty(n, dtype='M8[ns]') @@ -1995,6 +2051,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if val is None: seen.null_ = 1 floats[i] = complexes[i] = fnan + mask[i] = True elif val is NaT: seen.nat_ = 1 if convert_datetime: @@ -2004,6 +2061,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not (convert_datetime or convert_timedelta): seen.object_ = 1 break + elif val is np.nan: + seen.nan_ = 1 + mask[i] = True + floats[i] = complexes[i] = val elif util.is_bool_object(val): seen.bool_ = 1 bools[i] = val @@ -2085,11 +2146,19 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not seen.object_: if not safe: - if seen.null_: + if seen.null_ or seen.nan_: if seen.is_float_or_complex: if seen.complex_: return complexes - elif seen.float_ or seen.int_: + elif seen.float_: + return floats + elif seen.int_: + if convert_to_nullable_integer: + from pandas.core.arrays import IntegerArray + return IntegerArray(ints, mask) + else: + return floats + elif seen.nan_: return floats else: if not seen.bool_: @@ -2128,7 +2197,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if seen.complex_: if not seen.int_: return complexes - elif seen.float_: + elif seen.float_ or seen.nan_: if not seen.int_: return floats else: @@ -2152,7 +2221,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if seen.complex_: if not seen.int_: return complexes - elif seen.float_: + elif seen.float_ or seen.nan_: if not seen.int_: return floats elif seen.int_: @@ -2166,31 +2235,48 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects +# Note: no_default is exported to the public API in pandas.api.extensions +no_default = object() #: Sentinel indicating the default value. + + @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1): +def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, + object na_value=no_default, object dtype=object): """ - Substitute for np.vectorize with pandas-friendly dtype inference + Substitute for np.vectorize with pandas-friendly dtype inference. Parameters ---------- arr : ndarray f : function + mask : ndarray + uint8 dtype ndarray indicating values not to apply `f` to. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray + na_value : Any, optional + The result value to use for masked values. By default, the + input value is used + dtype : numpy.dtype + The numpy dtype to use for the result ndarray. 
Returns ------- - mapped : ndarray + ndarray """ cdef: Py_ssize_t i, n - ndarray[object] result + ndarray result object val n = len(arr) - result = np.empty(n, dtype=object) + result = np.empty(n, dtype=dtype) for i in range(n): if mask[i]: - val = arr[i] + if na_value is no_default: + val = arr[i] + else: + val = na_value else: val = f(arr[i]) @@ -2215,7 +2301,7 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1) @cython.wraparound(False) def map_infer(ndarray arr, object f, bint convert=1): """ - Substitute for np.vectorize with pandas-friendly dtype inference + Substitute for np.vectorize with pandas-friendly dtype inference. Parameters ---------- @@ -2224,7 +2310,7 @@ def map_infer(ndarray arr, object f, bint convert=1): Returns ------- - mapped : ndarray + ndarray """ cdef: Py_ssize_t i, n @@ -2260,16 +2346,16 @@ def to_object_array(rows: object, int min_width=0): Parameters ---------- rows : 2-d array (N, K) - A list of lists to be converted into an array + List of lists to be converted into an array. min_width : int - The minimum width of the object array. If a list + Minimum width of the object array. If a list in `rows` contains fewer than `width` elements, the remaining elements in the corresponding row will all be `NaN`. Returns ------- - obj_array : numpy array of the object dtype + numpy array of the object dtype. """ cdef: Py_ssize_t i, j, n, k, tmp @@ -2321,11 +2407,11 @@ def to_object_array_tuples(rows: object): Parameters ---------- rows : 2-d array (N, K) - A list of tuples to be converted into an array. + List of tuples to be converted into an array. Returns ------- - obj_array : numpy array of the object dtype + numpy array of the object dtype. """ cdef: Py_ssize_t i, j, n, k, tmp diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index d0dd306680ae8..d4303ac28b9a5 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -9,3 +9,8 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr) cdef bint is_null_datetime64(v) cdef bint is_null_timedelta64(v) cdef bint is_null_period(v) + +cdef class C_NAType: + pass + +cdef C_NAType C_NA diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 052b081988c9e..26653438356b1 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,6 +1,8 @@ import cython from cython import Py_ssize_t +import numbers + import numpy as np cimport numpy as cnp from numpy cimport ndarray, int64_t, uint8_t, float64_t @@ -12,6 +14,9 @@ from pandas._libs.tslibs.np_datetime cimport ( get_timedelta64_value, get_datetime64_value) from pandas._libs.tslibs.nattype cimport ( checknull_with_nat, c_NaT as NaT, is_null_datetimelike) +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op + +from pandas.compat import is_platform_32bit cdef: @@ -20,6 +25,8 @@ cdef: int64_t NPY_NAT = util.get_nat() + bint is_32bit = is_platform_32bit() + cpdef bint checknull(object val): """ @@ -44,7 +51,7 @@ cpdef bint checknull(object val): The difference between `checknull` and `checknull_old` is that `checknull` does *not* consider INF or NEGINF to be NA. 
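A quick illustrative doctest for the predicate above (not part of the patch, assuming pandas 1.0 where the new NA singleton is recognised alongside the existing null values):

>>> import numpy as np, pandas as pd
>>> from pandas._libs.missing import checknull
>>> checknull(pd.NA), checknull(np.nan), checknull(pd.NaT), checknull(None)
(True, True, True, True)
>>> checknull(np.inf)   # INF is not NA here, unlike checknull_old
False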
""" - return is_null_datetimelike(val, inat_is_null=False) + return val is C_NA or is_null_datetimelike(val, inat_is_null=False) cpdef bint checknull_old(object val): @@ -121,7 +128,7 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnaobj_old(ndarray arr): +def isnaobj_old(arr: ndarray) -> ndarray: """ Return boolean mask denoting which elements of a 1-D array are na-like, defined as being any of: @@ -156,7 +163,7 @@ def isnaobj_old(ndarray arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnaobj2d(ndarray arr): +def isnaobj2d(arr: ndarray) -> ndarray: """ Return boolean mask denoting which elements of a 2-D array are na-like, according to the criteria defined in `checknull`: @@ -198,7 +205,7 @@ def isnaobj2d(ndarray arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnaobj2d_old(ndarray arr): +def isnaobj2d_old(arr: ndarray) -> ndarray: """ Return boolean mask denoting which elements of a 2-D array are na-like, according to the criteria defined in `checknull_old`: @@ -278,3 +285,201 @@ cdef inline bint is_null_period(v): # determine if we have a null for a Period (or integer versions), # excluding np.datetime64('nat') and np.timedelta64('nat') return checknull_with_nat(v) + + +# ----------------------------------------------------------------------------- +# Implementation of NA singleton + + +def _create_binary_propagating_op(name, is_divmod=False): + + def method(self, other): + if (other is C_NA or isinstance(other, str) + or isinstance(other, (numbers.Number, np.bool_)) + or isinstance(other, np.ndarray) and not other.shape): + # Need the other.shape clause to handle NumPy scalars, + # since we do a setitem on `out` below, which + # won't work for NumPy scalars. + if is_divmod: + return NA, NA + else: + return NA + + elif isinstance(other, np.ndarray): + out = np.empty(other.shape, dtype=object) + out[:] = NA + + if is_divmod: + return out, out.copy() + else: + return out + + return NotImplemented + + method.__name__ = name + return method + + +def _create_unary_propagating_op(name): + def method(self): + return NA + + method.__name__ = name + return method + + +cdef class C_NAType: + pass + + +class NAType(C_NAType): + """ + NA ("not available") missing value indicator. + + .. warning:: + + Experimental: the behaviour of NA can still change without warning. + + .. versionadded:: 1.0.0 + + The NA singleton is a missing value indicator defined by pandas. It is + used in certain new extension dtypes (currently the "string" dtype). 
+ """ + + _instance = None + + def __new__(cls, *args, **kwargs): + if NAType._instance is None: + NAType._instance = C_NAType.__new__(cls, *args, **kwargs) + return NAType._instance + + def __repr__(self) -> str: + return "" + + def __bool__(self): + raise TypeError("boolean value of NA is ambiguous") + + def __hash__(self): + # GH 30013: Ensure hash is large enough to avoid hash collisions with integers + exponent = 31 if is_32bit else 61 + return 2 ** exponent - 1 + + # Binary arithmetic and comparison ops -> propagate + + __add__ = _create_binary_propagating_op("__add__") + __radd__ = _create_binary_propagating_op("__radd__") + __sub__ = _create_binary_propagating_op("__sub__") + __rsub__ = _create_binary_propagating_op("__rsub__") + __mul__ = _create_binary_propagating_op("__mul__") + __rmul__ = _create_binary_propagating_op("__rmul__") + __matmul__ = _create_binary_propagating_op("__matmul__") + __rmatmul__ = _create_binary_propagating_op("__rmatmul__") + __truediv__ = _create_binary_propagating_op("__truediv__") + __rtruediv__ = _create_binary_propagating_op("__rtruediv__") + __floordiv__ = _create_binary_propagating_op("__floordiv__") + __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") + __mod__ = _create_binary_propagating_op("__mod__") + __rmod__ = _create_binary_propagating_op("__rmod__") + __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True) + __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True) + # __lshift__ and __rshift__ are not implemented + + __eq__ = _create_binary_propagating_op("__eq__") + __ne__ = _create_binary_propagating_op("__ne__") + __le__ = _create_binary_propagating_op("__le__") + __lt__ = _create_binary_propagating_op("__lt__") + __gt__ = _create_binary_propagating_op("__gt__") + __ge__ = _create_binary_propagating_op("__ge__") + + # Unary ops + + __neg__ = _create_unary_propagating_op("__neg__") + __pos__ = _create_unary_propagating_op("__pos__") + __abs__ = _create_unary_propagating_op("__abs__") + __invert__ = _create_unary_propagating_op("__invert__") + + # pow has special + def __pow__(self, other): + if other is C_NA: + return NA + elif isinstance(other, (numbers.Number, np.bool_)): + if other == 0: + # returning positive is correct for +/- 0. 
+ return type(other)(1) + else: + return NA + elif isinstance(other, np.ndarray): + return np.where(other == 0, other.dtype.type(1), NA) + + return NotImplemented + + def __rpow__(self, other): + if other is C_NA: + return NA + elif isinstance(other, (numbers.Number, np.bool_)): + if other == 1 or other == -1: + return other + else: + return NA + elif isinstance(other, np.ndarray): + return np.where((other == 1) | (other == -1), other, NA) + + return NotImplemented + + # Logical ops using Kleene logic + + def __and__(self, other): + if other is False: + return False + elif other is True or other is C_NA: + return NA + else: + return NotImplemented + + __rand__ = __and__ + + def __or__(self, other): + if other is True: + return True + elif other is False or other is C_NA: + return NA + else: + return NotImplemented + + __ror__ = __or__ + + def __xor__(self, other): + if other is False or other is True or other is C_NA: + return NA + return NotImplemented + + __rxor__ = __xor__ + + __array_priority__ = 1000 + _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + types = self._HANDLED_TYPES + (NAType,) + for x in inputs: + if not isinstance(x, types): + return NotImplemented + + if method != "__call__": + raise ValueError(f"ufunc method '{method}' not supported for NA") + result = maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is NotImplemented: + # For a NumPy ufunc that's not a binop, like np.logaddexp + index = [i for i, x in enumerate(inputs) if x is NA][0] + result = np.broadcast_arrays(*inputs)[index] + if result.ndim == 0: + result = result.item() + if ufunc.nout > 1: + result = (NA,) * ufunc.nout + + return result + + +C_NA = NAType() # C-visible +NA = C_NA # Python-visible diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index bdafcd646a4c8..abe1484e3763d 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -123,8 +123,7 @@ def vec_compare(object[:] left, object[:] right, object op): int flag if n != len(right): - raise ValueError('Arrays were different lengths: {n} vs {nright}' - .format(n=n, nright=len(right))) + raise ValueError(f'Arrays were different lengths: {n} vs {len(right)}') if op is operator.lt: flag = Py_LT @@ -224,8 +223,7 @@ def vec_binop(object[:] left, object[:] right, object op): object[:] result if n != len(right): - raise ValueError('Arrays were different lengths: {n} vs {nright}' - .format(n=n, nright=len(right))) + raise ValueError(f'Arrays were different lengths: {n} vs {len(right)}') result = np.empty(n, dtype=object) diff --git a/pandas/_libs/ops_dispatch.pyx b/pandas/_libs/ops_dispatch.pyx new file mode 100644 index 0000000000000..f6ecef2038cf3 --- /dev/null +++ b/pandas/_libs/ops_dispatch.pyx @@ -0,0 +1,94 @@ +DISPATCHED_UFUNCS = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "eq", + "ne", + "lt", + "gt", + "le", + "ge", + "remainder", + "matmul", + "or", + "xor", + "and", +} +UFUNC_ALIASES = { + "subtract": "sub", + "multiply": "mul", + "floor_divide": "floordiv", + "true_divide": "truediv", + "power": "pow", + "remainder": "mod", + "divide": "div", + "equal": "eq", + "not_equal": "ne", + "less": "lt", + "less_equal": "le", + "greater": "gt", + "greater_equal": "ge", + "bitwise_or": "or", + "bitwise_and": "and", + "bitwise_xor": "xor", +} + +# For op(., Array) -> Array.__r{op}__ +REVERSED_NAMES = { + "lt": "__gt__", + "le": "__ge__", + "gt": "__lt__", + "ge": "__le__", + "eq": 
"__eq__", + "ne": "__ne__", +} + + +def maybe_dispatch_ufunc_to_dunder_op( + object self, object ufunc, str method, *inputs, **kwargs +): + """ + Dispatch a ufunc to the equivalent dunder method. + + Parameters + ---------- + self : ArrayLike + The array whose dunder method we dispatch to + ufunc : Callable + A NumPy ufunc + method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'} + inputs : ArrayLike + The input arrays. + kwargs : Any + The additional keyword arguments, e.g. ``out``. + + Returns + ------- + result : Any + The result of applying the ufunc + """ + # special has the ufuncs we dispatch to the dunder op on + + op_name = ufunc.__name__ + op_name = UFUNC_ALIASES.get(op_name, op_name) + + def not_implemented(*args, **kwargs): + return NotImplemented + + if (method == "__call__" + and op_name in DISPATCHED_UFUNCS + and kwargs.get("out") is None): + if isinstance(inputs[0], type(self)): + name = f"__{op_name}__" + return getattr(self, name, not_implemented)(inputs[1]) + else: + name = REVERSED_NAMES.get(op_name, f"__r{op_name}__") + result = getattr(self, name, not_implemented)(inputs[0]) + return result + else: + return NotImplemented diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3f12ec4c15fc7..377d49f2bbd29 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2,6 +2,7 @@ # See LICENSE for the license import bz2 import gzip +import io import os import sys import time @@ -63,11 +64,6 @@ from pandas.errors import (ParserError, DtypeWarning, lzma = _import_lzma() -# Import CParserError as alias of ParserError for backwards compatibility. -# Ultimately, we want to remove this import. See gh-12665 and gh-14479. -CParserError = ParserError - - cdef: float64_t INF = np.inf float64_t NEGINF = -INF @@ -176,12 +172,9 @@ cdef extern from "parser/tokenizer.h": int64_t skip_first_N_rows int64_t skipfooter # pick one, depending on whether the converter requires GIL - float64_t (*double_converter_nogil)(const char *, char **, - char, char, char, - int, int *, int *) nogil - float64_t (*double_converter_withgil)(const char *, char **, - char, char, char, - int, int *, int *) + float64_t (*double_converter)(const char *, char **, + char, char, char, + int, int *, int *) nogil # error handling char *warn_msg @@ -278,7 +271,7 @@ cdef class TextReader: object true_values, false_values object handle bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns - int64_t parser_start + uint64_t parser_start list clocks char *c_encoding kh_str_starts_t *false_set @@ -474,16 +467,11 @@ cdef class TextReader: if float_precision == "round_trip": # see gh-15140 - # - # Our current roundtrip implementation requires the GIL. 
- self.parser.double_converter_nogil = NULL - self.parser.double_converter_withgil = round_trip + self.parser.double_converter = round_trip elif float_precision == "high": - self.parser.double_converter_withgil = NULL - self.parser.double_converter_nogil = precise_xstrtod + self.parser.double_converter = precise_xstrtod else: - self.parser.double_converter_withgil = NULL - self.parser.double_converter_nogil = xstrtod + self.parser.double_converter = xstrtod if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) @@ -589,8 +577,7 @@ cdef class TextReader: if not isinstance(quote_char, (str, bytes)) and quote_char is not None: dtype = type(quote_char).__name__ - raise TypeError('"quotechar" must be string, ' - 'not {dtype}'.format(dtype=dtype)) + raise TypeError(f'"quotechar" must be string, not {dtype}') if quote_char is None or quote_char == '': if quoting != QUOTE_NONE: @@ -637,25 +624,24 @@ cdef class TextReader: source = zip_file.open(file_name) elif len(zip_names) == 0: - raise ValueError('Zero files found in compressed ' - 'zip file %s', source) + raise ValueError(f'Zero files found in compressed ' + f'zip file {source}') else: - raise ValueError('Multiple files found in compressed ' - 'zip file %s', str(zip_names)) + raise ValueError(f'Multiple files found in compressed ' + f'zip file {zip_names}') elif self.compression == 'xz': if isinstance(source, str): source = _get_lzma_file(lzma)(source, 'rb') else: source = _get_lzma_file(lzma)(filename=source) else: - raise ValueError('Unrecognized compression type: %s' % - self.compression) - - if b'utf-16' in (self.encoding or b''): - # we need to read utf-16 through UTF8Recoder. - # if source is utf-16, convert source to utf-8 by UTF8Recoder. - source = icom.UTF8Recoder(source, - self.encoding.decode('utf-8')) + raise ValueError(f'Unrecognized compression type: ' + f'{self.compression}') + + if self.encoding and isinstance(source, io.BufferedIOBase): + source = io.TextIOWrapper( + source, self.encoding.decode('utf-8'), newline='') + self.encoding = b'utf-8' self.c_encoding = self.encoding @@ -663,7 +649,7 @@ cdef class TextReader: if isinstance(source, str): encoding = sys.getfilesystemencoding() or "utf-8" - + usource = source source = source.encode(encoding) if self.memory_map: @@ -683,10 +669,11 @@ cdef class TextReader: if ptr == NULL: if not os.path.exists(source): + raise FileNotFoundError( ENOENT, - 'File {source} does not exist'.format(source=source), - source) + f'File {usource} does not exist', + usource) raise IOError('Initializing from file failed') self.parser.source = ptr @@ -703,18 +690,18 @@ cdef class TextReader: self.parser.cb_io = &buffer_rd_bytes self.parser.cb_cleanup = &del_rd_source else: - raise IOError('Expected file path name or file-like object,' - ' got %s type' % type(source)) + raise IOError(f'Expected file path name or file-like object, ' + f'got {type(source)} type') cdef _get_header(self): # header is now a list of lists, so field_count should use header[0] cdef: - Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa + Py_ssize_t i, start, field_count, passed_count, unnamed_count char *word object name, old_name int status - int64_t hr, data_line + uint64_t hr, data_line char *errors = "strict" StringPath path = _string_path(self.c_encoding) @@ -741,11 +728,11 @@ cdef class TextReader: self.parser.lines < hr): msg = self.orig_header if isinstance(msg, list): - msg = "[%s], len of %d," % ( - ','.join(str(m) for m in msg), len(msg)) + joined = ','.join(str(m) for m in msg) + msg = 
f"[{joined}], len of {len(msg)}," raise ParserError( - 'Passed header=%s but only %d lines in file' - % (msg, self.parser.lines)) + f'Passed header={msg} but only ' + f'{self.parser.lines} lines in file') else: field_count = self.parser.line_fields[hr] @@ -768,10 +755,9 @@ cdef class TextReader: if name == '': if self.has_mi_columns: - name = ('Unnamed: {i}_level_{lvl}' - .format(i=i, lvl=level)) + name = f'Unnamed: {i}_level_{level}' else: - name = 'Unnamed: {i}'.format(i=i) + name = f'Unnamed: {i}' unnamed_count += 1 count = counts.get(name, 0) @@ -779,7 +765,7 @@ cdef class TextReader: if not self.has_mi_columns and self.mangle_dupe_cols: while count > 0: counts[name] = count + 1 - name = '%s.%d' % (name, count) + name = f'{name}.{count}' count = counts.get(name, 0) if old_name == '': @@ -845,11 +831,6 @@ cdef class TextReader: passed_count = len(header[0]) - # if passed_count > field_count: - # raise ParserError('Column names have %d fields, ' - # 'data has %d fields' - # % (passed_count, field_count)) - if (self.has_usecols and self.allow_leading_cols and not callable(self.usecols)): nuse = len(self.usecols) @@ -990,7 +971,7 @@ cdef class TextReader: cdef _end_clock(self, what): if self.verbose: elapsed = time.time() - self.clocks.pop(-1) - print('%s took: %.2f ms' % (what, elapsed * 1000)) + print(f'{what} took: {elapsed * 1000:.2f} ms') def set_noconvert(self, i): self.noconvert.add(i) @@ -1015,22 +996,22 @@ cdef class TextReader: else: end = min(start + rows, self.parser.lines) + # FIXME: dont leave commented-out # # skip footer # if footer > 0: # end -= footer num_cols = -1 - for i in range(self.parser.lines): + # Py_ssize_t cast prevents build warning + for i in range(self.parser.lines): num_cols = (num_cols < self.parser.line_fields[i]) * \ self.parser.line_fields[i] + \ (num_cols >= self.parser.line_fields[i]) * num_cols if self.table_width - self.leading_cols > num_cols: - raise ParserError( - "Too many columns specified: expected {expected} and " - "found {found}" - .format(expected=self.table_width - self.leading_cols, - found=num_cols)) + raise ParserError(f"Too many columns specified: expected " + f"{self.table_width - self.leading_cols} " + f"and found {num_cols}") results = {} nused = 0 @@ -1073,9 +1054,9 @@ cdef class TextReader: if conv: if col_dtype is not None: - warnings.warn(("Both a converter and dtype were specified " - "for column {0} - only the converter will " - "be used").format(name), ParserWarning, + warnings.warn((f"Both a converter and dtype were specified " + f"for column {name} - only the converter will " + f"be used"), ParserWarning, stacklevel=5) results[i] = _apply_converter(conv, self.parser, i, start, end, self.c_encoding) @@ -1116,7 +1097,7 @@ cdef class TextReader: col_res = _maybe_upcast(col_res) if col_res is None: - raise ParserError('Unable to parse column {i}'.format(i=i)) + raise ParserError(f'Unable to parse column {i}') results[i] = col_res @@ -1176,12 +1157,9 @@ cdef class TextReader: col_res = col_res.astype(col_dtype) if (col_res != col_res_orig).any(): raise ValueError( - "cannot safely convert passed user dtype of " - "{col_dtype} for {col_res} dtyped data in " - "column {column}".format( - col_dtype=col_dtype, - col_res=col_res_orig.dtype.name, - column=i)) + f"cannot safely convert passed user dtype of " + f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in " + f"column {i}") return col_res, na_count @@ -1214,9 +1192,9 @@ cdef class TextReader: dtype=dtype) except NotImplementedError: raise NotImplementedError( - "Extension 
Array: {ea} must implement " - "_from_sequence_of_strings in order " - "to be used in parser methods".format(ea=array_type)) + f"Extension Array: {array_type} must implement " + f"_from_sequence_of_strings in order " + f"to be used in parser methods") return result, na_count @@ -1226,8 +1204,7 @@ cdef class TextReader: end, na_filter, na_hashset) if user_dtype and na_count is not None: if na_count > 0: - raise ValueError("Integer column has NA values in " - "column {column}".format(column=i)) + raise ValueError(f"Integer column has NA values in column {i}") except OverflowError: result = _try_uint64(self.parser, i, start, end, na_filter, na_hashset) @@ -1251,8 +1228,7 @@ cdef class TextReader: self.true_set, self.false_set) if user_dtype and na_count is not None: if na_count > 0: - raise ValueError("Bool column has NA values in " - "column {column}".format(column=i)) + raise ValueError(f"Bool column has NA values in column {i}") return result, na_count elif dtype.kind == 'S': @@ -1268,8 +1244,7 @@ cdef class TextReader: elif dtype.kind == 'U': width = dtype.itemsize if width > 0: - raise TypeError("the dtype {dtype} is not " - "supported for parsing".format(dtype=dtype)) + raise TypeError(f"the dtype {dtype} is not supported for parsing") # unicode variable width return self._string_convert(i, start, end, na_filter, @@ -1278,12 +1253,11 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) elif is_datetime64_dtype(dtype): - raise TypeError("the dtype {dtype} is not supported " - "for parsing, pass this column " - "using parse_dates instead".format(dtype=dtype)) + raise TypeError(f"the dtype {dtype} is not supported " + f"for parsing, pass this column " + f"using parse_dates instead") else: - raise TypeError("the dtype {dtype} is not " - "supported for parsing".format(dtype=dtype)) + raise TypeError(f"the dtype {dtype} is not supported for parsing") cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end, bint na_filter, kh_str_starts_t *na_hashset): @@ -1385,7 +1359,27 @@ def _ensure_encoded(list lst): # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = _ensure_encoded(list(icom._NA_VALUES)) +STR_NA_VALUES = { + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A N/A", + "#N/A", + "N/A", + "n/a", + "NA", + "", + "#NA", + "NULL", + "null", + "NaN", + "-NaN", + "nan", + "-nan", + "", +} +_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) def _maybe_upcast(arr): @@ -1420,59 +1414,6 @@ cdef inline StringPath _string_path(char *encoding): # Type conversions / inference support code -cdef _string_box_factorize(parser_t *parser, int64_t col, - int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset): - cdef: - int error, na_count = 0 - Py_ssize_t i, lines - coliter_t it - const char *word = NULL - ndarray[object] result - - int ret = 0 - kh_strbox_t *table - - object pyval - - object NA = na_values[np.object_] - khiter_t k - - table = kh_init_strbox() - lines = line_end - line_start - result = np.empty(lines, dtype=np.object_) - coliter_setup(&it, parser, col, line_start) - - for i in range(lines): - COLITER_NEXT(it, word) - - if na_filter: - if kh_get_str_starts_item(na_hashset, word): - # in the hash table - na_count += 1 - result[i] = NA - continue - - k = kh_get_strbox(table, word) - - # in the hash table - if k != table.n_buckets: - # this increments the refcount, but need to test - pyval = table.vals[k] - else: - # box it. new ref? 
- pyval = PyBytes_FromString(word) - - k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval - - result[i] = pyval - - kh_destroy_strbox(table) - - return result, na_count - - cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset): @@ -1660,7 +1601,7 @@ cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, char *data ndarray result - result = np.empty(line_end - line_start, dtype='|S%d' % width) + result = np.empty(line_end - line_start, dtype=f'|S{width}') data = result.data with nogil: @@ -1715,22 +1656,12 @@ cdef _try_double(parser_t *parser, int64_t col, result = np.empty(lines, dtype=np.float64) data = result.data na_fset = kset_float64_from_list(na_flist) - if parser.double_converter_nogil != NULL: # if it can run without the GIL - with nogil: - error = _try_double_nogil(parser, parser.double_converter_nogil, - col, line_start, line_end, - na_filter, na_hashset, use_na_flist, - na_fset, NA, data, &na_count) - else: - assert parser.double_converter_withgil != NULL - error = _try_double_nogil(parser, - parser.double_converter_withgil, + with nogil: + error = _try_double_nogil(parser, parser.double_converter, col, line_start, line_end, na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) + kh_destroy_float64(na_fset) if error != 0: return None, None @@ -2130,7 +2061,7 @@ cdef raise_parser_error(object base, parser_t *parser): Py_XDECREF(type) raise old_exc - message = '{base}. C error: '.format(base=base) + message = f'{base}. C error: ' if parser.error_msg != NULL: message += parser.error_msg.decode('utf-8') else: @@ -2174,8 +2105,8 @@ def _concatenate_chunks(list chunks): if warning_columns: warning_names = ','.join(warning_columns) warning_message = " ".join([ - "Columns (%s) have mixed types." % warning_names, - "Specify dtype option on import or set low_memory=False." + f"Columns ({warning_names}) have mixed types." + f"Specify dtype option on import or set low_memory=False." 
]) warnings.warn(warning_message, DtypeWarning, stacklevel=8) return result diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index f505c0479e944..8571761f77265 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,3 +1,4 @@ +from copy import copy from distutils.version import LooseVersion from cython import Py_ssize_t @@ -15,26 +16,16 @@ from numpy cimport (ndarray, cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.lib import maybe_convert_objects +from pandas._libs.lib import maybe_convert_objects, is_scalar -cdef _get_result_array(object obj, Py_ssize_t size, Py_ssize_t cnt): +cdef _check_result_array(object obj, Py_ssize_t cnt): if (util.is_array(obj) or (isinstance(obj, list) and len(obj) == cnt) or getattr(obj, 'shape', None) == (cnt,)): raise ValueError('Function does not reduce') - return np.empty(size, dtype='O') - - -cdef bint _is_sparse_array(object obj): - # TODO can be removed one SparseArray.values is removed (GH26421) - if hasattr(obj, '_subtyp'): - if obj._subtyp == 'sparse_array': - return True - return False - cdef class Reducer: """ @@ -43,11 +34,11 @@ cdef class Reducer: """ cdef: Py_ssize_t increment, chunksize, nresults - object arr, dummy, f, labels, typ, ityp, index + object dummy, f, labels, typ, ityp, index + ndarray arr - def __init__(self, object arr, object f, axis=1, dummy=None, - labels=None): - n, k = arr.shape + def __init__(self, ndarray arr, object f, axis=1, dummy=None, labels=None): + n, k = (arr).shape if axis == 0: if not arr.flags.f_contiguous: @@ -70,8 +61,9 @@ cdef class Reducer: self.dummy, self.typ, self.index, self.ityp = self._check_dummy( dummy=dummy) - def _check_dummy(self, dummy=None): - cdef object index=None, typ=None, ityp=None + cdef _check_dummy(self, dummy=None): + cdef: + object index = None, typ = None, ityp = None if dummy is None: dummy = np.empty(self.chunksize, dtype=self.arr.dtype) @@ -82,18 +74,15 @@ cdef class Reducer: else: - # we passed a series-like - if hasattr(dummy, 'values'): - - typ = type(dummy) - index = getattr(dummy, 'index', None) - dummy = dummy.values + # we passed a Series + typ = type(dummy) + index = dummy.index + dummy = dummy.values if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') if len(dummy) != self.chunksize: - raise ValueError('Dummy array must be length %d' % - self.chunksize) + raise ValueError(f'Dummy array must be length {self.chunksize}') return dummy, typ, index, ityp @@ -101,10 +90,9 @@ cdef class Reducer: cdef: char* dummy_buf ndarray arr, result, chunk - Py_ssize_t i, incr + Py_ssize_t i flatiter it - bint has_labels, has_ndarray_labels - object res, name, labels, index + object res, name, labels object cached_typ = None arr = self.arr @@ -112,44 +100,26 @@ cdef class Reducer: dummy_buf = chunk.data chunk.data = arr.data labels = self.labels - has_labels = labels is not None - has_ndarray_labels = util.is_array(labels) - has_index = self.index is not None - incr = self.increment + + result = np.empty(self.nresults, dtype='O') + it = PyArray_IterNew(result) try: for i in range(self.nresults): - if has_ndarray_labels: - name = labels[i] - elif has_labels: - # labels is an ExtensionArray - name = labels[i] - else: - name = None - # create the cached type # each time just reassign the data if i == 0: if self.typ is not None: - - # recreate with the index if supplied - if has_index: - - cached_typ = self.typ( - chunk, index=self.index, name=name) - - else: - - # use the passsed typ, sans index - 
cached_typ = self.typ(chunk, name=name) + # In this case, we also have self.index + name = labels[i] + cached_typ = self.typ(chunk, index=self.index, name=name) # use the cached_typ if possible if cached_typ is not None: - - if has_index: - object.__setattr__(cached_typ, 'index', self.index) + # In this case, we also have non-None labels + name = labels[i] object.__setattr__( cached_typ._data._block, 'values', chunk) @@ -158,46 +128,96 @@ cdef class Reducer: else: res = self.f(chunk) - if (not _is_sparse_array(res) and hasattr(res, 'values') - and util.is_array(res.values)): - res = res.values + # TODO: reason for not squeezing here? + res = _extract_result(res, squeeze=False) if i == 0: - result = _get_result_array(res, - self.nresults, - len(self.dummy)) - it = PyArray_IterNew(result) + # On the first pass, we check the output shape to see + # if this looks like a reduction. + _check_result_array(res, len(self.dummy)) PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) chunk.data = chunk.data + self.increment PyArray_ITER_NEXT(it) - except Exception as err: - if hasattr(err, 'args'): - err.args = err.args + (i,) - raise finally: # so we don't free the wrong memory chunk.data = dummy_buf - if result.dtype == np.object_: - result = maybe_convert_objects(result) - + result = maybe_convert_objects(result) return result -cdef class SeriesBinGrouper: +cdef class _BaseGrouper: + cdef _check_dummy(self, dummy): + # both values and index must be an ndarray! + + values = dummy.values + # GH 23683: datetimetz types are equivalent to datetime types here + if (dummy.dtype != self.arr.dtype + and values.dtype != self.arr.dtype): + raise ValueError('Dummy array must be same dtype') + if util.is_array(values) and not values.flags.contiguous: + # e.g. Categorical has no `flags` attribute + values = values.copy() + index = dummy.index.values + if not index.flags.contiguous: + index = index.copy() + + return values, index + + cdef inline _update_cached_objs(self, object cached_typ, object cached_ityp, + Slider islider, Slider vslider): + if cached_typ is None: + cached_ityp = self.ityp(islider.buf) + cached_typ = self.typ(vslider.buf, index=cached_ityp, name=self.name) + else: + # See the comment in indexes/base.py about _index_data. + # We need this for EA-backed indexes that have a reference + # to a 1-d ndarray like datetime / timedelta / period. + object.__setattr__(cached_ityp, '_index_data', islider.buf) + cached_ityp._engine.clear_mapping() + object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__(cached_typ, '_index', cached_ityp) + object.__setattr__(cached_typ, 'name', self.name) + + return cached_typ, cached_ityp + + cdef inline object _apply_to_group(self, + object cached_typ, object cached_ityp, + Slider islider, Slider vslider, + Py_ssize_t group_size, bint initialized): + """ + Call self.f on our new group, then update to the next group. + """ + cached_ityp._engine.clear_mapping() + res = self.f(cached_typ) + res = _extract_result(res) + if not initialized: + # On the first pass, we check the output shape to see + # if this looks like a reduction. 
+ initialized = 1 + _check_result_array(res, len(self.dummy_arr)) + + islider.advance(group_size) + vslider.advance(group_size) + + return res, initialized + + +cdef class SeriesBinGrouper(_BaseGrouper): """ Performs grouping operation according to bin edges, rather than labels """ cdef: Py_ssize_t nresults, ngroups - bint passed_dummy cdef public: - object arr, index, dummy_arr, dummy_index + ndarray arr, index, dummy_arr, dummy_index object values, f, bins, typ, ityp, name def __init__(self, object series, object f, object bins, object dummy): - n = len(series) + + assert dummy is not None # always obj[:0] + assert len(bins) > 0 # otherwise we get IndexError in get_result self.bins = bins self.f = f @@ -210,10 +230,9 @@ cdef class SeriesBinGrouper: self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series, 'name', None) + self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) - self.passed_dummy = dummy is not None # kludge for #1688 if len(bins) > 0 and bins[-1] == len(series): @@ -221,25 +240,6 @@ cdef class SeriesBinGrouper: else: self.ngroups = len(bins) + 1 - def _check_dummy(self, dummy=None): - # both values and index must be an ndarray! - - if dummy is None: - values = np.empty(0, dtype=self.arr.dtype) - index = None - else: - values = dummy.values - if values.dtype != self.arr.dtype: - raise ValueError('Dummy array must be same dtype') - if util.is_array(values) and not values.flags.contiguous: - # e.g. Categorical has no `flags` attribute - values = values.copy() - index = dummy.index.values - if not index.flags.contiguous: - index = index.copy() - - return values, index - def get_result(self): cdef: ndarray arr, result @@ -248,7 +248,7 @@ cdef class SeriesBinGrouper: object res bint initialized = 0 Slider vslider, islider - object name, cached_typ = None, cached_ityp = None + object cached_typ = None, cached_ityp = None counts = np.zeros(self.ngroups, dtype=np.int64) @@ -262,11 +262,12 @@ cdef class SeriesBinGrouper: group_size = 0 n = len(self.arr) - name = self.name vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) + result = np.empty(self.ngroups, dtype='O') + try: for i in range(self.ngroups): group_size = counts[i] @@ -274,61 +275,46 @@ cdef class SeriesBinGrouper: islider.set_length(group_size) vslider.set_length(group_size) - if cached_typ is None: - cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, - name=name) - else: - # See the comment in indexes/base.py about _index_data. - # We need this for EA-backed indexes that have a reference - # to a 1-d ndarray like datetime / timedelta / period. 
- object.__setattr__(cached_ityp, '_index_data', islider.buf) - cached_ityp._engine.clear_mapping() - object.__setattr__( - cached_typ._data._block, 'values', vslider.buf) - object.__setattr__(cached_typ, '_index', cached_ityp) - object.__setattr__(cached_typ, 'name', name) + cached_typ, cached_ityp = self._update_cached_objs( + cached_typ, cached_ityp, islider, vslider) - cached_ityp._engine.clear_mapping() - res = self.f(cached_typ) - res = _extract_result(res) - if not initialized: - initialized = 1 - result = _get_result_array(res, - self.ngroups, - len(self.dummy_arr)) - result[i] = res + res, initialized = self._apply_to_group(cached_typ, cached_ityp, + islider, vslider, + group_size, initialized) - islider.advance(group_size) - vslider.advance(group_size) + result[i] = res finally: # so we don't free the wrong memory islider.reset() vslider.reset() - if result.dtype == np.object_: - result = maybe_convert_objects(result) - + result = maybe_convert_objects(result) return result, counts -cdef class SeriesGrouper: +cdef class SeriesGrouper(_BaseGrouper): """ Performs generic grouping operation while avoiding ndarray construction overhead """ cdef: Py_ssize_t nresults, ngroups - bint passed_dummy cdef public: - object arr, index, dummy_arr, dummy_index + ndarray arr, index, dummy_arr, dummy_index object f, labels, values, typ, ityp, name def __init__(self, object series, object f, object labels, Py_ssize_t ngroups, object dummy): - n = len(series) + + # in practice we always pass either obj[:0] or the + # safer obj._get_values(slice(None, 0)) + assert dummy is not None + + if len(series) == 0: + # get_result would never assign `result` + raise ValueError("SeriesGrouper requires non-empty `series`") self.labels = labels self.f = f @@ -341,33 +327,11 @@ cdef class SeriesGrouper: self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series, 'name', None) + self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) - self.passed_dummy = dummy is not None self.ngroups = ngroups - def _check_dummy(self, dummy=None): - # both values and index must be an ndarray! - - if dummy is None: - values = np.empty(0, dtype=self.arr.dtype) - index = None - else: - values = dummy.values - # GH 23683: datetimetz types are equivalent to datetime types here - if (dummy.dtype != self.arr.dtype - and values.dtype != self.arr.dtype): - raise ValueError('Dummy array must be same dtype') - if util.is_array(values) and not values.flags.contiguous: - # e.g. 
Categorical has no `flags` attribute - values = values.copy() - index = dummy.index.values - if not index.flags.contiguous: - index = index.copy() - - return values, index - def get_result(self): cdef: # Define result to avoid UnboundLocalError @@ -377,17 +341,18 @@ cdef class SeriesGrouper: object res bint initialized = 0 Slider vslider, islider - object name, cached_typ=None, cached_ityp=None + object cached_typ = None, cached_ityp = None labels = self.labels counts = np.zeros(self.ngroups, dtype=np.int64) group_size = 0 n = len(self.arr) - name = self.name vslider = Slider(self.arr, self.dummy_arr) islider = Slider(self.index, self.dummy_index) + result = np.empty(self.ngroups, dtype='O') + try: for i in range(n): group_size += 1 @@ -404,32 +369,15 @@ cdef class SeriesGrouper: islider.set_length(group_size) vslider.set_length(group_size) - if cached_typ is None: - cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, - name=name) - else: - object.__setattr__(cached_ityp, '_data', islider.buf) - cached_ityp._engine.clear_mapping() - object.__setattr__( - cached_typ._data._block, 'values', vslider.buf) - object.__setattr__(cached_typ, '_index', cached_ityp) - object.__setattr__(cached_typ, 'name', name) - - cached_ityp._engine.clear_mapping() - res = self.f(cached_typ) - res = _extract_result(res) - if not initialized: - initialized = 1 - result = _get_result_array(res, - self.ngroups, - len(self.dummy_arr)) + cached_typ, cached_ityp = self._update_cached_objs( + cached_typ, cached_ityp, islider, vslider) + + res, initialized = self._apply_to_group(cached_typ, cached_ityp, + islider, vslider, + group_size, initialized) result[lab] = res counts[lab] = group_size - islider.advance(group_size) - vslider.advance(group_size) - group_size = 0 finally: @@ -437,27 +385,25 @@ cdef class SeriesGrouper: islider.reset() vslider.reset() - if result is None: - raise ValueError("No result.") + # We check for empty series in the constructor, so should always + # have result initialized by this point. + assert initialized, "`result` has not been initialized." - if result.dtype == np.object_: - result = maybe_convert_objects(result) + result = maybe_convert_objects(result) return result, counts -cdef inline _extract_result(object res): +cdef inline _extract_result(object res, bint squeeze=True): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ - if (not _is_sparse_array(res) and hasattr(res, 'values') - and util.is_array(res.values)): + if hasattr(res, 'values') and util.is_array(res.values): res = res.values - if not np.isscalar(res): - if util.is_array(res): - if res.ndim == 0: - res = res.item() - elif res.ndim == 1 and len(res) == 1: - res = res[0] + if util.is_array(res): + if res.ndim == 0: + res = res.item() + elif squeeze and res.ndim == 1 and len(res) == 1: + res = res[0] return res @@ -470,14 +416,13 @@ cdef class Slider: Py_ssize_t stride, orig_len, orig_stride char *orig_data - def __init__(self, object values, object buf): - assert (values.ndim == 1) + def __init__(self, ndarray values, ndarray buf): + assert values.ndim == 1 + assert values.dtype == buf.dtype - if util.is_array(values) and not values.flags.contiguous: - # e.g. 
Categorical has no `flags` attribute + if not values.flags.contiguous: values = values.copy() - assert (values.dtype == buf.dtype) self.values = values self.buf = buf self.stride = values.strides[0] @@ -489,7 +434,7 @@ cdef class Slider: self.buf.data = self.values.data self.buf.strides[0] = self.stride - cpdef advance(self, Py_ssize_t k): + cdef advance(self, Py_ssize_t k): self.buf.data = self.buf.data + self.stride * k cdef move(self, int start, int end): @@ -499,10 +444,10 @@ cdef class Slider: self.buf.data = self.values.data + self.stride * start self.buf.shape[0] = end - start - cpdef set_length(self, Py_ssize_t length): + cdef set_length(self, Py_ssize_t length): self.buf.shape[0] = length - cpdef reset(self): + cdef reset(self): self.buf.shape[0] = self.orig_len self.buf.data = self.orig_data @@ -522,8 +467,8 @@ def apply_frame_axis0(object frame, object f, object names, object piece dict item_cache - if frame.index._has_complex_internals: - raise InvalidApply('Cannot modify frame index internals') + # We have already checked that we don't have a MultiIndex before calling + assert frame.index.nlevels == 1 results = [] @@ -548,14 +493,19 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if piece.index is chunk.index: - piece = piece.copy(deep='all') - else: + if piece.index is not chunk.index: mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int pass + if not is_scalar(piece): + # Need to copy data to avoid appending references + if hasattr(piece, "copy"): + piece = piece.copy(deep="all") + else: + piece = copy(piece) + results.append(piece) # If the data was modified inplace we need to @@ -607,10 +557,10 @@ cdef class BlockSlider: def __dealloc__(self): free(self.base_ptrs) - cpdef move(self, int start, int end): + cdef move(self, int start, int end): cdef: ndarray arr - object index + Py_ssize_t i # move blocks for i in range(self.nblocks): @@ -629,6 +579,7 @@ cdef class BlockSlider: cdef reset(self): cdef: ndarray arr + Py_ssize_t i # reset blocks for i in range(self.nblocks): @@ -639,21 +590,25 @@ cdef class BlockSlider: arr.shape[1] = 0 -def compute_reduction(arr, f, axis=0, dummy=None, labels=None): +def compute_reduction(arr: np.ndarray, f, axis: int = 0, dummy=None, labels=None): """ Parameters ----------- - arr : NDFrame object + arr : np.ndarray f : function axis : integer axis dummy : type of reduced output (series) labels : Index or None """ + # We either have both dummy and labels, or neither of them + if (labels is None) ^ (dummy is None): + raise ValueError("Must pass either dummy and labels, or neither") + if labels is not None: # Caller is responsible for ensuring we don't have MultiIndex - assert not labels._has_complex_internals + assert labels.nlevels == 1 # pass as an ndarray/ExtensionArray labels = labels._values diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 32aa936672aab..4e831081c8e54 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -28,7 +28,7 @@ def unstack(reshape_t[:, :] values, uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, reshape_t[:, :] new_values, uint8_t[:, :] new_mask): """ - transform long sorted_values to wide new_values + Transform long values to wide new_values. 
Parameters ---------- diff --git a/pandas/_libs/skiplist.pxd b/pandas/_libs/skiplist.pxd deleted file mode 100644 index e827223bbe0a7..0000000000000 --- a/pandas/_libs/skiplist.pxd +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- -# See GH#27465 for reference on related-but-unused cython code - -cdef extern from "src/skiplist.h": - ctypedef struct node_t: - node_t **next - int *width - double value - int is_nil - int levels - int ref_count - - ctypedef struct skiplist_t: - node_t *head - node_t **tmp_chain - int *tmp_steps - int size - int maxlevels - - skiplist_t* skiplist_init(int) nogil - void skiplist_destroy(skiplist_t*) nogil - double skiplist_get(skiplist_t*, int, int*) nogil - int skiplist_insert(skiplist_t*, double) nogil - int skiplist_remove(skiplist_t*, double) nogil diff --git a/pandas/_libs/skiplist.pyx b/pandas/_libs/skiplist.pyx deleted file mode 100644 index eb750a478415a..0000000000000 --- a/pandas/_libs/skiplist.pyx +++ /dev/null @@ -1,7 +0,0 @@ -# Cython version of IndexableSkiplist, for implementing moving median -# with O(log n) updates -# Original author: Raymond Hettinger -# Original license: MIT -# Link: http://code.activestate.com/recipes/576930/ - -# Cython version: Wes McKinney diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 4906e45c884e9..ee83901040b36 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -51,13 +51,13 @@ cdef class IntIndex(SparseIndex): args = (self.length, self.indices) return IntIndex, args - def __repr__(self): + def __repr__(self) -> str: output = 'IntIndex\n' - output += 'Indices: %s\n' % repr(self.indices) + output += f'Indices: {repr(self.indices)}\n' return output @property - def nbytes(self): + def nbytes(self) -> int: return self.indices.nbytes def check_integrity(self): @@ -72,9 +72,8 @@ cdef class IntIndex(SparseIndex): """ if self.npoints > self.length: - msg = ("Too many indices. Expected " - "{exp} but found {act}").format( - exp=self.length, act=self.npoints) + msg = (f"Too many indices. 
Expected " + f"{self.length} but found {self.npoints}") raise ValueError(msg) # Indices are vacuously ordered and non-negative @@ -92,7 +91,7 @@ cdef class IntIndex(SparseIndex): if not monotonic: raise ValueError("Indices must be strictly increasing") - def equals(self, other): + def equals(self, other) -> bool: if not isinstance(other, IntIndex): return False @@ -104,7 +103,7 @@ cdef class IntIndex(SparseIndex): return same_length and same_indices @property - def ngaps(self): + def ngaps(self) -> int: return self.length - self.npoints def to_int_index(self): @@ -341,19 +340,19 @@ cdef class BlockIndex(SparseIndex): args = (self.length, self.blocs, self.blengths) return BlockIndex, args - def __repr__(self): + def __repr__(self) -> str: output = 'BlockIndex\n' - output += 'Block locations: %s\n' % repr(self.blocs) - output += 'Block lengths: %s' % repr(self.blengths) + output += f'Block locations: {repr(self.blocs)}\n' + output += f'Block lengths: {repr(self.blengths)}' return output @property - def nbytes(self): + def nbytes(self) -> int: return self.blocs.nbytes + self.blengths.nbytes @property - def ngaps(self): + def ngaps(self) -> int: return self.length - self.npoints cpdef check_integrity(self): @@ -380,17 +379,16 @@ cdef class BlockIndex(SparseIndex): if i < self.nblocks - 1: if blocs[i] + blengths[i] > blocs[i + 1]: - raise ValueError('Block {idx} overlaps'.format(idx=i)) + raise ValueError(f'Block {i} overlaps') else: if blocs[i] + blengths[i] > self.length: - raise ValueError('Block {idx} extends beyond end' - .format(idx=i)) + raise ValueError(f'Block {i} extends beyond end') # no zero-length blocks if blengths[i] == 0: - raise ValueError('Zero-length block {idx}'.format(idx=i)) + raise ValueError(f'Zero-length block {i}') - def equals(self, other): + def equals(self, other) -> bool: if not isinstance(other, BlockIndex): return False @@ -597,7 +595,7 @@ cdef class BlockIndex(SparseIndex): result = np.empty(other.npoints, dtype=np.float64) - for 0 <= i < other.nblocks: + for i in range(other.nblocks): ocur = olocs[i] ocurlen = olens[i] @@ -746,9 +744,6 @@ cdef class BlockUnion(BlockMerge): nend = xend[xi] - # print 'here xi=%d, yi=%d, mode=%d, nend=%d' % (self.xi, self.yi, - # mode, nend) - # done with y? if yi == ynblocks: self._set_current_indices(xi + 1, yi, mode) diff --git a/pandas/_libs/src/compat_helper.h b/pandas/_libs/src/compat_helper.h deleted file mode 100644 index 078069fb48af2..0000000000000 --- a/pandas/_libs/src/compat_helper.h +++ /dev/null @@ -1,50 +0,0 @@ -/* -Copyright (c) 2016, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. -*/ - -#ifndef PANDAS__LIBS_SRC_COMPAT_HELPER_H_ -#define PANDAS__LIBS_SRC_COMPAT_HELPER_H_ - -#include "Python.h" -#include "inline_helper.h" - -/* -PySlice_GetIndicesEx changes signature in PY3 -but 3.6.1 in particular changes the behavior of this function slightly -https://bugs.python.org/issue27867 - - -In 3.6.1 PySlice_GetIndicesEx was changed to a macro -inadvertently breaking ABI compat. For now, undefing -the macro, which restores compat. 
-https://github.com/pandas-dev/pandas/issues/15961 -https://bugs.python.org/issue29943 -*/ - -#ifndef PYPY_VERSION -# if PY_VERSION_HEX < 0x03070000 && defined(PySlice_GetIndicesEx) -# undef PySlice_GetIndicesEx -# endif // PY_VERSION_HEX -#endif // PYPY_VERSION - -PANDAS_INLINE int slice_get_indices(PyObject *s, - Py_ssize_t length, - Py_ssize_t *start, - Py_ssize_t *stop, - Py_ssize_t *step, - Py_ssize_t *slicelength) { -#if PY_VERSION_HEX >= 0x03000000 - return PySlice_GetIndicesEx(s, length, start, stop, - step, slicelength); -#else - return PySlice_GetIndicesEx((PySliceObject *)s, length, start, - stop, step, slicelength); -#endif // PY_VERSION_HEX -} - -#endif // PANDAS__LIBS_SRC_COMPAT_HELPER_H_ diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h index 77ec519cc24da..bcf6350aa9090 100644 --- a/pandas/_libs/src/klib/khash.h +++ b/pandas/_libs/src/klib/khash.h @@ -498,7 +498,7 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) */ #define kh_n_buckets(h) ((h)->n_buckets) -/* More conenient interfaces */ +/* More convenient interfaces */ /*! @function @abstract Instantiate a hash set containing integer keys diff --git a/pandas/_libs/src/msgpack/pack.h b/pandas/_libs/src/msgpack/pack.h deleted file mode 100644 index 02379c9188424..0000000000000 --- a/pandas/_libs/src/msgpack/pack.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * MessagePack for Python packing routine - * - * Copyright (C) 2009 Naoki INADA - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include "sysdep.h" -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(_MSC_VER) && (_MSC_VER < 1900) -#define inline __inline -#endif - -typedef struct msgpack_packer { - char *buf; - size_t length; - size_t buf_size; - bool use_bin_type; -} msgpack_packer; - -typedef struct Packer Packer; - -static inline int msgpack_pack_int(msgpack_packer* pk, int d); -static inline int msgpack_pack_long(msgpack_packer* pk, long d); -static inline int msgpack_pack_long_long(msgpack_packer* pk, long long d); -static inline int msgpack_pack_unsigned_short(msgpack_packer* pk, unsigned short d); -static inline int msgpack_pack_unsigned_int(msgpack_packer* pk, unsigned int d); -static inline int msgpack_pack_unsigned_long(msgpack_packer* pk, unsigned long d); -//static inline int msgpack_pack_unsigned_long_long(msgpack_packer* pk, unsigned long long d); - -static inline int msgpack_pack_uint8(msgpack_packer* pk, uint8_t d); -static inline int msgpack_pack_uint16(msgpack_packer* pk, uint16_t d); -static inline int msgpack_pack_uint32(msgpack_packer* pk, uint32_t d); -static inline int msgpack_pack_uint64(msgpack_packer* pk, uint64_t d); -static inline int msgpack_pack_int8(msgpack_packer* pk, int8_t d); -static inline int msgpack_pack_int16(msgpack_packer* pk, int16_t d); -static inline int msgpack_pack_int32(msgpack_packer* pk, int32_t d); -static inline int msgpack_pack_int64(msgpack_packer* pk, int64_t d); - -static inline int msgpack_pack_float(msgpack_packer* pk, float d); -static inline int msgpack_pack_double(msgpack_packer* pk, double d); - -static inline int msgpack_pack_nil(msgpack_packer* pk); -static inline int msgpack_pack_true(msgpack_packer* pk); -static inline int msgpack_pack_false(msgpack_packer* pk); - -static inline int msgpack_pack_array(msgpack_packer* pk, unsigned int n); - -static inline int msgpack_pack_map(msgpack_packer* pk, unsigned int n); - -static inline int msgpack_pack_raw(msgpack_packer* pk, size_t l); -static inline int msgpack_pack_bin(msgpack_packer* pk, size_t l); -static inline int msgpack_pack_raw_body(msgpack_packer* pk, const void* b, size_t l); - -static inline int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l); - -static inline int msgpack_pack_write(msgpack_packer* pk, const char *data, size_t l) -{ - char* buf = pk->buf; - size_t bs = pk->buf_size; - size_t len = pk->length; - - if (len + l > bs) { - bs = (len + l) * 2; - buf = (char*)realloc(buf, bs); - if (!buf) return -1; - } - memcpy(buf + len, data, l); - len += l; - - pk->buf = buf; - pk->buf_size = bs; - pk->length = len; - return 0; -} - -#define msgpack_pack_append_buffer(user, buf, len) \ - return msgpack_pack_write(user, (const char*)buf, len) - -#include "pack_template.h" - -#ifdef __cplusplus -} -#endif diff --git a/pandas/_libs/src/msgpack/pack_template.h b/pandas/_libs/src/msgpack/pack_template.h deleted file mode 100644 index 5d1088f4b7d78..0000000000000 --- a/pandas/_libs/src/msgpack/pack_template.h +++ /dev/null @@ -1,785 +0,0 @@ -/* - * MessagePack packing routine template - * - * Copyright (C) 2008-2010 FURUHASHI Sadayuki - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#if defined(__LITTLE_ENDIAN__) -#define TAKE8_8(d) ((uint8_t*)&d)[0] -#define TAKE8_16(d) ((uint8_t*)&d)[0] -#define TAKE8_32(d) ((uint8_t*)&d)[0] -#define TAKE8_64(d) ((uint8_t*)&d)[0] -#elif defined(__BIG_ENDIAN__) -#define TAKE8_8(d) ((uint8_t*)&d)[0] -#define TAKE8_16(d) ((uint8_t*)&d)[1] -#define TAKE8_32(d) ((uint8_t*)&d)[3] -#define TAKE8_64(d) ((uint8_t*)&d)[7] -#endif - -#ifndef msgpack_pack_append_buffer -#error msgpack_pack_append_buffer callback is not defined -#endif - - -/* - * Integer - */ - -#define msgpack_pack_real_uint8(x, d) \ -do { \ - if(d < (1<<7)) { \ - /* fixnum */ \ - msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); \ - } else { \ - /* unsigned 8 */ \ - unsigned char buf[2] = {0xcc, TAKE8_8(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } \ -} while(0) - -#define msgpack_pack_real_uint16(x, d) \ -do { \ - if(d < (1<<7)) { \ - /* fixnum */ \ - msgpack_pack_append_buffer(x, &TAKE8_16(d), 1); \ - } else if(d < (1<<8)) { \ - /* unsigned 8 */ \ - unsigned char buf[2] = {0xcc, TAKE8_16(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } else { \ - /* unsigned 16 */ \ - unsigned char buf[3]; \ - buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ - msgpack_pack_append_buffer(x, buf, 3); \ - } \ -} while(0) - -#define msgpack_pack_real_uint32(x, d) \ -do { \ - if(d < (1<<8)) { \ - if(d < (1<<7)) { \ - /* fixnum */ \ - msgpack_pack_append_buffer(x, &TAKE8_32(d), 1); \ - } else { \ - /* unsigned 8 */ \ - unsigned char buf[2] = {0xcc, TAKE8_32(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } \ - } else { \ - if(d < (1<<16)) { \ - /* unsigned 16 */ \ - unsigned char buf[3]; \ - buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ - msgpack_pack_append_buffer(x, buf, 3); \ - } else { \ - /* unsigned 32 */ \ - unsigned char buf[5]; \ - buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ - msgpack_pack_append_buffer(x, buf, 5); \ - } \ - } \ -} while(0) - -#define msgpack_pack_real_uint64(x, d) \ -do { \ - if(d < (1ULL<<8)) { \ - if(d < (1ULL<<7)) { \ - /* fixnum */ \ - msgpack_pack_append_buffer(x, &TAKE8_64(d), 1); \ - } else { \ - /* unsigned 8 */ \ - unsigned char buf[2] = {0xcc, TAKE8_64(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } \ - } else { \ - if(d < (1ULL<<16)) { \ - /* unsigned 16 */ \ - unsigned char buf[3]; \ - buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ - msgpack_pack_append_buffer(x, buf, 3); \ - } else if(d < (1ULL<<32)) { \ - /* unsigned 32 */ \ - unsigned char buf[5]; \ - buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ - msgpack_pack_append_buffer(x, buf, 5); \ - } else { \ - /* unsigned 64 */ \ - unsigned char buf[9]; \ - buf[0] = 0xcf; _msgpack_store64(&buf[1], d); \ - msgpack_pack_append_buffer(x, buf, 9); \ - } \ - } \ -} while(0) - -#define msgpack_pack_real_int8(x, d) \ -do { \ - if(d < -(1<<5)) { \ - /* signed 8 */ \ - unsigned char buf[2] = {0xd0, TAKE8_8(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } else { \ - /* fixnum */ \ - msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); \ - } \ -} while(0) - -#define msgpack_pack_real_int16(x, d) \ -do { \ - if(d < -(1<<5)) { \ 
- if(d < -(1<<7)) { \ - /* signed 16 */ \ - unsigned char buf[3]; \ - buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \ - msgpack_pack_append_buffer(x, buf, 3); \ - } else { \ - /* signed 8 */ \ - unsigned char buf[2] = {0xd0, TAKE8_16(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } \ - } else if(d < (1<<7)) { \ - /* fixnum */ \ - msgpack_pack_append_buffer(x, &TAKE8_16(d), 1); \ - } else { \ - if(d < (1<<8)) { \ - /* unsigned 8 */ \ - unsigned char buf[2] = {0xcc, TAKE8_16(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } else { \ - /* unsigned 16 */ \ - unsigned char buf[3]; \ - buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ - msgpack_pack_append_buffer(x, buf, 3); \ - } \ - } \ -} while(0) - -#define msgpack_pack_real_int32(x, d) \ -do { \ - if(d < -(1<<5)) { \ - if(d < -(1<<15)) { \ - /* signed 32 */ \ - unsigned char buf[5]; \ - buf[0] = 0xd2; _msgpack_store32(&buf[1], (int32_t)d); \ - msgpack_pack_append_buffer(x, buf, 5); \ - } else if(d < -(1<<7)) { \ - /* signed 16 */ \ - unsigned char buf[3]; \ - buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \ - msgpack_pack_append_buffer(x, buf, 3); \ - } else { \ - /* signed 8 */ \ - unsigned char buf[2] = {0xd0, TAKE8_32(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } \ - } else if(d < (1<<7)) { \ - /* fixnum */ \ - msgpack_pack_append_buffer(x, &TAKE8_32(d), 1); \ - } else { \ - if(d < (1<<8)) { \ - /* unsigned 8 */ \ - unsigned char buf[2] = {0xcc, TAKE8_32(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } else if(d < (1<<16)) { \ - /* unsigned 16 */ \ - unsigned char buf[3]; \ - buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ - msgpack_pack_append_buffer(x, buf, 3); \ - } else { \ - /* unsigned 32 */ \ - unsigned char buf[5]; \ - buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ - msgpack_pack_append_buffer(x, buf, 5); \ - } \ - } \ -} while(0) - -#define msgpack_pack_real_int64(x, d) \ -do { \ - if(d < -(1LL<<5)) { \ - if(d < -(1LL<<15)) { \ - if(d < -(1LL<<31)) { \ - /* signed 64 */ \ - unsigned char buf[9]; \ - buf[0] = 0xd3; _msgpack_store64(&buf[1], d); \ - msgpack_pack_append_buffer(x, buf, 9); \ - } else { \ - /* signed 32 */ \ - unsigned char buf[5]; \ - buf[0] = 0xd2; _msgpack_store32(&buf[1], (int32_t)d); \ - msgpack_pack_append_buffer(x, buf, 5); \ - } \ - } else { \ - if(d < -(1<<7)) { \ - /* signed 16 */ \ - unsigned char buf[3]; \ - buf[0] = 0xd1; _msgpack_store16(&buf[1], (int16_t)d); \ - msgpack_pack_append_buffer(x, buf, 3); \ - } else { \ - /* signed 8 */ \ - unsigned char buf[2] = {0xd0, TAKE8_64(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } \ - } \ - } else if(d < (1<<7)) { \ - /* fixnum */ \ - msgpack_pack_append_buffer(x, &TAKE8_64(d), 1); \ - } else { \ - if(d < (1LL<<16)) { \ - if(d < (1<<8)) { \ - /* unsigned 8 */ \ - unsigned char buf[2] = {0xcc, TAKE8_64(d)}; \ - msgpack_pack_append_buffer(x, buf, 2); \ - } else { \ - /* unsigned 16 */ \ - unsigned char buf[3]; \ - buf[0] = 0xcd; _msgpack_store16(&buf[1], (uint16_t)d); \ - msgpack_pack_append_buffer(x, buf, 3); \ - } \ - } else { \ - if(d < (1LL<<32)) { \ - /* unsigned 32 */ \ - unsigned char buf[5]; \ - buf[0] = 0xce; _msgpack_store32(&buf[1], (uint32_t)d); \ - msgpack_pack_append_buffer(x, buf, 5); \ - } else { \ - /* unsigned 64 */ \ - unsigned char buf[9]; \ - buf[0] = 0xcf; _msgpack_store64(&buf[1], d); \ - msgpack_pack_append_buffer(x, buf, 9); \ - } \ - } \ - } \ -} while(0) - - -static inline int msgpack_pack_uint8(msgpack_packer* x, uint8_t d) -{ - msgpack_pack_real_uint8(x, d); -} - 
-static inline int msgpack_pack_uint16(msgpack_packer* x, uint16_t d) -{ - msgpack_pack_real_uint16(x, d); -} - -static inline int msgpack_pack_uint32(msgpack_packer* x, uint32_t d) -{ - msgpack_pack_real_uint32(x, d); -} - -static inline int msgpack_pack_uint64(msgpack_packer* x, uint64_t d) -{ - msgpack_pack_real_uint64(x, d); -} - -static inline int msgpack_pack_int8(msgpack_packer* x, int8_t d) -{ - msgpack_pack_real_int8(x, d); -} - -static inline int msgpack_pack_int16(msgpack_packer* x, int16_t d) -{ - msgpack_pack_real_int16(x, d); -} - -static inline int msgpack_pack_int32(msgpack_packer* x, int32_t d) -{ - msgpack_pack_real_int32(x, d); -} - -static inline int msgpack_pack_int64(msgpack_packer* x, int64_t d) -{ - msgpack_pack_real_int64(x, d); -} - - -//#ifdef msgpack_pack_inline_func_cint - -static inline int msgpack_pack_short(msgpack_packer* x, short d) -{ -#if defined(SIZEOF_SHORT) -#if SIZEOF_SHORT == 2 - msgpack_pack_real_int16(x, d); -#elif SIZEOF_SHORT == 4 - msgpack_pack_real_int32(x, d); -#else - msgpack_pack_real_int64(x, d); -#endif - -#elif defined(SHRT_MAX) -#if SHRT_MAX == 0x7fff - msgpack_pack_real_int16(x, d); -#elif SHRT_MAX == 0x7fffffff - msgpack_pack_real_int32(x, d); -#else - msgpack_pack_real_int64(x, d); -#endif - -#else -if(sizeof(short) == 2) { - msgpack_pack_real_int16(x, d); -} else if(sizeof(short) == 4) { - msgpack_pack_real_int32(x, d); -} else { - msgpack_pack_real_int64(x, d); -} -#endif -} - -static inline int msgpack_pack_int(msgpack_packer* x, int d) -{ -#if defined(SIZEOF_INT) -#if SIZEOF_INT == 2 - msgpack_pack_real_int16(x, d); -#elif SIZEOF_INT == 4 - msgpack_pack_real_int32(x, d); -#else - msgpack_pack_real_int64(x, d); -#endif - -#elif defined(INT_MAX) -#if INT_MAX == 0x7fff - msgpack_pack_real_int16(x, d); -#elif INT_MAX == 0x7fffffff - msgpack_pack_real_int32(x, d); -#else - msgpack_pack_real_int64(x, d); -#endif - -#else -if(sizeof(int) == 2) { - msgpack_pack_real_int16(x, d); -} else if(sizeof(int) == 4) { - msgpack_pack_real_int32(x, d); -} else { - msgpack_pack_real_int64(x, d); -} -#endif -} - -static inline int msgpack_pack_long(msgpack_packer* x, long d) -{ -#if defined(SIZEOF_LONG) -#if SIZEOF_LONG == 2 - msgpack_pack_real_int16(x, d); -#elif SIZEOF_LONG == 4 - msgpack_pack_real_int32(x, d); -#else - msgpack_pack_real_int64(x, d); -#endif - -#elif defined(LONG_MAX) -#if LONG_MAX == 0x7fffL - msgpack_pack_real_int16(x, d); -#elif LONG_MAX == 0x7fffffffL - msgpack_pack_real_int32(x, d); -#else - msgpack_pack_real_int64(x, d); -#endif - -#else -if(sizeof(long) == 2) { - msgpack_pack_real_int16(x, d); -} else if(sizeof(long) == 4) { - msgpack_pack_real_int32(x, d); -} else { - msgpack_pack_real_int64(x, d); -} -#endif -} - -static inline int msgpack_pack_long_long(msgpack_packer* x, long long d) -{ -#if defined(SIZEOF_LONG_LONG) -#if SIZEOF_LONG_LONG == 2 - msgpack_pack_real_int16(x, d); -#elif SIZEOF_LONG_LONG == 4 - msgpack_pack_real_int32(x, d); -#else - msgpack_pack_real_int64(x, d); -#endif - -#elif defined(LLONG_MAX) -#if LLONG_MAX == 0x7fffL - msgpack_pack_real_int16(x, d); -#elif LLONG_MAX == 0x7fffffffL - msgpack_pack_real_int32(x, d); -#else - msgpack_pack_real_int64(x, d); -#endif - -#else -if(sizeof(long long) == 2) { - msgpack_pack_real_int16(x, d); -} else if(sizeof(long long) == 4) { - msgpack_pack_real_int32(x, d); -} else { - msgpack_pack_real_int64(x, d); -} -#endif -} - -static inline int msgpack_pack_unsigned_short(msgpack_packer* x, unsigned short d) -{ -#if defined(SIZEOF_SHORT) -#if SIZEOF_SHORT == 2 - 
msgpack_pack_real_uint16(x, d); -#elif SIZEOF_SHORT == 4 - msgpack_pack_real_uint32(x, d); -#else - msgpack_pack_real_uint64(x, d); -#endif - -#elif defined(USHRT_MAX) -#if USHRT_MAX == 0xffffU - msgpack_pack_real_uint16(x, d); -#elif USHRT_MAX == 0xffffffffU - msgpack_pack_real_uint32(x, d); -#else - msgpack_pack_real_uint64(x, d); -#endif - -#else -if(sizeof(unsigned short) == 2) { - msgpack_pack_real_uint16(x, d); -} else if(sizeof(unsigned short) == 4) { - msgpack_pack_real_uint32(x, d); -} else { - msgpack_pack_real_uint64(x, d); -} -#endif -} - -static inline int msgpack_pack_unsigned_int(msgpack_packer* x, unsigned int d) -{ -#if defined(SIZEOF_INT) -#if SIZEOF_INT == 2 - msgpack_pack_real_uint16(x, d); -#elif SIZEOF_INT == 4 - msgpack_pack_real_uint32(x, d); -#else - msgpack_pack_real_uint64(x, d); -#endif - -#elif defined(UINT_MAX) -#if UINT_MAX == 0xffffU - msgpack_pack_real_uint16(x, d); -#elif UINT_MAX == 0xffffffffU - msgpack_pack_real_uint32(x, d); -#else - msgpack_pack_real_uint64(x, d); -#endif - -#else -if(sizeof(unsigned int) == 2) { - msgpack_pack_real_uint16(x, d); -} else if(sizeof(unsigned int) == 4) { - msgpack_pack_real_uint32(x, d); -} else { - msgpack_pack_real_uint64(x, d); -} -#endif -} - -static inline int msgpack_pack_unsigned_long(msgpack_packer* x, unsigned long d) -{ -#if defined(SIZEOF_LONG) -#if SIZEOF_LONG == 2 - msgpack_pack_real_uint16(x, d); -#elif SIZEOF_LONG == 4 - msgpack_pack_real_uint32(x, d); -#else - msgpack_pack_real_uint64(x, d); -#endif - -#elif defined(ULONG_MAX) -#if ULONG_MAX == 0xffffUL - msgpack_pack_real_uint16(x, d); -#elif ULONG_MAX == 0xffffffffUL - msgpack_pack_real_uint32(x, d); -#else - msgpack_pack_real_uint64(x, d); -#endif - -#else -if(sizeof(unsigned long) == 2) { - msgpack_pack_real_uint16(x, d); -} else if(sizeof(unsigned long) == 4) { - msgpack_pack_real_uint32(x, d); -} else { - msgpack_pack_real_uint64(x, d); -} -#endif -} - -static inline int msgpack_pack_unsigned_long_long(msgpack_packer* x, unsigned long long d) -{ -#if defined(SIZEOF_LONG_LONG) -#if SIZEOF_LONG_LONG == 2 - msgpack_pack_real_uint16(x, d); -#elif SIZEOF_LONG_LONG == 4 - msgpack_pack_real_uint32(x, d); -#else - msgpack_pack_real_uint64(x, d); -#endif - -#elif defined(ULLONG_MAX) -#if ULLONG_MAX == 0xffffUL - msgpack_pack_real_uint16(x, d); -#elif ULLONG_MAX == 0xffffffffUL - msgpack_pack_real_uint32(x, d); -#else - msgpack_pack_real_uint64(x, d); -#endif - -#else -if(sizeof(unsigned long long) == 2) { - msgpack_pack_real_uint16(x, d); -} else if(sizeof(unsigned long long) == 4) { - msgpack_pack_real_uint32(x, d); -} else { - msgpack_pack_real_uint64(x, d); -} -#endif -} - -//#undef msgpack_pack_inline_func_cint -//#endif - - - -/* - * Float - */ - -static inline int msgpack_pack_float(msgpack_packer* x, float d) -{ - union { float f; uint32_t i; } mem; - mem.f = d; - unsigned char buf[5]; - buf[0] = 0xca; _msgpack_store32(&buf[1], mem.i); - msgpack_pack_append_buffer(x, buf, 5); -} - -static inline int msgpack_pack_double(msgpack_packer* x, double d) -{ - union { double f; uint64_t i; } mem; - mem.f = d; - unsigned char buf[9]; - buf[0] = 0xcb; -#if defined(__arm__) && !(__ARM_EABI__) // arm-oabi - // https://github.com/msgpack/msgpack-perl/pull/1 - mem.i = (mem.i & 0xFFFFFFFFUL) << 32UL | (mem.i >> 32UL); -#endif - _msgpack_store64(&buf[1], mem.i); - msgpack_pack_append_buffer(x, buf, 9); -} - - -/* - * Nil - */ - -static inline int msgpack_pack_nil(msgpack_packer* x) -{ - static const unsigned char d = 0xc0; - msgpack_pack_append_buffer(x, &d, 1); -} 
- - -/* - * Boolean - */ - -static inline int msgpack_pack_true(msgpack_packer* x) -{ - static const unsigned char d = 0xc3; - msgpack_pack_append_buffer(x, &d, 1); -} - -static inline int msgpack_pack_false(msgpack_packer* x) -{ - static const unsigned char d = 0xc2; - msgpack_pack_append_buffer(x, &d, 1); -} - - -/* - * Array - */ - -static inline int msgpack_pack_array(msgpack_packer* x, unsigned int n) -{ - if(n < 16) { - unsigned char d = 0x90 | n; - msgpack_pack_append_buffer(x, &d, 1); - } else if(n < 65536) { - unsigned char buf[3]; - buf[0] = 0xdc; _msgpack_store16(&buf[1], (uint16_t)n); - msgpack_pack_append_buffer(x, buf, 3); - } else { - unsigned char buf[5]; - buf[0] = 0xdd; _msgpack_store32(&buf[1], (uint32_t)n); - msgpack_pack_append_buffer(x, buf, 5); - } -} - - -/* - * Map - */ - -static inline int msgpack_pack_map(msgpack_packer* x, unsigned int n) -{ - if(n < 16) { - unsigned char d = 0x80 | n; - msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); - } else if(n < 65536) { - unsigned char buf[3]; - buf[0] = 0xde; _msgpack_store16(&buf[1], (uint16_t)n); - msgpack_pack_append_buffer(x, buf, 3); - } else { - unsigned char buf[5]; - buf[0] = 0xdf; _msgpack_store32(&buf[1], (uint32_t)n); - msgpack_pack_append_buffer(x, buf, 5); - } -} - - -/* - * Raw - */ - -static inline int msgpack_pack_raw(msgpack_packer* x, size_t l) -{ - if (l < 32) { - unsigned char d = 0xa0 | (uint8_t)l; - msgpack_pack_append_buffer(x, &TAKE8_8(d), 1); - } else if (x->use_bin_type && l < 256) { // str8 is new format introduced with bin. - unsigned char buf[2] = {0xd9, (uint8_t)l}; - msgpack_pack_append_buffer(x, buf, 2); - } else if (l < 65536) { - unsigned char buf[3]; - buf[0] = 0xda; _msgpack_store16(&buf[1], (uint16_t)l); - msgpack_pack_append_buffer(x, buf, 3); - } else { - unsigned char buf[5]; - buf[0] = 0xdb; _msgpack_store32(&buf[1], (uint32_t)l); - msgpack_pack_append_buffer(x, buf, 5); - } -} - -/* - * bin - */ -static inline int msgpack_pack_bin(msgpack_packer *x, size_t l) -{ - if (!x->use_bin_type) { - return msgpack_pack_raw(x, l); - } - if (l < 256) { - unsigned char buf[2] = {0xc4, (unsigned char)l}; - msgpack_pack_append_buffer(x, buf, 2); - } else if (l < 65536) { - unsigned char buf[3] = {0xc5}; - _msgpack_store16(&buf[1], (uint16_t)l); - msgpack_pack_append_buffer(x, buf, 3); - } else { - unsigned char buf[5] = {0xc6}; - _msgpack_store32(&buf[1], (uint32_t)l); - msgpack_pack_append_buffer(x, buf, 5); - } -} - -static inline int msgpack_pack_raw_body(msgpack_packer* x, const void* b, size_t l) -{ - if (l > 0) msgpack_pack_append_buffer(x, (const unsigned char*)b, l); - return 0; -} - -/* - * Ext - */ -static inline int msgpack_pack_ext(msgpack_packer* x, char typecode, size_t l) -{ - if (l == 1) { - unsigned char buf[2]; - buf[0] = 0xd4; - buf[1] = (unsigned char)typecode; - msgpack_pack_append_buffer(x, buf, 2); - } - else if(l == 2) { - unsigned char buf[2]; - buf[0] = 0xd5; - buf[1] = (unsigned char)typecode; - msgpack_pack_append_buffer(x, buf, 2); - } - else if(l == 4) { - unsigned char buf[2]; - buf[0] = 0xd6; - buf[1] = (unsigned char)typecode; - msgpack_pack_append_buffer(x, buf, 2); - } - else if(l == 8) { - unsigned char buf[2]; - buf[0] = 0xd7; - buf[1] = (unsigned char)typecode; - msgpack_pack_append_buffer(x, buf, 2); - } - else if(l == 16) { - unsigned char buf[2]; - buf[0] = 0xd8; - buf[1] = (unsigned char)typecode; - msgpack_pack_append_buffer(x, buf, 2); - } - else if(l < 256) { - unsigned char buf[3]; - buf[0] = 0xc7; - buf[1] = l; - buf[2] = (unsigned char)typecode; - 
msgpack_pack_append_buffer(x, buf, 3); - } else if(l < 65536) { - unsigned char buf[4]; - buf[0] = 0xc8; - _msgpack_store16(&buf[1], (uint16_t)l); - buf[3] = (unsigned char)typecode; - msgpack_pack_append_buffer(x, buf, 4); - } else { - unsigned char buf[6]; - buf[0] = 0xc9; - _msgpack_store32(&buf[1], (uint32_t)l); - buf[5] = (unsigned char)typecode; - msgpack_pack_append_buffer(x, buf, 6); - } - -} - - - -#undef msgpack_pack_append_buffer - -#undef TAKE8_8 -#undef TAKE8_16 -#undef TAKE8_32 -#undef TAKE8_64 - -#undef msgpack_pack_real_uint8 -#undef msgpack_pack_real_uint16 -#undef msgpack_pack_real_uint32 -#undef msgpack_pack_real_uint64 -#undef msgpack_pack_real_int8 -#undef msgpack_pack_real_int16 -#undef msgpack_pack_real_int32 -#undef msgpack_pack_real_int64 diff --git a/pandas/_libs/src/msgpack/sysdep.h b/pandas/_libs/src/msgpack/sysdep.h deleted file mode 100644 index ed9c1bc0b8031..0000000000000 --- a/pandas/_libs/src/msgpack/sysdep.h +++ /dev/null @@ -1,194 +0,0 @@ -/* - * MessagePack system dependencies - * - * Copyright (C) 2008-2010 FURUHASHI Sadayuki - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef MSGPACK_SYSDEP_H__ -#define MSGPACK_SYSDEP_H__ - -#include -#include -#if defined(_MSC_VER) && _MSC_VER < 1600 -typedef __int8 int8_t; -typedef unsigned __int8 uint8_t; -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#elif defined(_MSC_VER) // && _MSC_VER >= 1600 -#include -#else -#include -#include -#endif - -#ifdef _WIN32 -#define _msgpack_atomic_counter_header -typedef long _msgpack_atomic_counter_t; -#define _msgpack_sync_decr_and_fetch(ptr) InterlockedDecrement(ptr) -#define _msgpack_sync_incr_and_fetch(ptr) InterlockedIncrement(ptr) -#elif defined(__GNUC__) && ((__GNUC__*10 + __GNUC_MINOR__) < 41) -#define _msgpack_atomic_counter_header "gcc_atomic.h" -#else -typedef unsigned int _msgpack_atomic_counter_t; -#define _msgpack_sync_decr_and_fetch(ptr) __sync_sub_and_fetch(ptr, 1) -#define _msgpack_sync_incr_and_fetch(ptr) __sync_add_and_fetch(ptr, 1) -#endif - -#ifdef _WIN32 - -#ifdef __cplusplus -/* numeric_limits::min,max */ -#ifdef max -#undef max -#endif -#ifdef min -#undef min -#endif -#endif - -#else -#include /* __BYTE_ORDER */ -#endif - -#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define __LITTLE_ENDIAN__ -#elif __BYTE_ORDER == __BIG_ENDIAN -#define __BIG_ENDIAN__ -#elif _WIN32 -#define __LITTLE_ENDIAN__ -#endif -#endif - - -#ifdef __LITTLE_ENDIAN__ - -#ifdef _WIN32 -# if defined(ntohs) -# define _msgpack_be16(x) ntohs(x) -# elif defined(_byteswap_ushort) || (defined(_MSC_VER) && _MSC_VER >= 1400) -# define _msgpack_be16(x) ((uint16_t)_byteswap_ushort((unsigned short)x)) -# else -# define _msgpack_be16(x) ( \ - ((((uint16_t)x) << 8) ) | \ - ((((uint16_t)x) >> 8) ) ) -# endif -#else -# define _msgpack_be16(x) ntohs(x) -#endif - -#ifdef _WIN32 -# if 
defined(ntohl) -# define _msgpack_be32(x) ntohl(x) -# elif defined(_byteswap_ulong) || (defined(_MSC_VER) && _MSC_VER >= 1400) -# define _msgpack_be32(x) ((uint32_t)_byteswap_ulong((unsigned long)x)) -# else -# define _msgpack_be32(x) \ - ( ((((uint32_t)x) << 24) ) | \ - ((((uint32_t)x) << 8) & 0x00ff0000U ) | \ - ((((uint32_t)x) >> 8) & 0x0000ff00U ) | \ - ((((uint32_t)x) >> 24) ) ) -# endif -#else -# define _msgpack_be32(x) ntohl(x) -#endif - -#if defined(_byteswap_uint64) || (defined(_MSC_VER) && _MSC_VER >= 1400) -# define _msgpack_be64(x) (_byteswap_uint64(x)) -#elif defined(bswap_64) -# define _msgpack_be64(x) bswap_64(x) -#elif defined(__DARWIN_OSSwapInt64) -# define _msgpack_be64(x) __DARWIN_OSSwapInt64(x) -#else -#define _msgpack_be64(x) \ - ( ((((uint64_t)x) << 56) ) | \ - ((((uint64_t)x) << 40) & 0x00ff000000000000ULL ) | \ - ((((uint64_t)x) << 24) & 0x0000ff0000000000ULL ) | \ - ((((uint64_t)x) << 8) & 0x000000ff00000000ULL ) | \ - ((((uint64_t)x) >> 8) & 0x00000000ff000000ULL ) | \ - ((((uint64_t)x) >> 24) & 0x0000000000ff0000ULL ) | \ - ((((uint64_t)x) >> 40) & 0x000000000000ff00ULL ) | \ - ((((uint64_t)x) >> 56) ) ) -#endif - -#define _msgpack_load16(cast, from) ((cast)( \ - (((uint16_t)((uint8_t*)(from))[0]) << 8) | \ - (((uint16_t)((uint8_t*)(from))[1]) ) )) - -#define _msgpack_load32(cast, from) ((cast)( \ - (((uint32_t)((uint8_t*)(from))[0]) << 24) | \ - (((uint32_t)((uint8_t*)(from))[1]) << 16) | \ - (((uint32_t)((uint8_t*)(from))[2]) << 8) | \ - (((uint32_t)((uint8_t*)(from))[3]) ) )) - -#define _msgpack_load64(cast, from) ((cast)( \ - (((uint64_t)((uint8_t*)(from))[0]) << 56) | \ - (((uint64_t)((uint8_t*)(from))[1]) << 48) | \ - (((uint64_t)((uint8_t*)(from))[2]) << 40) | \ - (((uint64_t)((uint8_t*)(from))[3]) << 32) | \ - (((uint64_t)((uint8_t*)(from))[4]) << 24) | \ - (((uint64_t)((uint8_t*)(from))[5]) << 16) | \ - (((uint64_t)((uint8_t*)(from))[6]) << 8) | \ - (((uint64_t)((uint8_t*)(from))[7]) ) )) - -#else - -#define _msgpack_be16(x) (x) -#define _msgpack_be32(x) (x) -#define _msgpack_be64(x) (x) - -#define _msgpack_load16(cast, from) ((cast)( \ - (((uint16_t)((uint8_t*)from)[0]) << 8) | \ - (((uint16_t)((uint8_t*)from)[1]) ) )) - -#define _msgpack_load32(cast, from) ((cast)( \ - (((uint32_t)((uint8_t*)from)[0]) << 24) | \ - (((uint32_t)((uint8_t*)from)[1]) << 16) | \ - (((uint32_t)((uint8_t*)from)[2]) << 8) | \ - (((uint32_t)((uint8_t*)from)[3]) ) )) - -#define _msgpack_load64(cast, from) ((cast)( \ - (((uint64_t)((uint8_t*)from)[0]) << 56) | \ - (((uint64_t)((uint8_t*)from)[1]) << 48) | \ - (((uint64_t)((uint8_t*)from)[2]) << 40) | \ - (((uint64_t)((uint8_t*)from)[3]) << 32) | \ - (((uint64_t)((uint8_t*)from)[4]) << 24) | \ - (((uint64_t)((uint8_t*)from)[5]) << 16) | \ - (((uint64_t)((uint8_t*)from)[6]) << 8) | \ - (((uint64_t)((uint8_t*)from)[7]) ) )) -#endif - - -#define _msgpack_store16(to, num) \ - do { uint16_t val = _msgpack_be16(num); memcpy(to, &val, 2); } while(0) -#define _msgpack_store32(to, num) \ - do { uint32_t val = _msgpack_be32(num); memcpy(to, &val, 4); } while(0) -#define _msgpack_store64(to, num) \ - do { uint64_t val = _msgpack_be64(num); memcpy(to, &val, 8); } while(0) - -/* -#define _msgpack_load16(cast, from) \ - ({ cast val; memcpy(&val, (char*)from, 2); _msgpack_be16(val); }) -#define _msgpack_load32(cast, from) \ - ({ cast val; memcpy(&val, (char*)from, 4); _msgpack_be32(val); }) -#define _msgpack_load64(cast, from) \ - ({ cast val; memcpy(&val, (char*)from, 8); _msgpack_be64(val); }) -*/ - - -#endif /* msgpack/sysdep.h */ diff --git 
a/pandas/_libs/src/msgpack/unpack.h b/pandas/_libs/src/msgpack/unpack.h deleted file mode 100644 index 591fad1ae4661..0000000000000 --- a/pandas/_libs/src/msgpack/unpack.h +++ /dev/null @@ -1,278 +0,0 @@ -/* - * MessagePack for Python unpacking routine - * - * Copyright (C) 2009 Naoki INADA - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define MSGPACK_EMBED_STACK_SIZE (1024) -#include "unpack_define.h" - -typedef struct unpack_user { - int use_list; - PyObject *object_hook; - bool has_pairs_hook; - PyObject *list_hook; - PyObject *ext_hook; - const char *encoding; - const char *unicode_errors; - Py_ssize_t max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len; -} unpack_user; - -typedef PyObject* msgpack_unpack_object; -struct unpack_context; -typedef struct unpack_context unpack_context; -typedef int (*execute_fn)(unpack_context *ctx, const char* data, size_t len, size_t* off); - -static inline msgpack_unpack_object unpack_callback_root(unpack_user* u) -{ - return NULL; -} - -static inline int unpack_callback_uint16(unpack_user* u, uint16_t d, msgpack_unpack_object* o) -{ - PyObject *p = PyInt_FromLong((long)d); - if (!p) - return -1; - *o = p; - return 0; -} -static inline int unpack_callback_uint8(unpack_user* u, uint8_t d, msgpack_unpack_object* o) -{ - return unpack_callback_uint16(u, d, o); -} - - -static inline int unpack_callback_uint32(unpack_user* u, uint32_t d, msgpack_unpack_object* o) -{ - PyObject *p = PyInt_FromSize_t((size_t)d); - if (!p) - return -1; - *o = p; - return 0; -} - -static inline int unpack_callback_uint64(unpack_user* u, uint64_t d, msgpack_unpack_object* o) -{ - PyObject *p; - if (d > LONG_MAX) { - p = PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG)d); - } else { - p = PyInt_FromSize_t((size_t)d); - } - if (!p) - return -1; - *o = p; - return 0; -} - -static inline int unpack_callback_int32(unpack_user* u, int32_t d, msgpack_unpack_object* o) -{ - PyObject *p = PyInt_FromLong(d); - if (!p) - return -1; - *o = p; - return 0; -} - -static inline int unpack_callback_int16(unpack_user* u, int16_t d, msgpack_unpack_object* o) -{ - return unpack_callback_int32(u, d, o); -} - -static inline int unpack_callback_int8(unpack_user* u, int8_t d, msgpack_unpack_object* o) -{ - return unpack_callback_int32(u, d, o); -} - -static inline int unpack_callback_int64(unpack_user* u, int64_t d, msgpack_unpack_object* o) -{ - PyObject *p; - if (d > LONG_MAX || d < LONG_MIN) { - p = PyLong_FromLongLong((unsigned PY_LONG_LONG)d); - } else { - p = PyInt_FromLong((long)d); - } - *o = p; - return 0; -} - -static inline int unpack_callback_double(unpack_user* u, double d, msgpack_unpack_object* o) -{ - PyObject *p = PyFloat_FromDouble(d); - if (!p) - return -1; - *o = p; - return 0; -} - -static inline int unpack_callback_float(unpack_user* u, float d, msgpack_unpack_object* o) -{ - return unpack_callback_double(u, d, o); -} - -static inline int unpack_callback_nil(unpack_user* u, msgpack_unpack_object* o) -{ Py_INCREF(Py_None); *o = Py_None; return 0; 
} - -static inline int unpack_callback_true(unpack_user* u, msgpack_unpack_object* o) -{ Py_INCREF(Py_True); *o = Py_True; return 0; } - -static inline int unpack_callback_false(unpack_user* u, msgpack_unpack_object* o) -{ Py_INCREF(Py_False); *o = Py_False; return 0; } - -static inline int unpack_callback_array(unpack_user* u, unsigned int n, msgpack_unpack_object* o) -{ - if (n > u->max_array_len) { - PyErr_Format(PyExc_ValueError, "%u exceeds max_array_len(%zd)", n, u->max_array_len); - return -1; - } - PyObject *p = u->use_list ? PyList_New(n) : PyTuple_New(n); - - if (!p) - return -1; - *o = p; - return 0; -} - -static inline int unpack_callback_array_item(unpack_user* u, unsigned int current, msgpack_unpack_object* c, msgpack_unpack_object o) -{ - if (u->use_list) - PyList_SET_ITEM(*c, current, o); - else - PyTuple_SET_ITEM(*c, current, o); - return 0; -} - -static inline int unpack_callback_array_end(unpack_user* u, msgpack_unpack_object* c) -{ - if (u->list_hook) { - PyObject *new_c = PyObject_CallFunctionObjArgs(u->list_hook, *c, NULL); - if (!new_c) - return -1; - Py_DECREF(*c); - *c = new_c; - } - return 0; -} - -static inline int unpack_callback_map(unpack_user* u, unsigned int n, msgpack_unpack_object* o) -{ - if (n > u->max_map_len) { - PyErr_Format(PyExc_ValueError, "%u exceeds max_map_len(%zd)", n, u->max_map_len); - return -1; - } - PyObject *p; - if (u->has_pairs_hook) { - p = PyList_New(n); // Or use tuple? - } - else { - p = PyDict_New(); - } - if (!p) - return -1; - *o = p; - return 0; -} - -static inline int unpack_callback_map_item(unpack_user* u, unsigned int current, msgpack_unpack_object* c, msgpack_unpack_object k, msgpack_unpack_object v) -{ - if (u->has_pairs_hook) { - msgpack_unpack_object item = PyTuple_Pack(2, k, v); - if (!item) - return -1; - Py_DECREF(k); - Py_DECREF(v); - PyList_SET_ITEM(*c, current, item); - return 0; - } - else if (PyDict_SetItem(*c, k, v) == 0) { - Py_DECREF(k); - Py_DECREF(v); - return 0; - } - return -1; -} - -static inline int unpack_callback_map_end(unpack_user* u, msgpack_unpack_object* c) -{ - if (u->object_hook) { - PyObject *new_c = PyObject_CallFunctionObjArgs(u->object_hook, *c, NULL); - if (!new_c) - return -1; - - Py_DECREF(*c); - *c = new_c; - } - return 0; -} - -static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o) -{ - if (l > u->max_str_len) { - PyErr_Format(PyExc_ValueError, "%u exceeds max_str_len(%zd)", l, u->max_str_len); - return -1; - } - - PyObject *py; - if(u->encoding) { - py = PyUnicode_Decode(p, l, u->encoding, u->unicode_errors); - } else { - py = PyBytes_FromStringAndSize(p, l); - } - if (!py) - return -1; - *o = py; - return 0; -} - -static inline int unpack_callback_bin(unpack_user* u, const char* b, const char* p, unsigned int l, msgpack_unpack_object* o) -{ - if (l > u->max_bin_len) { - PyErr_Format(PyExc_ValueError, "%u exceeds max_bin_len(%zd)", l, u->max_bin_len); - return -1; - } - - PyObject *py = PyBytes_FromStringAndSize(p, l); - if (!py) - return -1; - *o = py; - return 0; -} - -static inline int unpack_callback_ext(unpack_user* u, const char* base, const char* pos, - unsigned int length, msgpack_unpack_object* o) -{ - PyObject *py; - int8_t typecode = (int8_t)*pos++; - if (!u->ext_hook) { - PyErr_SetString(PyExc_AssertionError, "u->ext_hook cannot be NULL"); - return -1; - } - if (length-1 > u->max_ext_len) { - PyErr_Format(PyExc_ValueError, "%u exceeds max_ext_len(%zd)", length, u->max_ext_len); - return -1; - } - // 
length also includes the typecode, so the actual data is length-1 -#if PY_MAJOR_VERSION == 2 - py = PyObject_CallFunction(u->ext_hook, (char*)"(is#)", typecode, pos, (Py_ssize_t)length-1); -#else - py = PyObject_CallFunction(u->ext_hook, (char*)"(iy#)", typecode, pos, (Py_ssize_t)length-1); -#endif - if (!py) - return -1; - *o = py; - return 0; -} - -#include "unpack_template.h" diff --git a/pandas/_libs/src/msgpack/unpack_define.h b/pandas/_libs/src/msgpack/unpack_define.h deleted file mode 100644 index 0dd708d17c3d4..0000000000000 --- a/pandas/_libs/src/msgpack/unpack_define.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * MessagePack unpacking routine template - * - * Copyright (C) 2008-2010 FURUHASHI Sadayuki - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef MSGPACK_UNPACK_DEFINE_H__ -#define MSGPACK_UNPACK_DEFINE_H__ - -#include "msgpack/sysdep.h" -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -#ifndef MSGPACK_EMBED_STACK_SIZE -#define MSGPACK_EMBED_STACK_SIZE 32 -#endif - - -// CS is first byte & 0x1f -typedef enum { - CS_HEADER = 0x00, // nil - - //CS_ = 0x01, - //CS_ = 0x02, // false - //CS_ = 0x03, // true - - CS_BIN_8 = 0x04, - CS_BIN_16 = 0x05, - CS_BIN_32 = 0x06, - - CS_EXT_8 = 0x07, - CS_EXT_16 = 0x08, - CS_EXT_32 = 0x09, - - CS_FLOAT = 0x0a, - CS_DOUBLE = 0x0b, - CS_UINT_8 = 0x0c, - CS_UINT_16 = 0x0d, - CS_UINT_32 = 0x0e, - CS_UINT_64 = 0x0f, - CS_INT_8 = 0x10, - CS_INT_16 = 0x11, - CS_INT_32 = 0x12, - CS_INT_64 = 0x13, - - //CS_FIXEXT1 = 0x14, - //CS_FIXEXT2 = 0x15, - //CS_FIXEXT4 = 0x16, - //CS_FIXEXT8 = 0x17, - //CS_FIXEXT16 = 0x18, - - CS_RAW_8 = 0x19, - CS_RAW_16 = 0x1a, - CS_RAW_32 = 0x1b, - CS_ARRAY_16 = 0x1c, - CS_ARRAY_32 = 0x1d, - CS_MAP_16 = 0x1e, - CS_MAP_32 = 0x1f, - - ACS_RAW_VALUE, - ACS_BIN_VALUE, - ACS_EXT_VALUE, -} msgpack_unpack_state; - - -typedef enum { - CT_ARRAY_ITEM, - CT_MAP_KEY, - CT_MAP_VALUE, -} msgpack_container_type; - - -#ifdef __cplusplus -} -#endif - -#endif /* msgpack/unpack_define.h */ diff --git a/pandas/_libs/src/msgpack/unpack_template.h b/pandas/_libs/src/msgpack/unpack_template.h deleted file mode 100644 index 402dcd48cb35a..0000000000000 --- a/pandas/_libs/src/msgpack/unpack_template.h +++ /dev/null @@ -1,475 +0,0 @@ -/* - * MessagePack unpacking routine template - * - * Copyright (C) 2008-2010 FURUHASHI Sadayuki - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef USE_CASE_RANGE -#ifdef __GNUC__ -#define USE_CASE_RANGE -#endif -#endif - -typedef struct unpack_stack { - PyObject* obj; - size_t size; - size_t count; - unsigned int ct; - PyObject* map_key; -} unpack_stack; - -struct unpack_context { - unpack_user user; - unsigned int cs; - unsigned int trail; - unsigned int top; - /* - unpack_stack* stack; - unsigned int stack_size; - unpack_stack embed_stack[MSGPACK_EMBED_STACK_SIZE]; - */ - unpack_stack stack[MSGPACK_EMBED_STACK_SIZE]; -}; - - -static inline void unpack_init(unpack_context* ctx) -{ - ctx->cs = CS_HEADER; - ctx->trail = 0; - ctx->top = 0; - /* - ctx->stack = ctx->embed_stack; - ctx->stack_size = MSGPACK_EMBED_STACK_SIZE; - */ - ctx->stack[0].obj = unpack_callback_root(&ctx->user); -} - -/* -static inline void unpack_destroy(unpack_context* ctx) -{ - if(ctx->stack_size != MSGPACK_EMBED_STACK_SIZE) { - free(ctx->stack); - } -} -*/ - -static inline PyObject* unpack_data(unpack_context* ctx) -{ - return (ctx)->stack[0].obj; -} - - -template -static inline int unpack_execute(unpack_context* ctx, const char* data, size_t len, size_t* off) -{ - assert(len >= *off); - - const unsigned char* p = (unsigned char*)data + *off; - const unsigned char* const pe = (unsigned char*)data + len; - const void* n = NULL; - - unsigned int trail = ctx->trail; - unsigned int cs = ctx->cs; - unsigned int top = ctx->top; - unpack_stack* stack = ctx->stack; - /* - unsigned int stack_size = ctx->stack_size; - */ - unpack_user* user = &ctx->user; - - PyObject* obj = NULL; - unpack_stack* c = NULL; - - int ret; - -#define construct_cb(name) \ - construct && unpack_callback ## name - -#define push_simple_value(func) \ - if(construct_cb(func)(user, &obj) < 0) { goto _failed; } \ - goto _push -#define push_fixed_value(func, arg) \ - if(construct_cb(func)(user, arg, &obj) < 0) { goto _failed; } \ - goto _push -#define push_variable_value(func, base, pos, len) \ - if(construct_cb(func)(user, \ - (const char*)base, (const char*)pos, len, &obj) < 0) { goto _failed; } \ - goto _push - -#define again_fixed_trail(_cs, trail_len) \ - trail = trail_len; \ - cs = _cs; \ - goto _fixed_trail_again -#define again_fixed_trail_if_zero(_cs, trail_len, ifzero) \ - trail = trail_len; \ - if(trail == 0) { goto ifzero; } \ - cs = _cs; \ - goto _fixed_trail_again - -#define start_container(func, count_, ct_) \ - if(top >= MSGPACK_EMBED_STACK_SIZE) { goto _failed; } /* FIXME */ \ - if(construct_cb(func)(user, count_, &stack[top].obj) < 0) { goto _failed; } \ - if((count_) == 0) { obj = stack[top].obj; \ - if (construct_cb(func##_end)(user, &obj) < 0) { goto _failed; } \ - goto _push; } \ - stack[top].ct = ct_; \ - stack[top].size = count_; \ - stack[top].count = 0; \ - ++top; \ - /*printf("container %d count %d stack %d\n",stack[top].obj,count_,top);*/ \ - /*printf("stack push %d\n", top);*/ \ - /* FIXME \ - if(top >= stack_size) { \ - if(stack_size == MSGPACK_EMBED_STACK_SIZE) { \ - size_t csize = sizeof(unpack_stack) * MSGPACK_EMBED_STACK_SIZE; \ - size_t nsize = csize * 2; \ - unpack_stack* tmp = (unpack_stack*)malloc(nsize); \ - if(tmp == NULL) { goto _failed; } \ - memcpy(tmp, ctx->stack, csize); \ - ctx->stack = stack = tmp; \ - ctx->stack_size = stack_size = MSGPACK_EMBED_STACK_SIZE * 2; \ - } else { \ - size_t nsize = sizeof(unpack_stack) * ctx->stack_size * 2; \ - unpack_stack* tmp = (unpack_stack*)realloc(ctx->stack, nsize); \ - if(tmp == NULL) { goto _failed; } \ - ctx->stack = stack = tmp; \ - ctx->stack_size = stack_size = stack_size * 2; \ - } \ - } \ - */ \ - 
goto _header_again - -#define NEXT_CS(p) ((unsigned int)*p & 0x1f) - -#ifdef USE_CASE_RANGE -#define SWITCH_RANGE_BEGIN switch(*p) { -#define SWITCH_RANGE(FROM, TO) case FROM ... TO: -#define SWITCH_RANGE_DEFAULT default: -#define SWITCH_RANGE_END } -#else -#define SWITCH_RANGE_BEGIN { if(0) { -#define SWITCH_RANGE(FROM, TO) } else if(FROM <= *p && *p <= TO) { -#define SWITCH_RANGE_DEFAULT } else { -#define SWITCH_RANGE_END } } -#endif - - if(p == pe) { goto _out; } - do { - switch(cs) { - case CS_HEADER: - SWITCH_RANGE_BEGIN - SWITCH_RANGE(0x00, 0x7f) // Positive Fixnum - push_fixed_value(_uint8, *(uint8_t*)p); - SWITCH_RANGE(0xe0, 0xff) // Negative Fixnum - push_fixed_value(_int8, *(int8_t*)p); - SWITCH_RANGE(0xc0, 0xdf) // Variable - switch(*p) { - case 0xc0: // nil - push_simple_value(_nil); - //case 0xc1: // never used - case 0xc2: // false - push_simple_value(_false); - case 0xc3: // true - push_simple_value(_true); - case 0xc4: // bin 8 - again_fixed_trail(NEXT_CS(p), 1); - case 0xc5: // bin 16 - again_fixed_trail(NEXT_CS(p), 2); - case 0xc6: // bin 32 - again_fixed_trail(NEXT_CS(p), 4); - case 0xc7: // ext 8 - again_fixed_trail(NEXT_CS(p), 1); - case 0xc8: // ext 16 - again_fixed_trail(NEXT_CS(p), 2); - case 0xc9: // ext 32 - again_fixed_trail(NEXT_CS(p), 4); - case 0xca: // float - case 0xcb: // double - case 0xcc: // unsigned int 8 - case 0xcd: // unsigned int 16 - case 0xce: // unsigned int 32 - case 0xcf: // unsigned int 64 - case 0xd0: // signed int 8 - case 0xd1: // signed int 16 - case 0xd2: // signed int 32 - case 0xd3: // signed int 64 - again_fixed_trail(NEXT_CS(p), 1 << (((unsigned int)*p) & 0x03)); - case 0xd4: // fixext 1 - case 0xd5: // fixext 2 - case 0xd6: // fixext 4 - case 0xd7: // fixext 8 - again_fixed_trail_if_zero(ACS_EXT_VALUE, - (1 << (((unsigned int)*p) & 0x03))+1, - _ext_zero); - case 0xd8: // fixext 16 - again_fixed_trail_if_zero(ACS_EXT_VALUE, 16+1, _ext_zero); - case 0xd9: // str 8 - again_fixed_trail(NEXT_CS(p), 1); - case 0xda: // raw 16 - case 0xdb: // raw 32 - case 0xdc: // array 16 - case 0xdd: // array 32 - case 0xde: // map 16 - case 0xdf: // map 32 - again_fixed_trail(NEXT_CS(p), 2 << (((unsigned int)*p) & 0x01)); - default: - goto _failed; - } - SWITCH_RANGE(0xa0, 0xbf) // FixRaw - again_fixed_trail_if_zero(ACS_RAW_VALUE, ((unsigned int)*p & 0x1f), _raw_zero); - SWITCH_RANGE(0x90, 0x9f) // FixArray - start_container(_array, ((unsigned int)*p) & 0x0f, CT_ARRAY_ITEM); - SWITCH_RANGE(0x80, 0x8f) // FixMap - start_container(_map, ((unsigned int)*p) & 0x0f, CT_MAP_KEY); - - SWITCH_RANGE_DEFAULT - goto _failed; - SWITCH_RANGE_END - // end CS_HEADER - - - _fixed_trail_again: - ++p; - - default: - if((size_t)(pe - p) < trail) { goto _out; } - n = p; p += trail - 1; - switch(cs) { - case CS_EXT_8: - again_fixed_trail_if_zero(ACS_EXT_VALUE, *(uint8_t*)n+1, _ext_zero); - case CS_EXT_16: - again_fixed_trail_if_zero(ACS_EXT_VALUE, - _msgpack_load16(uint16_t,n)+1, - _ext_zero); - case CS_EXT_32: - again_fixed_trail_if_zero(ACS_EXT_VALUE, - _msgpack_load32(uint32_t,n)+1, - _ext_zero); - case CS_FLOAT: { - union { uint32_t i; float f; } mem; - mem.i = _msgpack_load32(uint32_t,n); - push_fixed_value(_float, mem.f); } - case CS_DOUBLE: { - union { uint64_t i; double f; } mem; - mem.i = _msgpack_load64(uint64_t,n); -#if defined(__arm__) && !(__ARM_EABI__) // arm-oabi - // https://github.com/msgpack/msgpack-perl/pull/1 - mem.i = (mem.i & 0xFFFFFFFFUL) << 32UL | (mem.i >> 32UL); -#endif - push_fixed_value(_double, mem.f); } - case CS_UINT_8: - 
push_fixed_value(_uint8, *(uint8_t*)n); - case CS_UINT_16: - push_fixed_value(_uint16, _msgpack_load16(uint16_t,n)); - case CS_UINT_32: - push_fixed_value(_uint32, _msgpack_load32(uint32_t,n)); - case CS_UINT_64: - push_fixed_value(_uint64, _msgpack_load64(uint64_t,n)); - - case CS_INT_8: - push_fixed_value(_int8, *(int8_t*)n); - case CS_INT_16: - push_fixed_value(_int16, _msgpack_load16(int16_t,n)); - case CS_INT_32: - push_fixed_value(_int32, _msgpack_load32(int32_t,n)); - case CS_INT_64: - push_fixed_value(_int64, _msgpack_load64(int64_t,n)); - - case CS_BIN_8: - again_fixed_trail_if_zero(ACS_BIN_VALUE, *(uint8_t*)n, _bin_zero); - case CS_BIN_16: - again_fixed_trail_if_zero(ACS_BIN_VALUE, _msgpack_load16(uint16_t,n), _bin_zero); - case CS_BIN_32: - again_fixed_trail_if_zero(ACS_BIN_VALUE, _msgpack_load32(uint32_t,n), _bin_zero); - case ACS_BIN_VALUE: - _bin_zero: - push_variable_value(_bin, data, n, trail); - - case CS_RAW_8: - again_fixed_trail_if_zero(ACS_RAW_VALUE, *(uint8_t*)n, _raw_zero); - case CS_RAW_16: - again_fixed_trail_if_zero(ACS_RAW_VALUE, _msgpack_load16(uint16_t,n), _raw_zero); - case CS_RAW_32: - again_fixed_trail_if_zero(ACS_RAW_VALUE, _msgpack_load32(uint32_t,n), _raw_zero); - case ACS_RAW_VALUE: - _raw_zero: - push_variable_value(_raw, data, n, trail); - - case ACS_EXT_VALUE: - _ext_zero: - push_variable_value(_ext, data, n, trail); - - case CS_ARRAY_16: - start_container(_array, _msgpack_load16(uint16_t,n), CT_ARRAY_ITEM); - case CS_ARRAY_32: - /* FIXME security guard */ - start_container(_array, _msgpack_load32(uint32_t,n), CT_ARRAY_ITEM); - - case CS_MAP_16: - start_container(_map, _msgpack_load16(uint16_t,n), CT_MAP_KEY); - case CS_MAP_32: - /* FIXME security guard */ - start_container(_map, _msgpack_load32(uint32_t,n), CT_MAP_KEY); - - default: - goto _failed; - } - } - -_push: - if(top == 0) { goto _finish; } - c = &stack[top-1]; - switch(c->ct) { - case CT_ARRAY_ITEM: - if(construct_cb(_array_item)(user, c->count, &c->obj, obj) < 0) { goto _failed; } - if(++c->count == c->size) { - obj = c->obj; - if (construct_cb(_array_end)(user, &obj) < 0) { goto _failed; } - --top; - /*printf("stack pop %d\n", top);*/ - goto _push; - } - goto _header_again; - case CT_MAP_KEY: - c->map_key = obj; - c->ct = CT_MAP_VALUE; - goto _header_again; - case CT_MAP_VALUE: - if(construct_cb(_map_item)(user, c->count, &c->obj, c->map_key, obj) < 0) { goto _failed; } - if(++c->count == c->size) { - obj = c->obj; - if (construct_cb(_map_end)(user, &obj) < 0) { goto _failed; } - --top; - /*printf("stack pop %d\n", top);*/ - goto _push; - } - c->ct = CT_MAP_KEY; - goto _header_again; - - default: - goto _failed; - } - -_header_again: - cs = CS_HEADER; - ++p; - } while(p != pe); - goto _out; - - -_finish: - if (!construct) - unpack_callback_nil(user, &obj); - stack[0].obj = obj; - ++p; - ret = 1; - /*printf("-- finish --\n"); */ - goto _end; - -_failed: - /*printf("** FAILED **\n"); */ - ret = -1; - goto _end; - -_out: - ret = 0; - goto _end; - -_end: - ctx->cs = cs; - ctx->trail = trail; - ctx->top = top; - *off = p - (const unsigned char*)data; - - return ret; -#undef construct_cb -} - -#undef SWITCH_RANGE_BEGIN -#undef SWITCH_RANGE -#undef SWITCH_RANGE_DEFAULT -#undef SWITCH_RANGE_END -#undef push_simple_value -#undef push_fixed_value -#undef push_variable_value -#undef again_fixed_trail -#undef again_fixed_trail_if_zero -#undef start_container - -template -static inline int unpack_container_header(unpack_context* ctx, const char* data, size_t len, size_t* off) -{ - assert(len >= *off); 
- uint32_t size; - const unsigned char *const p = (unsigned char*)data + *off; - -#define inc_offset(inc) \ - if (len - *off < inc) \ - return 0; \ - *off += inc; - - switch (*p) { - case var_offset: - inc_offset(3); - size = _msgpack_load16(uint16_t, p + 1); - break; - case var_offset + 1: - inc_offset(5); - size = _msgpack_load32(uint32_t, p + 1); - break; -#ifdef USE_CASE_RANGE - case fixed_offset + 0x0 ... fixed_offset + 0xf: -#else - case fixed_offset + 0x0: - case fixed_offset + 0x1: - case fixed_offset + 0x2: - case fixed_offset + 0x3: - case fixed_offset + 0x4: - case fixed_offset + 0x5: - case fixed_offset + 0x6: - case fixed_offset + 0x7: - case fixed_offset + 0x8: - case fixed_offset + 0x9: - case fixed_offset + 0xa: - case fixed_offset + 0xb: - case fixed_offset + 0xc: - case fixed_offset + 0xd: - case fixed_offset + 0xe: - case fixed_offset + 0xf: -#endif - ++*off; - size = ((unsigned int)*p) & 0x0f; - break; - default: - PyErr_SetString(PyExc_ValueError, "Unexpected type header on stream"); - return -1; - } - unpack_callback_uint32(&ctx->user, size, &ctx->stack[0].obj); - return 1; -} - -#undef SWITCH_RANGE_BEGIN -#undef SWITCH_RANGE -#undef SWITCH_RANGE_DEFAULT -#undef SWITCH_RANGE_END - -static const execute_fn unpack_construct = &unpack_execute; -static const execute_fn unpack_skip = &unpack_execute; -static const execute_fn read_array_header = &unpack_container_header<0x90, 0xdc>; -static const execute_fn read_map_header = &unpack_container_header<0x80, 0xde>; - -#undef NEXT_CS - -/* vim: set ts=4 sw=4 sts=4 expandtab */ diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 0a767dd27b658..7fbe7a04d5b22 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -11,8 +11,6 @@ The full license is in the LICENSE file, distributed with this software. #define PANDAS__LIBS_SRC_PARSE_HELPER_H_ #include -#include "inline_helper.h" -#include "headers/portable.h" #include "parser/tokenizer.h" int to_double(char *item, double *p_value, char sci, char decimal, @@ -94,12 +92,4 @@ int floatify(PyObject *str, double *result, int *maybe_int) { return -1; } -PANDAS_INLINE void lowercase(char *p) { - for (; *p; ++p) *p = tolower_ascii(*p); -} - -PANDAS_INLINE void uppercase(char *p) { - for (; *p; ++p) *p = toupper_ascii(*p); -} - #endif // PANDAS__LIBS_SRC_PARSE_HELPER_H_ diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 5d73230f32955..1e3295fcb6fc7 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -9,7 +9,6 @@ The full license is in the LICENSE file, distributed with this software. #include "io.h" -#include #include #include @@ -17,7 +16,7 @@ The full license is in the LICENSE file, distributed with this software. #define O_BINARY 0 #endif // O_BINARY -#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32) +#ifdef _WIN32 #define USE_WIN_UTF16 #include #endif diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2752fb6424022..2188ff6b0d464 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -25,19 +25,6 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include "../headers/portable.h" -static void *safe_realloc(void *buffer, size_t size) { - void *result; - // OSX is weird. 
- // http://stackoverflow.com/questions/9560609/ - // different-realloc-behaviour-in-linux-and-osx - - result = realloc(buffer, size); - TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, - result)) - - return result; -} - void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { // column i, starting at 0 self->words = parser->words; @@ -45,18 +32,6 @@ void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) { self->line_start = parser->line_start + start; } -coliter_t *coliter_new(parser_t *self, int i) { - // column i, starting at 0 - coliter_t *iter = (coliter_t *)malloc(sizeof(coliter_t)); - - if (NULL == iter) { - return NULL; - } - - coliter_setup(iter, self, i, 0); - return iter; -} - static void free_if_not_null(void **ptr) { TRACE(("free_if_not_null %p\n", *ptr)) if (*ptr != NULL) { @@ -80,7 +55,7 @@ static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity, while ((length + space >= cap) && (newbuffer != NULL)) { cap = cap ? cap << 1 : 2; buffer = newbuffer; - newbuffer = safe_realloc(newbuffer, elsize * cap); + newbuffer = realloc(newbuffer, elsize * cap); } if (newbuffer == NULL) { @@ -321,8 +296,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { ("make_stream_space: cap != self->words_cap, nbytes = %d, " "self->words_cap=%d\n", nbytes, self->words_cap)) - newptr = safe_realloc((void *)self->word_starts, - sizeof(int64_t) * self->words_cap); + newptr = realloc((void *)self->word_starts, + sizeof(int64_t) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -349,8 +324,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { if (cap != self->lines_cap) { TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) - newptr = safe_realloc((void *)self->line_fields, - sizeof(int64_t) * self->lines_cap); + newptr = realloc((void *)self->line_fields, + sizeof(int64_t) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -427,7 +402,7 @@ static void append_warning(parser_t *self, const char *msg) { snprintf(self->warn_msg, length + 1, "%s", msg); } else { ex_length = strlen(self->warn_msg); - newptr = safe_realloc(self->warn_msg, ex_length + length + 1); + newptr = realloc(self->warn_msg, ex_length + length + 1); if (newptr != NULL) { self->warn_msg = (char *)newptr; snprintf(self->warn_msg + ex_length, length + 1, "%s", msg); @@ -672,8 +647,6 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { #define END_LINE() END_LINE_STATE(START_RECORD) -#define IS_WHITESPACE(c) ((c == ' ' || c == '\t')) - #define IS_TERMINATOR(c) \ (c == line_terminator) @@ -692,7 +665,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { // applied when in a field #define IS_DELIMITER(c) \ ((!self->delim_whitespace && c == self->delimiter) || \ - (self->delim_whitespace && IS_WHITESPACE(c))) + (self->delim_whitespace && isblank(c))) #define _TOKEN_CLEANUP() \ self->stream_len = slen; \ @@ -843,7 +816,7 @@ int tokenize_bytes(parser_t *self, self->state = EAT_CRNL_NOP; break; } else if (!self->delim_whitespace) { - if (IS_WHITESPACE(c) && c != self->delimiter) { + if (isblank(c) && c != self->delimiter) { } else { // backtrack // use i + 1 because buf has been incremented but not i do { @@ -873,7 +846,7 @@ int tokenize_bytes(parser_t *self, } else if (IS_COMMENT_CHAR(c)) { self->state = EAT_COMMENT; break; - } else if (!IS_WHITESPACE(c)) { + } else if (!isblank(c)) { self->state = START_FIELD; // fall through 
to subsequent state } else { @@ -917,7 +890,7 @@ int tokenize_bytes(parser_t *self, } else if (IS_COMMENT_CHAR(c)) { self->state = EAT_LINE_COMMENT; break; - } else if (IS_WHITESPACE(c)) { + } else if (isblank(c)) { if (self->delim_whitespace) { if (self->skip_empty_lines) { self->state = WHITESPACE_LINE; @@ -1290,13 +1263,13 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - newptr = safe_realloc((void *)self->words, new_cap * sizeof(char *)); + newptr = realloc((void *)self->words, new_cap * sizeof(char *)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->words = (char **)newptr; } - newptr = safe_realloc((void *)self->word_starts, + newptr = realloc((void *)self->word_starts, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -1315,13 +1288,13 @@ int parser_trim_buffers(parser_t *self) { if (new_cap < self->stream_cap) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " - "safe_realloc\n")); - newptr = safe_realloc((void *)self->stream, new_cap); + "realloc\n")); + newptr = realloc((void *)self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { // Update the pointers in the self->words array (char **) if - // `safe_realloc` + // `realloc` // moved the `self->stream` buffer. This block mirrors a similar // block in // `make_stream_space`. @@ -1342,14 +1315,14 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); - newptr = safe_realloc((void *)self->line_start, + newptr = realloc((void *)self->line_start, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { self->line_start = (int64_t *)newptr; } - newptr = safe_realloc((void *)self->line_fields, + newptr = realloc((void *)self->line_fields, new_cap * sizeof(int64_t)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -1426,42 +1399,30 @@ int tokenize_all_rows(parser_t *self) { return status; } -PANDAS_INLINE void uppercase(char *p) { - for (; *p; ++p) *p = toupper_ascii(*p); -} - +/* + * Function: to_boolean + * -------------------- + * + * Validate if item should be recognized as a boolean field. + * + * item: const char* representing parsed text + * val : pointer to a uint8_t of boolean representation + * + * If item is determined to be boolean, this method will set + * the appropriate value of val and return 0. A non-zero exit + * status means that item was not inferred to be boolean, and + * leaves the value of *val unmodified. 
+ */ int to_boolean(const char *item, uint8_t *val) { - char *tmp; - int i, status = 0; - size_t length0 = (strlen(item) + 1); - int bufsize = length0; - - static const char *tstrs[1] = {"TRUE"}; - static const char *fstrs[1] = {"FALSE"}; - - tmp = malloc(bufsize); - snprintf(tmp, length0, "%s", item); - uppercase(tmp); - - for (i = 0; i < 1; ++i) { - if (strcmp(tmp, tstrs[i]) == 0) { - *val = 1; - goto done; - } - } - - for (i = 0; i < 1; ++i) { - if (strcmp(tmp, fstrs[i]) == 0) { - *val = 0; - goto done; - } + if (strcasecmp(item, "TRUE") == 0) { + *val = 1; + return 0; + } else if (strcasecmp(item, "FALSE") == 0) { + *val = 0; + return 0; } - status = -1; - -done: - free(tmp); - return status; + return -1; } // --------------------------------------------------------------------------- @@ -1813,11 +1774,18 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { + // This is called from a nogil block in parsers.pyx + // so need to explicitly get GIL before Python calls + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + double r = PyOS_string_to_double(p, q, 0); if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); + + PyGILState_Release(gstate); return r; } diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 66ef1887d6bc3..4fd2065c07100 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -15,13 +15,13 @@ See LICENSE for the license #define PY_SSIZE_T_CLEAN #include -#define ERROR_OK 0 #define ERROR_NO_DIGITS 1 #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 #include "../headers/stdint.h" #include "../inline_helper.h" +#include "../headers/portable.h" #include "khash.h" @@ -31,10 +31,6 @@ See LICENSE for the license #define CALLING_READ_FAILED 2 -#if defined(_MSC_VER) -#define strtoll _strtoi64 -#endif // _MSC_VER - /* C flat file parsing low level code for pandas / NumPy @@ -159,11 +155,8 @@ typedef struct parser_t { PyObject *skipfunc; int64_t skip_first_N_rows; int64_t skip_footer; - // pick one, depending on whether the converter requires GIL - double (*double_converter_nogil)(const char *, char **, - char, char, char, int); - double (*double_converter_withgil)(const char *, char **, - char, char, char, int); + double (*double_converter)(const char *, char **, + char, char, char, int, int *, int *); // error handling char *warn_msg; @@ -179,7 +172,6 @@ typedef struct coliter_t { } coliter_t; void coliter_setup(coliter_t *self, parser_t *parser, int i, int start); -coliter_t *coliter_new(parser_t *self, int i); #define COLITER_NEXT(iter, word) \ do { \ @@ -231,6 +223,8 @@ double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, double precise_xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); + +// GH-15140 - round_trip requires and acquires the GIL on its own double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); int to_boolean(const char *item, uint8_t *val); diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 05c3ae4096ad5..8d04874b4c9bf 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -154,6 +154,8 @@ 
enum JSTYPES { JT_ARRAY, // Array structure JT_OBJECT, // Key/Value structure JT_INVALID, // Internal, do not return nor expect + JT_POS_INF, // Positive infinity + JT_NEG_INF, // Negative infinity }; typedef void * JSOBJ; @@ -290,6 +292,8 @@ typedef struct __JSONObjectDecoder { JSOBJ (*newTrue)(void *prv); JSOBJ (*newFalse)(void *prv); JSOBJ (*newNull)(void *prv); + JSOBJ (*newPosInf)(void *prv); + JSOBJ (*newNegInf)(void *prv); JSOBJ (*newObject)(void *prv, void *decoder); JSOBJ (*endObject)(void *prv, JSOBJ obj); JSOBJ (*newArray)(void *prv, void *decoder); diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index a847b0f5d5102..4eb18ee13d70b 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -127,9 +127,16 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { JSUINT64 overflowLimit = LLONG_MAX; - if (*(offset) == '-') { + if (*(offset) == 'I') { + goto DECODE_INF; + } else if (*(offset) == 'N') { + goto DECODE_NAN; + } else if (*(offset) == '-') { offset++; intNeg = -1; + if (*(offset) == 'I') { + goto DECODE_INF; + } overflowLimit = LLONG_MIN; } @@ -150,7 +157,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { case '7': case '8': case '9': { - // FIXME: Check for arithemtic overflow here + // FIXME: Check for arithmetic overflow here // PERF: Don't do 64-bit arithmetic here unless we know we have // to intValue = intValue * 10ULL + (JSLONG)(chr - 48); @@ -235,7 +242,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { } BREAK_FRC_LOOP: - // FIXME: Check for arithemtic overflow here + // FIXME: Check for arithmetic overflow here ds->lastType = JT_DOUBLE; ds->start = offset; return ds->dec->newDouble( @@ -281,8 +288,50 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { } } +DECODE_NAN: + offset++; + if (*(offset++) != 'a') goto SET_NAN_ERROR; + if (*(offset++) != 'N') goto SET_NAN_ERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); + +SET_NAN_ERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'NaN'"); + +DECODE_INF: + offset++; + if (*(offset++) != 'n') goto SET_INF_ERROR; + if (*(offset++) != 'f') goto SET_INF_ERROR; + if (*(offset++) != 'i') goto SET_INF_ERROR; + if (*(offset++) != 'n') goto SET_INF_ERROR; + if (*(offset++) != 'i') goto SET_INF_ERROR; + if (*(offset++) != 't') goto SET_INF_ERROR; + if (*(offset++) != 'y') goto SET_INF_ERROR; + + ds->start = offset; + + if (intNeg == 1) { + ds->lastType = JT_POS_INF; + return ds->dec->newPosInf(ds->prv); + } else { + ds->lastType = JT_NEG_INF; + return ds->dec->newNegInf(ds->prv); + } + +SET_INF_ERROR: + if (intNeg == 1) { + const char *msg = "Unexpected character found when decoding 'Infinity'"; + return SetError(ds, -1, msg); + } else { + const char *msg = "Unexpected character found when decoding '-Infinity'"; + return SetError(ds, -1, msg); + } + + BREAK_EXP_LOOP: - // FIXME: Check for arithemtic overflow here + // FIXME: Check for arithmetic overflow here ds->lastType = JT_DOUBLE; ds->start = offset; return ds->dec->newDouble( @@ -1070,6 +1119,8 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { case '7': case '8': case '9': + case 'I': + case 'N': case '-': return decode_numeric(ds); diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index 7a2e5a584443a..b2fc788478864 100644 --- 
a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -459,6 +459,10 @@ JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; } JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; } +JSOBJ Object_newPosInf(void *prv) { return PyFloat_FromDouble(Py_HUGE_VAL); } + +JSOBJ Object_newNegInf(void *prv) { return PyFloat_FromDouble(-Py_HUGE_VAL); } + JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); } JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; } @@ -502,10 +506,11 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { JSONObjectDecoder dec = { Object_newString, Object_objectAddKey, Object_arrayAddItem, Object_newTrue, Object_newFalse, Object_newNull, - Object_newObject, Object_endObject, Object_newArray, - Object_endArray, Object_newInteger, Object_newLong, - Object_newDouble, Object_releaseObject, PyObject_Malloc, - PyObject_Free, PyObject_Realloc}; + Object_newPosInf, Object_newNegInf, Object_newObject, + Object_endObject, Object_newArray, Object_endArray, + Object_newInteger, Object_newLong, Object_newDouble, + Object_releaseObject, PyObject_Malloc, PyObject_Free, + PyObject_Realloc}; dec.preciseFloat = 0; dec.prv = NULL; diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 48712dc68829d..c413a16f8d5f0 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -54,13 +54,12 @@ static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; -PyObject *cls_timestamp; PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } -typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, - void *outValue, size_t *_outLen); +typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, + size_t *_outLen); typedef struct __NpyArrContext { PyObject *array; @@ -94,7 +93,7 @@ typedef struct __TypeContext { JSPFN_ITERNEXT iterNext; JSPFN_ITERGETNAME iterGetName; JSPFN_ITERGETVALUE iterGetValue; - PFN_PyTypeToJSON PyTypeToJSON; + PFN_PyTypeToUTF8 PyTypeToUTF8; PyObject *newObj; PyObject *dictObj; Py_ssize_t index; @@ -166,7 +165,6 @@ void *initObjToJSON(void) { cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); - cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } @@ -178,9 +176,8 @@ void *initObjToJSON(void) { Py_DECREF(mod_nattype); } - /* Initialise numpy API and use 2/3 compatible return */ + /* Initialise numpy API */ import_array(); - return NUMPY_IMPORT_ARRAY_RETVAL; } static TypeContext *createTypeContext(void) { @@ -212,85 +209,70 @@ static TypeContext *createTypeContext(void) { return pc; } -static int is_sparse_array(PyObject *obj) { - // TODO can be removed again once SparseArray.values is removed (GH26421) - if (PyObject_HasAttrString(obj, "_subtyp")) { - PyObject *_subtype = PyObject_GetAttrString(obj, "_subtyp"); - PyObject *sparse_array = PyUnicode_FromString("sparse_array"); - int ret = PyUnicode_Compare(_subtype, sparse_array); - - if (ret == 0) { - return 1; - } +/* + * Function: scaleNanosecToUnit + * ----------------------------- + * + * Scales an integer value representing time in nanoseconds to provided unit. + * + * Mutates the provided value directly. Returns 0 on success, non-zero on error. 
+ */ +static int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { + switch (unit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + *value /= 1000LL; + break; + case NPY_FR_ms: + *value /= 1000000LL; + break; + case NPY_FR_s: + *value /= 1000000000LL; + break; + default: + return -1; } + return 0; } static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; - if (!is_sparse_array(obj)) { - values = PyObject_GetAttrString(obj, "values"); - PRINTMARK(); - } - - if (values && !PyArray_CheckExact(values)) { - - if (PyObject_HasAttrString(values, "to_numpy")) { - values = PyObject_CallMethod(values, "to_numpy", NULL); - } - - if (!is_sparse_array(values) && - PyObject_HasAttrString(values, "values")) { - PyObject *subvals = get_values(values); - PyErr_Clear(); - PRINTMARK(); - // subvals are sometimes missing a dimension - if (subvals) { - PyArrayObject *reshape = (PyArrayObject *)subvals; - PyObject *shape = PyObject_GetAttrString(obj, "shape"); - PyArray_Dims dims; - PRINTMARK(); - - if (!shape || !PyArray_IntpConverter(shape, &dims)) { - subvals = NULL; - } else { - subvals = PyArray_Newshape(reshape, &dims, NPY_ANYORDER); - PyDimMem_FREE(dims.ptr); - } - Py_DECREF(reshape); - Py_XDECREF(shape); - } - Py_DECREF(values); - values = subvals; - } else { - PRINTMARK(); - Py_DECREF(values); - values = NULL; - } - } + PRINTMARK(); - if (!values && PyObject_HasAttrString(obj, "_internal_get_values")) { + if (PyObject_HasAttrString(obj, "_internal_get_values")) { PRINTMARK(); values = PyObject_CallMethod(obj, "_internal_get_values", NULL); - if (values && !PyArray_CheckExact(values)) { + + if (values == NULL) { + // Clear so we can subsequently try another method + PyErr_Clear(); + } else if (!PyArray_CheckExact(values)) { + // Didn't get a numpy array, so keep trying PRINTMARK(); Py_DECREF(values); values = NULL; } } - if (!values && PyObject_HasAttrString(obj, "get_block_values")) { + if ((values == NULL) && PyObject_HasAttrString(obj, "get_block_values")) { PRINTMARK(); values = PyObject_CallMethod(obj, "get_block_values", NULL); - if (values && !PyArray_CheckExact(values)) { + + if (values == NULL) { + // Clear so we can subsequently try another method + PyErr_Clear(); + } else if (!PyArray_CheckExact(values)) { + // Didn't get a numpy array, so keep trying PRINTMARK(); Py_DECREF(values); values = NULL; } } - if (!values) { + if (values == NULL) { PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); PyObject *repr; PRINTMARK(); @@ -385,161 +367,129 @@ static PyObject *get_item(PyObject *obj, Py_ssize_t i) { return ret; } -static void *CDouble(JSOBJ obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PRINTMARK(); - *((double *)outValue) = GET_TC(tc)->doubleValue; - return NULL; -} - -static void *CLong(JSOBJ obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PRINTMARK(); - *((JSINT64 *)outValue) = GET_TC(tc)->longValue; - return NULL; -} - -#ifdef _LP64 -static void *PyIntToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *((JSINT64 *)outValue) = PyLong_AsLong(obj); - return NULL; -} -#else -static void *PyIntToINT32(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *((JSINT32 *)outValue) = PyLong_AsLong(obj); - return NULL; -} -#endif - -static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, +static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), size_t *_outLen) { - *((JSINT64 
*)outValue) = GET_TC(tc)->longValue; - return NULL; -} - -static void *NpyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { PyObject *obj = (PyObject *)_obj; - PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE)); - return NULL; + *_outLen = PyBytes_GET_SIZE(obj); + return PyBytes_AS_STRING(obj); } -static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, +static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *((double *)outValue) = PyFloat_AsDouble(obj); - return NULL; + return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); } -static void *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PyObject *obj = (PyObject *)_obj; - *_outLen = PyBytes_GET_SIZE(obj); - return PyBytes_AS_STRING(obj); -} +/* Converts the int64_t representation of a datetime to ISO; mutates len */ +static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { + npy_datetimestruct dts; + int ret_code; -static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { - PyObject *obj, *newObj; - obj = (PyObject *)_obj; + pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts); - if (PyUnicode_IS_COMPACT_ASCII(obj)) { - Py_ssize_t len; - char *data = (char *)PyUnicode_AsUTF8AndSize(obj, &len); - *_outLen = len; - return data; - } + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); - newObj = PyUnicode_AsUTF8String(obj); + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } - GET_TC(tc)->newObj = newObj; + ret_code = make_iso_8601_datetime(&dts, result, *len, base); + if (ret_code != 0) { + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); + } - *_outLen = PyBytes_GET_SIZE(newObj); - return PyBytes_AS_STRING(newObj); + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; } -static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts, - JSONTypeContext *tc, void *outValue, - size_t *_outLen) { +/* JSON callback. returns a char* and mutates the pointer to *len */ +static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + return int64ToIso(GET_TC(tc)->longValue, base, len); +} - if (((PyObjectEncoder *)tc->encoder)->datetimeIso) { - PRINTMARK(); - *_outLen = (size_t)get_datetime_iso_8601_strlen(0, base); - GET_TC(tc)->cStr = PyObject_Malloc(sizeof(char) * (*_outLen)); - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; - } +static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { + scaleNanosecToUnit(&dt, base); + return dt; +} - if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, base)) { - PRINTMARK(); - *_outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; - } else { - PRINTMARK(); +/* Convert PyDatetime To ISO C-string. 
mutates len */ +static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, + size_t *len) { + npy_datetimestruct dts; + int ret; + + ret = convert_pydatetime_to_datetimestruct(obj, &dts); + if (ret != 0) { + if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - PyObject_Free(GET_TC(tc)->cStr); - return NULL; + "Could not convert PyDateTime to numpy datetime"); } - } else { + return NULL; + } + + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); + ret = make_iso_8601_datetime(&dts, result, *len, base); + + if (ret != 0) { PRINTMARK(); - *((JSINT64 *)outValue) = npy_datetimestruct_to_datetime(base, dts); + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); return NULL; } + + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; } -static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, - void *outValue, size_t *_outLen) { - npy_datetimestruct dts; - PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *)_obj; - PRINTMARK(); - // TODO(anyone): Does not appear to be reached in tests. +/* JSON callback */ +static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, + size_t *len) { - pandas_datetime_to_datetimestruct(obj->obval, - (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + if (!PyDateTime_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected datetime object"); + return NULL; + } + + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + return PyDateTimeToIso(obj, base, len); } -static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { +static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) { npy_datetimestruct dts; - PyDateTime_Date *obj = (PyDateTime_Date *)_obj; + int ret; - PRINTMARK(); + if (!PyDateTime_Check(obj)) { + // TODO: raise TypeError + } + PyDateTime_Date *dt = (PyDateTime_Date *)obj; - if (!convert_pydatetime_to_datetimestruct(obj, &dts)) { - PRINTMARK(); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); - } else { + ret = convert_pydatetime_to_datetimestruct(dt, &dts); + if (ret != 0) { if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); + "Could not convert PyDateTime to numpy datetime"); } - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; + // TODO: is setting errMsg required? 
+ //((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + // return NULL; } -} - -static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, - void *outValue, size_t *_outLen) { - npy_datetimestruct dts; - PRINTMARK(); - pandas_datetime_to_datetimestruct((npy_datetime)GET_TC(tc)->longValue, - NPY_FR_ns, &dts); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); + return NpyDateTimeToEpoch(npy_dt, base); } -static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *outLen) { +static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { PyObject *obj = (PyObject *)_obj; PyObject *str; PyObject *tmp; @@ -563,81 +513,15 @@ static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, GET_TC(tc)->newObj = str; *outLen = PyBytes_GET_SIZE(str); - outValue = (void *)PyBytes_AS_STRING(str); + char *outValue = PyBytes_AS_STRING(str); return outValue; } -static int NpyTypeToJSONType(PyObject *obj, JSONTypeContext *tc, int npyType, - void *value) { - PyArray_VectorUnaryFunc *castfunc; - npy_double doubleVal; - npy_int64 longVal; - - if (PyTypeNum_ISFLOAT(npyType)) { - PRINTMARK(); - castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_DOUBLE); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, - "Cannot cast numpy dtype %d to double", npyType); - } - castfunc(value, &doubleVal, 1, NULL, NULL); - if (npy_isnan(doubleVal) || npy_isinf(doubleVal)) { - PRINTMARK(); - return JT_NULL; - } - GET_TC(tc)->doubleValue = (double)doubleVal; - GET_TC(tc)->PyTypeToJSON = CDouble; - return JT_DOUBLE; - } - - if (PyTypeNum_ISDATETIME(npyType)) { - PRINTMARK(); - castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", - npyType); - } - castfunc(value, &longVal, 1, NULL, NULL); - if (longVal == get_nat()) { - PRINTMARK(); - return JT_NULL; - } - GET_TC(tc)->longValue = (JSINT64)longVal; - GET_TC(tc)->PyTypeToJSON = NpyDatetime64ToJSON; - return ((PyObjectEncoder *)tc->encoder)->datetimeIso ? JT_UTF8 - : JT_LONG; - } - - if (PyTypeNum_ISINTEGER(npyType)) { - PRINTMARK(); - castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", - npyType); - } - castfunc(value, &longVal, 1, NULL, NULL); - GET_TC(tc)->longValue = (JSINT64)longVal; - GET_TC(tc)->PyTypeToJSON = CLong; - return JT_LONG; - } - - if (PyTypeNum_ISBOOL(npyType)) { - PRINTMARK(); - return *((npy_bool *)value) == NPY_TRUE ? 
JT_TRUE : JT_FALSE; - } - - PRINTMARK(); - return JT_INVALID; -} - //============================================================================= // Numpy array iteration functions //============================================================================= -static void NpyArr_freeItemValue(JSOBJ _obj, JSONTypeContext *tc) { +static void NpyArr_freeItemValue(JSOBJ Py_UNUSED(_obj), JSONTypeContext *tc) { if (GET_TC(tc)->npyarr && GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) { PRINTMARK(); @@ -646,7 +530,9 @@ static void NpyArr_freeItemValue(JSOBJ _obj, JSONTypeContext *tc) { } } -int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) { return 0; } +int NpyArr_iterNextNone(JSOBJ Py_UNUSED(_obj), JSONTypeContext *Py_UNUSED(tc)) { + return 0; +} void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { PyArrayObject *obj; @@ -703,7 +589,10 @@ void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) { } } -void NpyArrPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } +void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc)) { + PRINTMARK(); +} void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; @@ -782,12 +671,13 @@ int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { return 1; } -JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PRINTMARK(); return GET_TC(tc)->itemValue; } -char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; npy_intp idx; PRINTMARK(); @@ -841,7 +731,8 @@ int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { return NpyArr_iterNextItem(obj, tc); } -char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; npy_intp idx; @@ -863,7 +754,7 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { return cStr; } -char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, +char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; @@ -909,7 +800,7 @@ int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void PdBlockPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; PRINTMARK(); @@ -1141,13 +1032,14 @@ int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Tuple_iterEnd(JSOBJ obj, JSONTypeContext *tc) {} +void Tuple_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} -JSOBJ Tuple_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Tuple_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -1160,7 +1052,7 @@ void Iter_iterBegin(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->iterator = PyObject_GetIter(obj); } -int Iter_iterNext(JSOBJ obj, JSONTypeContext *tc) { +int 
Iter_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObject *item; if (GET_TC(tc)->itemValue) { @@ -1178,7 +1070,7 @@ int Iter_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Iter_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void Iter_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (GET_TC(tc)->itemValue) { Py_DECREF(GET_TC(tc)->itemValue); GET_TC(tc)->itemValue = NULL; @@ -1190,11 +1082,12 @@ void Iter_iterEnd(JSOBJ obj, JSONTypeContext *tc) { } } -JSOBJ Iter_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Iter_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Iter_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Iter_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -1210,7 +1103,7 @@ void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } -void Dir_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (GET_TC(tc)->itemValue) { Py_DECREF(GET_TC(tc)->itemValue); GET_TC(tc)->itemValue = NULL; @@ -1296,12 +1189,13 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { return 1; } -JSOBJ Dir_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PRINTMARK(); return GET_TC(tc)->itemValue; } -char *Dir_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { PRINTMARK(); *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); @@ -1327,20 +1221,21 @@ int List_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void List_iterEnd(JSOBJ obj, JSONTypeContext *tc) {} +void List_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} -JSOBJ List_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *List_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } //============================================================================= // pandas Index iteration functions //============================================================================= -void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); if (!GET_TC(tc)->cStr) { @@ -1376,13 +1271,16 @@ int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } +void Index_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) { + PRINTMARK(); +} -JSOBJ Index_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1390,7 +1288,7 @@ char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { //============================================================================= // pandas Series iteration functions 
//============================================================================= -void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); @@ -1431,17 +1329,18 @@ int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; enc->outputFormat = enc->originalOutputFormat; PRINTMARK(); } -JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1449,7 +1348,7 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { //============================================================================= // pandas DataFrame iteration functions //============================================================================= -void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); @@ -1495,17 +1394,18 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; enc->outputFormat = enc->originalOutputFormat; PRINTMARK(); } -JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1515,12 +1415,12 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { // itemName might converted to string (Python_Str). Do refCounting // itemValue is borrowed from object (which is dict). 
No refCounting //============================================================================= -void Dict_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; PRINTMARK(); } -int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) { +int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObject *itemNameTmp; if (GET_TC(tc)->itemName) { @@ -1548,7 +1448,7 @@ int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (GET_TC(tc)->itemName) { Py_DECREF(GET_TC(tc)->itemName); GET_TC(tc)->itemName = NULL; @@ -1557,11 +1457,12 @@ void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } -JSOBJ Dict_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Dict_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -1598,7 +1499,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. PyObject *item = NULL; - npy_intp i, stride, len; + size_t len; + npy_intp i, stride; char **ret; char *dataptr, *cLabel; int type_num; @@ -1639,8 +1541,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - // TODO: for any matches on type_num (date and timedeltas) should use a - // vectorized solution to convert to epoch or iso formats + // TODO: vectorized timedelta solution if (enc->datetimeIso && (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); @@ -1663,68 +1564,39 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, cLabel = (char *)PyUnicode_AsUTF8(iso); Py_DECREF(iso); len = strlen(cLabel); - } else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) || - PyDate_Check(item)) { - PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); - if (ts == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; + } else if (PyTypeNum_ISDATETIME(type_num)) { + NPY_DATETIMEUNIT base = enc->datetimeUnit; + npy_int64 longVal; + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, + "Cannot cast numpy dtype %d to long", + enc->npyType); } - + castfunc(dataptr, &longVal, 1, NULL, NULL); if (enc->datetimeIso) { - PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); - Py_DECREF(ts); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; + cLabel = int64ToIso(longVal, base, &len); + } else { + if (!scaleNanosecToUnit(&longVal, base)) { + // TODO: This gets hit but somehow doesn't cause errors + // need to clean up (elsewhere in module as well) } - - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); + cLabel = PyObject_Malloc(21); // 21 chars for int64 + sprintf(cLabel, "%" NPY_INT64_FMT, longVal); len = strlen(cLabel); + } + } else if (PyDateTime_Check(item) || PyDate_Check(item)) { + NPY_DATETIMEUNIT base = enc->datetimeUnit; + if (enc->datetimeIso) { + cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len); } else { 
- npy_int64 value; - // TODO: refactor to not duplicate what goes on in - // beginTypeContext - if (PyObject_HasAttrString(ts, "value")) { - PRINTMARK(); - value = get_long_attr(ts, "value"); - } else { - PRINTMARK(); - value = total_seconds(ts) * - 1000000000LL; // nanoseconds per second - } - Py_DECREF(ts); - - switch (enc->datetimeUnit) { - case NPY_FR_ns: - break; - case NPY_FR_us: - value /= 1000LL; - break; - case NPY_FR_ms: - value /= 1000000LL; - break; - case NPY_FR_s: - value /= 1000000000LL; - break; - default: - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - char buf[21] = {0}; // 21 chars for 2**63 as string - cLabel = buf; - sprintf(buf, "%" NPY_INT64_FMT, value); + cLabel = PyObject_Malloc(21); // 21 chars for int64 + sprintf(cLabel, "%" NPY_DATETIME_FMT, + PyDateTimeToEpoch(item, base)); len = strlen(cLabel); } - } else { // Fallack to string representation + } else { // Fallback to string representation PyObject *str = PyObject_Str(item); if (str == NULL) { Py_DECREF(item); @@ -1784,7 +1656,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PyObjectEncoder *enc; double val; npy_int64 value; - int base; + int unit; PRINTMARK(); tc->prv = NULL; @@ -1797,29 +1669,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { obj = (PyObject *)_obj; enc = (PyObjectEncoder *)tc->encoder; - if (enc->npyType >= 0) { - PRINTMARK(); - tc->prv = &(enc->basicTypeContext); - tc->type = NpyTypeToJSONType(obj, tc, enc->npyType, enc->npyValue); - - if (tc->type == JT_INVALID) { - if (enc->defaultHandler) { - enc->npyType = -1; - PRINTMARK(); - Object_invokeDefaultHandler( - enc->npyCtxtPassthru->getitem(enc->npyValue, - enc->npyCtxtPassthru->array), - enc); - } else { - PyErr_Format(PyExc_RuntimeError, "Unhandled numpy dtype %d", - enc->npyType); - } - } - enc->npyCtxtPassthru = NULL; - enc->npyType = -1; - return; - } - if (PyBool_Check(obj)) { PRINTMARK(); tc->type = (obj == Py_True) ? 
JT_TRUE : JT_FALSE; @@ -1837,6 +1686,44 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } tc->prv = pc; + if (PyTypeNum_ISDATETIME(enc->npyType)) { + PRINTMARK(); + int64_t longVal; + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(enc->npyType), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", + enc->npyType); + } + castfunc(enc->npyValue, &longVal, 1, NULL, NULL); + if (longVal == get_nat()) { + PRINTMARK(); + tc->type = JT_NULL; + } else { + + if (enc->datetimeIso) { + PRINTMARK(); + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + // Currently no way to pass longVal to iso function, so use + // state management + GET_TC(tc)->longValue = longVal; + tc->type = JT_UTF8; + } else { + PRINTMARK(); + NPY_DATETIMEUNIT base = + ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + GET_TC(tc)->longValue = NpyDateTimeToEpoch(longVal, base); + tc->type = JT_LONG; + } + } + + // TODO: this prevents infinite loop with mixed-type DataFrames; + // refactor + enc->npyCtxtPassthru = NULL; + enc->npyType = -1; + return; + } + if (PyIter_Check(obj) || (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) { PRINTMARK(); @@ -1845,7 +1732,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (PyLong_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyLongToINT64; tc->type = JT_LONG; GET_TC(tc)->longValue = PyLong_AsLongLong(obj); @@ -1863,23 +1749,23 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (npy_isnan(val) || npy_isinf(val)) { tc->type = JT_NULL; } else { - pc->PyTypeToJSON = PyFloatToDOUBLE; + GET_TC(tc)->doubleValue = val; tc->type = JT_DOUBLE; } return; } else if (PyBytes_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyBytesToUTF8; + pc->PyTypeToUTF8 = PyBytesToUTF8; tc->type = JT_UTF8; return; } else if (PyUnicode_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyUnicodeToUTF8; + pc->PyTypeToUTF8 = PyUnicodeToUTF8; tc->type = JT_UTF8; return; } else if (PyObject_TypeCheck(obj, type_decimal)) { PRINTMARK(); - pc->PyTypeToJSON = PyFloatToDOUBLE; + GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj); tc->type = JT_DOUBLE; return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { @@ -1890,18 +1776,21 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } PRINTMARK(); - pc->PyTypeToJSON = PyDateTimeToJSON; if (enc->datetimeIso) { PRINTMARK(); + pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; tc->type = JT_UTF8; } else { PRINTMARK(); + NPY_DATETIMEUNIT base = + ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; } else if (PyTime_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyTimeToJSON; + pc->PyTypeToUTF8 = PyTimeToJSON; tc->type = JT_UTF8; return; } else if (PyArray_IsScalar(obj, Datetime)) { @@ -1913,8 +1802,17 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } PRINTMARK(); - pc->PyTypeToJSON = NpyDateTimeScalarToJSON; - tc->type = enc->datetimeIso ? 
JT_UTF8 : JT_LONG; + if (enc->datetimeIso) { + PRINTMARK(); + pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; + tc->type = JT_UTF8; + } else { + PRINTMARK(); + NPY_DATETIMEUNIT base = + ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); + tc->type = JT_LONG; + } return; } else if (PyDelta_Check(obj)) { if (PyObject_HasAttrString(obj, "value")) { @@ -1925,19 +1823,9 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } - base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - switch (base) { - case NPY_FR_ns: - break; - case NPY_FR_us: - value /= 1000LL; - break; - case NPY_FR_ms: - value /= 1000000LL; - break; - case NPY_FR_s: - value /= 1000000000LL; - break; + unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + if (scaleNanosecToUnit(&value, unit) != 0) { + // TODO: Add some kind of error handling here } exc = PyErr_Occurred(); @@ -1956,12 +1844,10 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->longValue = value; PRINTMARK(); - pc->PyTypeToJSON = PyLongToINT64; tc->type = JT_LONG; return; } else if (PyArray_IsScalar(obj, Integer)) { PRINTMARK(); - pc->PyTypeToJSON = PyLongToINT64; tc->type = JT_LONG; PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_INT64)); @@ -1982,7 +1868,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) { PRINTMARK(); - pc->PyTypeToJSON = NpyFloatToDOUBLE; + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->doubleValue), + PyArray_DescrFromType(NPY_DOUBLE)); tc->type = JT_DOUBLE; return; } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) { @@ -2304,7 +2191,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } -void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { +void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PRINTMARK(); if (tc->prv) { Py_XDECREF(GET_TC(tc)->newObj); @@ -2327,25 +2214,15 @@ void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { - return GET_TC(tc)->PyTypeToJSON(obj, tc, NULL, _outLen); -} - -JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) { - JSINT64 ret; - GET_TC(tc)->PyTypeToJSON(obj, tc, &ret, NULL); - return ret; + return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen); } -JSINT32 Object_getIntValue(JSOBJ obj, JSONTypeContext *tc) { - JSINT32 ret; - GET_TC(tc)->PyTypeToJSON(obj, tc, &ret, NULL); - return ret; +JSINT64 Object_getLongValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->longValue; } -double Object_getDoubleValue(JSOBJ obj, JSONTypeContext *tc) { - double ret; - GET_TC(tc)->PyTypeToJSON(obj, tc, &ret, NULL); - return ret; +double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { + return GET_TC(tc)->doubleValue; } static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); } @@ -2370,7 +2247,8 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { return GET_TC(tc)->iterGetName(obj, tc, outLen); } -PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { +PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, + PyObject *kwargs) { static char *kwlist[] = {"obj", "ensure_ascii", "double_precision", @@ -2400,7 +2278,7 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject 
*kwargs) { Object_endTypeContext, Object_getStringValue, Object_getLongValue, - Object_getIntValue, + NULL, // getIntValue is unused Object_getDoubleValue, Object_iterBegin, Object_iterNext, diff --git a/pandas/_libs/src/ujson/python/ujson.c b/pandas/_libs/src/ujson/python/ujson.c index 39320d73d0cab..4a88fb7a4e849 100644 --- a/pandas/_libs/src/ujson/python/ujson.c +++ b/pandas/_libs/src/ujson/python/ujson.c @@ -65,35 +65,15 @@ static PyMethodDef ujsonMethods[] = { {NULL, NULL, 0, NULL} /* Sentinel */ }; -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - "_libjson", - 0, /* m_doc */ - -1, /* m_size */ - ujsonMethods, /* m_methods */ - NULL, /* m_reload */ - NULL, /* m_traverse */ - NULL, /* m_clear */ - NULL /* m_free */ +static PyModuleDef moduledef = { + .m_base = PyModuleDef_HEAD_INIT, + .m_name = "_libjson", + .m_methods = ujsonMethods }; -#define PYMODINITFUNC PyMODINIT_FUNC PyInit_json(void) -#define PYMODULE_CREATE() PyModule_Create(&moduledef) -#define MODINITERROR return NULL -PYMODINITFUNC { - PyObject *module; - PyObject *version_string; +PyMODINIT_FUNC PyInit_json(void) { + initObjToJSON(); // TODO: clean up, maybe via tp_free? + return PyModuleDef_Init(&moduledef); - initObjToJSON(); - module = PYMODULE_CREATE(); - - if (module == NULL) { - MODINITERROR; - } - - version_string = PyUnicode_FromString(UJSON_VERSION); - PyModule_AddObject(module, "__version__", version_string); - - return module; } diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 7ad5ea189763c..5a30b71a6fea1 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -66,7 +66,8 @@ cpdef assert_almost_equal(a, b, check_less_precise=False, bint check_dtype=True, obj=None, lobj=None, robj=None): - """Check that left and right objects are almost equal. + """ + Check that left and right objects are almost equal. 
Parameters ---------- @@ -89,7 +90,6 @@ cpdef assert_almost_equal(a, b, Specify right object name being compared, internally used to show appropriate assertion message """ - cdef: int decimal double diff = 0.0 @@ -108,7 +108,7 @@ cpdef assert_almost_equal(a, b, return assert_dict_equal(a, b) if isinstance(a, str) or isinstance(b, str): - assert a == b, "%r != %r" % (a, b) + assert a == b, f"{a} != {b}" return True a_is_ndarray = isinstance(a, np.ndarray) @@ -123,24 +123,23 @@ cpdef assert_almost_equal(a, b, if isiterable(a): if not isiterable(b): - from pandas.util.testing import assert_class_equal + from pandas._testing import assert_class_equal # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - assert has_length(a) and has_length(b), ( - "Can't compare objects without length, one or both is invalid: " - "(%r, %r)" % (a, b)) + assert has_length(a) and has_length(b), ("Can't compare objects without " + "length, one or both is invalid: " + f"({a}, {b})") if a_is_ndarray and b_is_ndarray: na, nb = a.size, b.size if a.shape != b.shape: - from pandas.util.testing import raise_assert_detail + from pandas._testing import raise_assert_detail raise_assert_detail( - obj, '{0} shapes are different'.format(obj), - a.shape, b.shape) + obj, f'{obj} shapes are different', a.shape, b.shape) if check_dtype and not is_dtype_equal(a.dtype, b.dtype): - from pandas.util.testing import assert_attr_equal + from pandas._testing import assert_attr_equal assert_attr_equal('dtype', a, b, obj=obj) if array_equivalent(a, b, strict_nan=True): @@ -150,7 +149,7 @@ cpdef assert_almost_equal(a, b, na, nb = len(a), len(b) if na != nb: - from pandas.util.testing import raise_assert_detail + from pandas._testing import raise_assert_detail # if we have a small diff set, print it if abs(na - nb) < 10: @@ -158,10 +157,9 @@ cpdef assert_almost_equal(a, b, else: r = None - raise_assert_detail(obj, '{0} length are different'.format(obj), - na, nb, r) + raise_assert_detail(obj, f"{obj} length are different", na, nb, r) - for i in xrange(len(a)): + for i in range(len(a)): try: assert_almost_equal(a[i], b[i], check_less_precise=check_less_precise) @@ -170,25 +168,27 @@ cpdef assert_almost_equal(a, b, diff += 1 if is_unequal: - from pandas.util.testing import raise_assert_detail - msg = '{0} values are different ({1} %)'.format( - obj, np.round(diff * 100.0 / na, 5)) + from pandas._testing import raise_assert_detail + msg = (f"{obj} values are different " + f"({np.round(diff * 100.0 / na, 5)} %)") raise_assert_detail(obj, msg, lobj, robj) return True elif isiterable(b): - from pandas.util.testing import assert_class_equal + from pandas._testing import assert_class_equal # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - if a == b: - # object comparison - return True if isna(a) and isna(b): # TODO: Should require same-dtype NA? 
# nan / None comparison return True + + if a == b: + # object comparison + return True + if is_comparable_as_number(a) and is_comparable_as_number(b): if array_equivalent(a, b, strict_nan=True): # inf comparison @@ -206,12 +206,12 @@ cpdef assert_almost_equal(a, b, # case for zero if abs(fa) < 1e-5: if not decimal_almost_equal(fa, fb, decimal): - assert False, ('(very low values) expected %.5f but ' - 'got %.5f, with decimal %d' % (fb, fa, decimal)) + assert False, (f'(very low values) expected {fb:.5f} ' + f'but got {fa:.5f}, with decimal {decimal}') else: if not decimal_almost_equal(1, fb / fa, decimal): - assert False, ('expected %.5f but got %.5f, ' - 'with decimal %d' % (fb, fa, decimal)) + assert False, (f'expected {fb:.5f} but got {fa:.5f}, ' + f'with decimal {decimal}') return True - raise AssertionError("{0} != {1}".format(a, b)) + raise AssertionError(f"{a} != {b}") diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 0f1657480e4b3..53e3354ca8eb6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -120,8 +120,7 @@ def ints_to_pydatetime(const int64_t[:] arr, object tz=None, object freq=None, elif box == "datetime": func_create = create_datetime_from_ts else: - raise ValueError("box must be one of 'datetime', 'date', 'time' or" - " 'timestamp'") + raise ValueError("box must be one of 'datetime', 'date', 'time' or 'timestamp'") if is_utc(tz) or tz is None: for i in range(n): @@ -188,7 +187,7 @@ def ints_to_pydatetime(const int64_t[:] arr, object tz=None, object freq=None, return result -def _test_parse_iso8601(object ts): +def _test_parse_iso8601(ts: str): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used only for testing, actual construction uses `convert_str_to_tsobject` @@ -266,20 +265,16 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, elif basic_format: dt64_to_dtstruct(val, &dts) - res = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, - dts.month, - dts.day, - dts.hour, - dts.min, - dts.sec) + res = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} ' + f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}') if show_ns: ns = dts.ps // 1000 - res += '.%.9d' % (ns + 1000 * dts.us) + res += f'.{ns + dts.us * 1000:09d}' elif show_us: - res += '.%.6d' % dts.us + res += f'.{dts.us:06d}' elif show_ms: - res += '.%.3d' % (dts.us /1000) + res += f'.{dts.us // 1000:03d}' result[i] = res @@ -300,10 +295,15 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, return result -def array_with_unit_to_datetime(ndarray values, object unit, +def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, str errors='coerce'): """ - convert the ndarray according to the unit + Convert the ndarray to datetime according to the time unit. + + This function converts an array of objects into a numpy array of + datetime64[ns]. 
It returns the converted array + and also returns the timezone offset + if errors: - raise: return converted values or raise OutOfBoundsDatetime if out of range on the conversion or @@ -311,6 +311,18 @@ def array_with_unit_to_datetime(ndarray values, object unit, - ignore: return non-convertible values as the same unit - coerce: NaT for non-convertibles + Parameters + ---------- + values : ndarray of object + Date-like objects to convert + mask : ndarray of bool + Not-a-time mask for non-nullable integer types conversion, + can be None + unit : object + Time unit to use during conversion + errors : str, default 'raise' + Error behavior when parsing + Returns ------- result : ndarray of m8 values @@ -320,7 +332,6 @@ def array_with_unit_to_datetime(ndarray values, object unit, Py_ssize_t i, j, n=len(values) int64_t m ndarray[float64_t] fvalues - ndarray mask bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' bint is_raise = errors=='raise' @@ -333,9 +344,13 @@ def array_with_unit_to_datetime(ndarray values, object unit, if unit == 'ns': if issubclass(values.dtype.type, np.integer): - return values.astype('M8[ns]'), tz - # This will return a tz - return array_to_datetime(values.astype(object), errors=errors) + result = values.astype('M8[ns]') + else: + result, tz = array_to_datetime(values.astype(object), errors=errors) + if mask is not None: + iresult = result.view('i8') + iresult[mask] = NPY_NAT + return result, tz m = cast_from_unit(None, unit) @@ -347,7 +362,9 @@ def array_with_unit_to_datetime(ndarray values, object unit, if values.dtype.kind == "i": # Note: this condition makes the casting="same_kind" redundant iresult = values.astype('i8', casting='same_kind', copy=False) - mask = iresult == NPY_NAT + # If no mask, fill mask by comparing to NPY_NAT constant + if mask is None: + mask = iresult == NPY_NAT iresult[mask] = 0 fvalues = iresult.astype('f8') * m need_to_iterate = False @@ -357,8 +374,8 @@ def array_with_unit_to_datetime(ndarray values, object unit, if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): - raise OutOfBoundsDatetime("cannot convert input with unit " - "'{unit}'".format(unit=unit)) + raise OutOfBoundsDatetime(f"cannot convert input with unit " + f"'{unit}'") result = (iresult * m).astype('M8[ns]') iresult = result.view('i8') iresult[mask] = NPY_NAT @@ -384,8 +401,8 @@ def array_with_unit_to_datetime(ndarray values, object unit, except OverflowError: if is_raise: raise OutOfBoundsDatetime( - "cannot convert input {val} with the unit " - "'{unit}'".format(val=val, unit=unit)) + f"cannot convert input {val} with the unit " + f"'{unit}'") elif is_ignore: raise AssertionError iresult[i] = NPY_NAT @@ -400,16 +417,16 @@ def array_with_unit_to_datetime(ndarray values, object unit, except ValueError: if is_raise: raise ValueError( - "non convertible value {val} with the unit " - "'{unit}'".format(val=val, unit=unit)) + f"non convertible value {val} with the unit " + f"'{unit}'") elif is_ignore: raise AssertionError iresult[i] = NPY_NAT except OverflowError: if is_raise: raise OutOfBoundsDatetime( - "cannot convert input {val} with the unit " - "'{unit}'".format(val=val, unit=unit)) + f"cannot convert input {val} with the unit " + f"'{unit}'") elif is_ignore: raise AssertionError iresult[i] = NPY_NAT @@ -417,8 +434,8 @@ def array_with_unit_to_datetime(ndarray values, object unit, else: if is_raise: - raise ValueError("unit='{0}' not valid with non-numerical " - "val='{1}'".format(unit, val)) + raise ValueError(f"unit='{unit}' 
not valid with non-numerical " + f"val='{val}'") if is_ignore: raise AssertionError @@ -600,9 +617,8 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', iresult[i] = NPY_NAT continue elif is_raise: - raise ValueError("time data {val} doesn't " - "match format specified" - .format(val=val)) + raise ValueError(f"time data {val} doesn't " + f"match format specified") return values, tz_out try: @@ -657,8 +673,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', if is_coerce: iresult[i] = NPY_NAT else: - raise TypeError("{typ} is not convertible to datetime" - .format(typ=type(val))) + raise TypeError(f"{type(val)} is not convertible to datetime") except OutOfBoundsDatetime: if is_coerce: diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 032363d867196..6e6b809b9b5a6 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -51,15 +51,18 @@ class NullFrequencyError(ValueError): pass -def maybe_integer_op_deprecated(obj): - # GH#22535 add/sub of integers and int-arrays is deprecated - if obj.freq is not None: - warnings.warn("Addition/subtraction of integers and integer-arrays " - "to {cls} is deprecated, will be removed in a future " - "version. Instead of adding/subtracting `n`, use " - "`n * self.freq`" - .format(cls=type(obj).__name__), - FutureWarning) +def integer_op_not_supported(obj): + # GH#22535 add/sub of integers and int-arrays is no longer allowed + # Note we return rather than raise the exception so we can raise in + # the caller; mypy finds this more palatable. + cls = type(obj).__name__ + + int_addsub_msg = ( + f"Addition/subtraction of integers and integer-arrays with {cls} is " + "no longer supported. Instead of adding/subtracting `n`, " + "use `n * obj.freq`" + ) + return TypeError(int_addsub_msg) cdef class _Timestamp(datetime): @@ -87,7 +90,7 @@ cdef class _Timestamp(datetime): return PyObject_RichCompareBool(val, other, op) try: - ots = self.__class__(other) + ots = type(self)(other) except ValueError: return self._compare_outside_nanorange(other, op) else: @@ -96,7 +99,7 @@ cdef class _Timestamp(datetime): if ndim != -1: if ndim == 0: if is_datetime64_object(other): - other = self.__class__(other) + other = type(self)(other) elif is_array(other): # zero-dim array, occurs if try comparison with # datetime64 scalar on the left hand side @@ -105,7 +108,7 @@ cdef class _Timestamp(datetime): # the numpy C api to extract it. other = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), other) - other = self.__class__(other) + other = type(self)(other) else: return NotImplemented elif is_array(other): @@ -124,7 +127,7 @@ cdef class _Timestamp(datetime): # now __reduce_ex__ is defined and higher priority than __reduce__ return self.__reduce__() - def __repr__(self): + def __repr__(self) -> str: stamp = self._repr_base zone = None @@ -144,11 +147,10 @@ cdef class _Timestamp(datetime): # e.g. 
tzlocal has no `strftime` pass - tz = ", tz='{0}'".format(zone) if zone is not None else "" - freq = "" if self.freq is None else ", freq='{0}'".format(self.freqstr) + tz = f", tz='{zone}'" if zone is not None else "" + freq = "" if self.freq is None else f", freq='{self.freqstr}'" - return "Timestamp('{stamp}'{tz}{freq})".format(stamp=stamp, - tz=tz, freq=freq) + return f"Timestamp('{stamp}'{tz}{freq})" cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: @@ -201,7 +203,7 @@ cdef class _Timestamp(datetime): """ return np.datetime64(self.value, 'ns') - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.datetime64: """ Convert the Timestamp to a NumPy datetime64. @@ -227,20 +229,10 @@ cdef class _Timestamp(datetime): if is_timedelta64_object(other): other_int = other.astype('timedelta64[ns]').view('i8') - return self.__class__(self.value + other_int, - tz=self.tzinfo, freq=self.freq) + return type(self)(self.value + other_int, tz=self.tzinfo, freq=self.freq) elif is_integer_object(other): - maybe_integer_op_deprecated(self) - - if self is NaT: - # to be compat with Period - return NaT - elif self.freq is None: - raise NullFrequencyError( - "Cannot add integral value to Timestamp without freq.") - return self.__class__((self.freq * other).apply(self), - freq=self.freq) + raise integer_op_not_supported(self) elif PyDelta_Check(other) or hasattr(other, 'delta'): # delta --> offsets.Tick @@ -254,18 +246,12 @@ cdef class _Timestamp(datetime): other.seconds * 1000000 + other.microseconds) * 1000 - result = self.__class__(self.value + nanos, - tz=self.tzinfo, freq=self.freq) + result = type(self)(self.value + nanos, tz=self.tzinfo, freq=self.freq) return result elif is_array(other): if other.dtype.kind in ['i', 'u']: - maybe_integer_op_deprecated(self) - if self.freq is None: - raise NullFrequencyError( - "Cannot add integer-dtype array " - "to Timestamp without freq.") - return self.freq * other + self + raise integer_op_not_supported(self) # index/series like elif hasattr(other, '_typ'): @@ -273,7 +259,7 @@ cdef class _Timestamp(datetime): result = datetime.__add__(self, other) if PyDateTime_Check(result): - result = self.__class__(result) + result = type(self)(result) result.nanosecond = self.nanosecond return result @@ -287,12 +273,7 @@ cdef class _Timestamp(datetime): elif is_array(other): if other.dtype.kind in ['i', 'u']: - maybe_integer_op_deprecated(self) - if self.freq is None: - raise NullFrequencyError( - "Cannot subtract integer-dtype array " - "from Timestamp without freq.") - return self - self.freq * other + raise integer_op_not_supported(self) typ = getattr(other, '_typ', None) if typ is not None: @@ -305,9 +286,9 @@ cdef class _Timestamp(datetime): if (PyDateTime_Check(self) and (PyDateTime_Check(other) or is_datetime64_object(other))): if isinstance(self, _Timestamp): - other = self.__class__(other) + other = type(self)(other) else: - self = other.__class__(self) + self = type(other)(self) # validate tz's if not tz_compare(self.tzinfo, other.tzinfo): @@ -369,29 +350,28 @@ cdef class _Timestamp(datetime): return out[0] @property - def _repr_base(self): - return '{date} {time}'.format(date=self._date_repr, - time=self._time_repr) + def _repr_base(self) -> str: + return f"{self._date_repr} {self._time_repr}" @property - def _date_repr(self): + def _date_repr(self) -> str: # Ideal here would be self.strftime("%Y-%m-%d"), but # the datetime strftime() methods require year >= 1900 - return '%d-%.2d-%.2d' % 
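With maybe_integer_op_deprecated replaced by integer_op_not_supported in the __add__/__sub__ hunks above, adding or subtracting plain integers (or integer arrays) to a Timestamp is now an outright TypeError instead of a deprecated freq-based shift. A hedged sketch of what callers write instead, following the wording of the new error message ("use `n * obj.freq`"); the freq keyword reflects the Timestamp API as it existed at the time of this change:

import pandas as pd

ts = pd.Timestamp("2020-01-01", freq="D")

# ts + 1                  # previously meant "shift by one freq unit"; now raises TypeError
ts + 1 * ts.freq          # explicit, still-supported spelling -> Timestamp('2020-01-02 00:00:00')
ts + pd.Timedelta(days=1) # or sidestep freq entirely with a Timedelta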
(self.year, self.month, self.day) + return f'{self.year}-{self.month:02d}-{self.day:02d}' @property - def _time_repr(self): - result = '%.2d:%.2d:%.2d' % (self.hour, self.minute, self.second) + def _time_repr(self) -> str: + result = f'{self.hour:02d}:{self.minute:02d}:{self.second:02d}' if self.nanosecond != 0: - result += '.%.9d' % (self.nanosecond + 1000 * self.microsecond) + result += f'.{self.nanosecond + 1000 * self.microsecond:09d}' elif self.microsecond != 0: - result += '.%.6d' % self.microsecond + result += f'.{self.microsecond:06d}' return result @property - def _short_repr(self): + def _short_repr(self) -> str: # format a Timestamp with only _date_repr if possible # otherwise _repr_base if (self.hour == 0 and @@ -403,7 +383,7 @@ cdef class _Timestamp(datetime): return self._repr_base @property - def asm8(self): + def asm8(self) -> np.datetime64: """ Return numpy datetime64 format in nanoseconds. """ diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index a82d5e3b58e5e..0588dfe20e2e2 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -58,7 +58,8 @@ HOUR_SECONDS = 3600 @cython.wraparound(False) @cython.boundscheck(False) cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil: - """Return the number of days in the given month of the given year. + """ + Return the number of days in the given month of the given year. Parameters ---------- @@ -81,7 +82,8 @@ cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil: @cython.boundscheck(False) @cython.cdivision cdef int dayofweek(int y, int m, int d) nogil: - """Find the day of week for the date described by the Y/M/D triple y, m, d + """ + Find the day of week for the date described by the Y/M/D triple y, m, d using Sakamoto's method, from wikipedia. 0 represents Monday. See [1]_. @@ -117,7 +119,8 @@ cdef int dayofweek(int y, int m, int d) nogil: cdef bint is_leapyear(int64_t year) nogil: - """Returns 1 if the given year is a leap year, 0 otherwise. + """ + Returns 1 if the given year is a leap year, 0 otherwise. Parameters ---------- @@ -134,7 +137,8 @@ cdef bint is_leapyear(int64_t year) nogil: @cython.wraparound(False) @cython.boundscheck(False) cpdef int32_t get_week_of_year(int year, int month, int day) nogil: - """Return the ordinal week-of-year for the given day. + """ + Return the ordinal week-of-year for the given day. Parameters ---------- @@ -178,7 +182,8 @@ cpdef int32_t get_week_of_year(int year, int month, int day) nogil: @cython.wraparound(False) @cython.boundscheck(False) cpdef int32_t get_day_of_year(int year, int month, int day) nogil: - """Return the ordinal day-of-year for the given day. + """ + Return the ordinal day-of-year for the given day. Parameters ---------- @@ -207,8 +212,9 @@ cpdef int32_t get_day_of_year(int year, int month, int day) nogil: return day_of_year -def get_locale_names(name_type: object, locale: object=None): - """Returns an array of localized day or month names +def get_locale_names(name_type: str, locale: object = None): + """ + Returns an array of localized day or month names. 
Parameters ---------- diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 0b77948027ad7..36e6b14be182a 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from cpython.datetime cimport datetime, tzinfo +from cpython.datetime cimport datetime from numpy cimport int64_t, int32_t diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index bd74180403ad9..2988d7bae9a5e 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -197,7 +197,7 @@ def datetime_to_datetime64(object[:] values): iresult[i] = pydatetime_to_dt64(val, &dts) check_dts_bounds(&dts) else: - raise TypeError('Unrecognized value type: %s' % type(val)) + raise TypeError(f'Unrecognized value type: {type(val)}') return result, inferred_tz @@ -326,8 +326,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit, raise ValueError("Cannot convert Period to Timestamp " "unambiguously. Use to_timestamp") else: - raise TypeError('Cannot convert input [{}] of type {} to ' - 'Timestamp'.format(ts, type(ts))) + raise TypeError(f'Cannot convert input [{ts}] of type {type(ts)} to ' + f'Timestamp') if tz is not None: localize_tso(obj, tz) @@ -444,15 +444,15 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, bint dayfirst=False, bint yearfirst=False): """ - Convert a string-like (bytes or unicode) input `ts`, along with optional - timezone object `tz` to a _TSObject. + Convert a string input `ts`, along with optional timezone object`tz` + to a _TSObject. The optional arguments `dayfirst` and `yearfirst` are passed to the dateutil parser. Parameters ---------- - ts : bytes or unicode + ts : str Value to be converted to _TSObject tz : tzinfo or None timezone for the timezone-aware output @@ -686,7 +686,7 @@ def normalize_date(dt: object) -> datetime: elif PyDate_Check(dt): return datetime(dt.year, dt.month, dt.day) else: - raise TypeError('Unrecognized type: %s' % type(dt)) + raise TypeError(f'Unrecognized type: {type(dt)}') @cython.wraparound(False) diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 8f5c8d10776df..8bee7da6231ba 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -90,7 +90,7 @@ def build_field_sarray(const int64_t[:] dtindex): def get_date_name_field(const int64_t[:] dtindex, object field, object locale=None): """ Given a int64-based datetime index, return array of strings of date - name based on requested field (e.g. weekday_name) + name based on requested field (e.g. 
day_name) """ cdef: Py_ssize_t i, count = len(dtindex) @@ -100,7 +100,7 @@ def get_date_name_field(const int64_t[:] dtindex, object field, object locale=No out = np.empty(count, dtype=object) - if field == 'day_name' or field == 'weekday_name': + if field == 'day_name': if locale is None: names = np.array(DAYS_FULL, dtype=np.object_) else: @@ -130,7 +130,7 @@ def get_date_name_field(const int64_t[:] dtindex, object field, object locale=No out[i] = names[dts.month].capitalize() else: - raise ValueError("Field {field} not supported".format(field=field)) + raise ValueError(f"Field {field} not supported") return out @@ -165,8 +165,7 @@ def get_start_end_field(const int64_t[:] dtindex, object field, if freqstr: if freqstr == 'C': - raise ValueError("Custom business days is not supported by {field}" - .format(field=field)) + raise ValueError(f"Custom business days is not supported by {field}") is_business = freqstr[0] == 'B' # YearBegin(), BYearBegin() use month = starting month of year. @@ -373,7 +372,7 @@ def get_start_end_field(const int64_t[:] dtindex, object field, out[i] = 1 else: - raise ValueError("Field {field} not supported".format(field=field)) + raise ValueError(f"Field {field} not supported") return out.view(bool) @@ -537,7 +536,7 @@ def get_date_field(const int64_t[:] dtindex, object field): elif field == 'is_leap_year': return isleapyear_arr(get_date_field(dtindex, 'Y')) - raise ValueError("Field {field} not supported".format(field=field)) + raise ValueError(f"Field {field} not supported") @cython.wraparound(False) @@ -653,7 +652,7 @@ def get_timedelta_field(const int64_t[:] tdindex, object field): out[i] = tds.nanoseconds return out - raise ValueError("Field %s not supported" % field) + raise ValueError(f"Field {field} not supported") cpdef isleapyear_arr(ndarray years): diff --git a/pandas/_libs/tslibs/frequencies.pxd b/pandas/_libs/tslibs/frequencies.pxd index 4e7949e55c836..6ec67ce250505 100644 --- a/pandas/_libs/tslibs/frequencies.pxd +++ b/pandas/_libs/tslibs/frequencies.pxd @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -cpdef object get_rule_month(object source, object default=*) +cpdef str get_rule_month(object source, str default=*) cpdef get_freq_code(freqstr) cpdef object get_freq(object freq) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index b29c841896072..d60f5cfd3f8c1 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -197,7 +197,7 @@ cpdef _base_and_stride(str freqstr): groups = opattern.match(freqstr) if not groups: - raise ValueError("Could not evaluate {freq}".format(freq=freqstr)) + raise ValueError(f"Could not evaluate {freqstr}") stride = groups.group(1) @@ -485,18 +485,18 @@ cdef bint _is_weekly(str rule): # ---------------------------------------------------------------------- -cpdef object get_rule_month(object source, object default='DEC'): +cpdef str get_rule_month(object source, str default="DEC"): """ Return starting month of given freq, default is December. 
Parameters ---------- source : object - default : object (default "DEC") + default : str, default "DEC" Returns ------- - rule_month: object (usually string) + rule_month: str Examples -------- diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 0bd4b78d51e4e..67c0f0cc33ab8 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -5,6 +5,9 @@ from cpython.object cimport ( from cpython.datetime cimport (datetime, PyDateTime_Check, PyDelta_Check, PyDateTime_IMPORT) + +from cpython.version cimport PY_MINOR_VERSION + PyDateTime_IMPORT import numpy as np @@ -19,6 +22,7 @@ from pandas._libs.tslibs.util cimport ( get_nat, is_integer_object, is_float_object, is_datetime64_object, is_timedelta64_object) + # ---------------------------------------------------------------------- # Constants nat_strings = {'NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN'} @@ -95,10 +99,6 @@ cdef class _NaT(datetime): # higher than np.ndarray and np.matrix __array_priority__ = 100 - def __hash__(_NaT self): - # py3k needs this defined here - return hash(self.value) - def __richcmp__(_NaT self, object other, int op): cdef: int ndim = getattr(other, 'ndim', -1) @@ -115,8 +115,8 @@ cdef class _NaT(datetime): if is_datetime64_object(other): return _nat_scalar_rules[op] else: - raise TypeError('Cannot compare type %r with type %r' % - (type(self).__name__, type(other).__name__)) + raise TypeError(f'Cannot compare type {type(self).__name__} ' + f'with type {type(other).__name__}') # Note: instead of passing "other, self, _reverse_ops[op]", we observe # that `_nat_scalar_rules` is invariant under `_reverse_ops`, @@ -150,8 +150,7 @@ cdef class _NaT(datetime): result = np.empty(other.shape, dtype="datetime64[ns]") result.fill("NaT") return result - raise TypeError("Cannot add NaT to ndarray with dtype {dtype}" - .format(dtype=other.dtype)) + raise TypeError(f"Cannot add NaT to ndarray with dtype {other.dtype}") return NotImplemented @@ -203,9 +202,8 @@ cdef class _NaT(datetime): result.fill("NaT") return result - raise TypeError( - "Cannot subtract NaT from ndarray with dtype {dtype}" - .format(dtype=other.dtype)) + raise TypeError(f"Cannot subtract NaT from ndarray with " + f"dtype {other.dtype}") return NotImplemented @@ -230,16 +228,16 @@ cdef class _NaT(datetime): return NotImplemented @property - def asm8(self): + def asm8(self) -> np.datetime64: return np.datetime64(NPY_NAT, 'ns') - def to_datetime64(self): + def to_datetime64(self) -> np.datetime64: """ Return a numpy.datetime64 object with 'ns' precision. """ return np.datetime64('NaT', 'ns') - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.datetime64: """ Convert the Timestamp to a NumPy datetime64. @@ -259,13 +257,13 @@ cdef class _NaT(datetime): """ return self.to_datetime64() - def __repr__(self): + def __repr__(self) -> str: return 'NaT' - def __str__(self): + def __str__(self) -> str: return 'NaT' - def isoformat(self, sep='T'): + def isoformat(self, sep='T') -> str: # This allows Timestamp(ts.isoformat()) to always correctly roundtrip. 
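The _NaT changes in this hunk keep the numpy interop explicit: asm8 and to_datetime64 hand back a nanosecond numpy.datetime64('NaT'), and isoformat() returns the literal string 'NaT' so that Timestamp(ts.isoformat()) round-trips. A small usage sketch, describing the behaviour as implied by the hunk rather than verified against every pandas version:

import pandas as pd

pd.NaT.to_datetime64()                       # numpy.datetime64('NaT'), 'ns' precision
pd.NaT.asm8                                  # same value, exposed as a property
pd.NaT.isoformat()                           # 'NaT'
pd.Timestamp(pd.NaT.isoformat()) is pd.NaT   # True – the isoformat() output round-trips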
return 'NaT' @@ -370,7 +368,6 @@ class NaTType(_NaT): days_in_month = property(fget=lambda self: np.nan) daysinmonth = property(fget=lambda self: np.nan) dayofweek = property(fget=lambda self: np.nan) - weekday_name = property(fget=lambda self: np.nan) # inject Timedelta properties days = property(fget=lambda self: np.nan) @@ -434,6 +431,10 @@ class NaTType(_NaT): tzname = _make_error_func('tzname', datetime) utcoffset = _make_error_func('utcoffset', datetime) + # "fromisocalendar" was introduced in 3.8 + if PY_MINOR_VERSION >= 8: + fromisocalendar = _make_error_func('fromisocalendar', datetime) + # ---------------------------------------------------------------------- # The remaining methods have docstrings copy/pasted from the analogous # Timestamp methods. @@ -464,7 +465,7 @@ class NaTType(_NaT): """ Timestamp.combine(date, time) - date, time -> datetime with same date and time fields + date, time -> datetime with same date and time fields. """ ) utcnow = _make_error_func('utcnow', # noqa:E128 @@ -503,8 +504,8 @@ class NaTType(_NaT): """ Timestamp.fromordinal(ordinal, freq=None, tz=None) - passed an ordinal, translate and convert to a ts - note: by definition there cannot be any tz info on the ordinal itself + Passed an ordinal, translate and convert to a ts. + Note: by definition there cannot be any tz info on the ordinal itself. Parameters ---------- @@ -727,18 +728,6 @@ default 'raise' nonexistent times. .. versionadded:: 0.24.0 - errors : 'raise', 'coerce', default None - Determine how errors should be handled. - - The behavior is as follows: - - * 'raise' will raise a NonExistentTimeError if a timestamp is not - valid in the specified timezone (e.g. due to a transition from - or to DST time). Use ``nonexistent='raise'`` instead. - * 'coerce' will return NaT if the timestamp can not be converted - into the specified timezone. Use ``nonexistent='NaT'`` instead. - - .. deprecated:: 0.24.0 Returns ------- diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 020bcdf0a7b15..ebedee79405e5 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -72,6 +72,6 @@ cdef npy_datetime get_datetime64_value(object obj) nogil cdef npy_timedelta get_timedelta64_value(object obj) nogil cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil -cdef int _string_to_dts(object val, npy_datetimestruct* dts, +cdef int _string_to_dts(str val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset, bint want_exc) except? 
-1 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index e76f84265a327..b59a1101e0bf7 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -112,11 +112,9 @@ cdef inline check_dts_bounds(npy_datetimestruct *dts): error = True if error: - fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, - dts.day, dts.hour, - dts.min, dts.sec) - raise OutOfBoundsDatetime( - 'Out of bounds nanosecond timestamp: {fmt}'.format(fmt=fmt)) + fmt = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} ' + f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}') + raise OutOfBoundsDatetime(f'Out of bounds nanosecond timestamp: {fmt}') # ---------------------------------------------------------------------- @@ -169,7 +167,7 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts): return dtstruct_to_dt64(dts) -cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, +cdef inline int _string_to_dts(str val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset, bint want_exc) except? -1: cdef: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index aaefab6ee7ff6..f24dce28cd5f7 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1,6 +1,7 @@ import cython import time +from typing import Any from cpython.datetime cimport (PyDateTime_IMPORT, PyDateTime_Check, PyDelta_Check, @@ -66,16 +67,16 @@ need_suffix = ['QS', 'BQ', 'BQS', 'YS', 'AS', 'BY', 'BA', 'BYS', 'BAS'] for __prefix in need_suffix: for _m in MONTHS: - key = '%s-%s' % (__prefix, _m) + key = f'{__prefix}-{_m}' _offset_to_period_map[key] = _offset_to_period_map[__prefix] for __prefix in ['A', 'Q']: for _m in MONTHS: - _alias = '%s-%s' % (__prefix, _m) + _alias = f'{__prefix}-{_m}' _offset_to_period_map[_alias] = _alias for _d in DAYS: - _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d + _offset_to_period_map[f'W-{_d}'] = f'W-{_d}' # --------------------------------------------------------------------- @@ -328,7 +329,7 @@ class _BaseOffset: def __setattr__(self, name, value): raise AttributeError("DateOffset objects are immutable.") - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if isinstance(other, str): try: # GH#23524 if to_offset fails, we are dealing with an @@ -363,7 +364,7 @@ class _BaseOffset: attrs = [(k, v) for k, v in all_paras.items() if (k not in exclude) and (k[0] != '_')] attrs = sorted(set(attrs)) - params = tuple([str(self.__class__)] + attrs) + params = tuple([str(type(self))] + attrs) return params @property @@ -422,7 +423,7 @@ class _BaseOffset: # that allows us to use methods that can go in a `cdef class` return self * 1 - def __repr__(self): + def __repr__(self) -> str: className = getattr(self, '_outputName', type(self).__name__) if abs(self.n) != 1: @@ -432,9 +433,9 @@ class _BaseOffset: n_str = "" if self.n != 1: - n_str = "%s * " % self.n + n_str = f"{self.n} * " - out = '<%s' % n_str + className + plural + self._repr_attrs() + '>' + out = f'<{n_str}{className}{plural}{self._repr_attrs()}>' return out def _get_offset_day(self, datetime other): @@ -460,16 +461,13 @@ class _BaseOffset: ValueError if n != int(n) """ if util.is_timedelta64_object(n): - raise TypeError('`n` argument must be an integer, ' - 'got {ntype}'.format(ntype=type(n))) + raise TypeError(f'`n` argument must be an integer, got {type(n)}') try: nint = int(n) except (ValueError, TypeError): - raise TypeError('`n` argument must be an integer, ' - 'got {ntype}'.format(ntype=type(n))) + raise 
TypeError(f'`n` argument must be an integer, got {type(n)}') if n != nint: - raise ValueError('`n` argument must be an integer, ' - 'got {n}'.format(n=n)) + raise ValueError(f'`n` argument must be an integer, got {n}') return nint def __setstate__(self, state): @@ -935,7 +933,7 @@ def shift_month(stamp: datetime, months: int, cpdef int get_day_of_month(datetime other, day_opt) except? -1: """ - Find the day in `other`'s month that satisfies a DateOffset's onOffset + Find the day in `other`'s month that satisfies a DateOffset's is_on_offset policy, as described by the `day_opt` argument. Parameters diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 8fe724fa2f6f7..3705b0a41fe55 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -3,7 +3,6 @@ Parsing functions for datetime and datetime-like strings. """ import re import time -from io import StringIO from libc.string cimport strchr @@ -11,9 +10,8 @@ import cython from cython import Py_ssize_t from cpython.object cimport PyObject_Str -from cpython.unicode cimport PyUnicode_Join -from cpython.datetime cimport datetime, datetime_new, import_datetime +from cpython.datetime cimport datetime, datetime_new, import_datetime, tzinfo from cpython.version cimport PY_VERSION_HEX import_datetime() @@ -37,6 +35,7 @@ from pandas._config import get_option from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS from pandas._libs.tslibs.nattype import nat_strings, NaT from pandas._libs.tslibs.util cimport is_array, get_c_string_buf_and_size +from pandas._libs.tslibs.frequencies cimport get_rule_month cdef extern from "../src/headers/portable.h": int getdigit_ascii(char c, int default) nogil @@ -86,16 +85,15 @@ cdef inline int _parse_4digit(const char* s): return result -cdef inline object _parse_delimited_date(object date_string, bint dayfirst): +cdef inline object _parse_delimited_date(str date_string, bint dayfirst): """ Parse special cases of dates: MM/DD/YYYY, DD/MM/YYYY, MM/YYYY. + At the beginning function tries to parse date in MM/DD/YYYY format, but if month > 12 - in DD/MM/YYYY (`dayfirst == False`). With `dayfirst == True` function makes an attempt to parse date in DD/MM/YYYY, if an attempt is wrong - in DD/MM/YYYY - Note - ---- For MM/DD/YYYY, DD/MM/YYYY: delimiter can be a space or one of /-. For MM/YYYY: delimiter can be a space or one of /- If `date_string` can't be converted to date, then function returns @@ -104,11 +102,13 @@ cdef inline object _parse_delimited_date(object date_string, bint dayfirst): Parameters ---------- date_string : str - dayfirst : bint + dayfirst : bool Returns: -------- - datetime, resolution + datetime or None + str or None + Describing resolution of the parsed string. """ cdef: const char* buf @@ -153,21 +153,22 @@ cdef inline object _parse_delimited_date(object date_string, bint dayfirst): return datetime_new(year, month, day, 0, 0, 0, 0, None), reso return datetime(year, month, day, 0, 0, 0, 0, None), reso - raise DateParseError("Invalid date specified ({}/{})".format(month, day)) + raise DateParseError(f"Invalid date specified ({month}/{day})") -cdef inline bint does_string_look_like_time(object parse_string): +cdef inline bint does_string_look_like_time(str parse_string): """ Checks whether given string is a time: it has to start either from H:MM or from HH:MM, and hour and minute values must be valid. 
Parameters ---------- - date_string : str + parse_string : str Returns: -------- - whether given string is a time + bool + Whether given string is potentially a time. """ cdef: const char* buf @@ -188,9 +189,10 @@ cdef inline bint does_string_look_like_time(object parse_string): return 0 <= hour <= 23 and 0 <= minute <= 59 -def parse_datetime_string(date_string, freq=None, dayfirst=False, +def parse_datetime_string(date_string: str, freq=None, dayfirst=False, yearfirst=False, **kwargs): - """parse datetime string, only returns datetime. + """ + Parse datetime string, only returns datetime. Also cares special handling matching time patterns. Returns @@ -270,16 +272,17 @@ def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None): return res -cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, +cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False, yearfirst=False): - """parse datetime string, only returns datetime + """ + Parse datetime string and try to identify its resolution. Returns ------- - parsed : datetime - parsed2 : datetime/dateutil.parser._result - reso : str - inferred resolution + datetime + datetime/dateutil.parser._result + str + Inferred resolution of the parsed string. Raises ------ @@ -311,22 +314,23 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, # TODO: allow raise of errors within instead raise DateParseError(err) if parsed is None: - raise DateParseError("Could not parse {dstr}".format(dstr=date_string)) + raise DateParseError(f"Could not parse {date_string}") return parsed, parsed, reso -cpdef bint _does_string_look_like_datetime(object py_string): +cpdef bint _does_string_look_like_datetime(str py_string): """ Checks whether given string is a datetime: it has to start with '0' or be greater than 1000. Parameters ---------- - py_string: object + py_string: str Returns ------- - whether given string is a datetime + bool + Whether given string is potentially a datetime. """ cdef: const char *buf @@ -370,9 +374,6 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 assert isinstance(date_string, str) - # len(date_string) == 0 - # should be NaT??? 
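The quarter-string branch of _parse_dateabbr_string (continued just below) now leans on the shared get_rule_month from tslibs.frequencies instead of the removed private copy in parsing.pyx. For orientation, the helper simply extracts the anchor month of a frequency string; the examples are taken from its docstring, and the import path is the one used at this point in the tree:

from pandas._libs.tslibs.frequencies import get_rule_month

get_rule_month("D")       # 'DEC' – default when the freq carries no month anchor
get_rule_month("A-JAN")   # 'JAN'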
- if date_string in nat_strings: return NaT, NaT, '' @@ -420,18 +421,18 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, raise ValueError if not (1 <= quarter <= 4): - msg = ('Incorrect quarterly string is given, quarter must be ' - 'between 1 and 4: {dstr}') - raise DateParseError(msg.format(dstr=date_string)) + raise DateParseError(f'Incorrect quarterly string is given, ' + f'quarter must be ' + f'between 1 and 4: {date_string}') if freq is not None: # hack attack, #1228 try: - mnum = MONTH_NUMBERS[_get_rule_month(freq)] + 1 + mnum = MONTH_NUMBERS[get_rule_month(freq)] + 1 except (KeyError, ValueError): - msg = ('Unable to retrieve month information from given ' - 'freq: {freq}'.format(freq=freq)) - raise DateParseError(msg) + raise DateParseError(f'Unable to retrieve month ' + f'information from given ' + f'freq: {freq}') month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: @@ -464,28 +465,22 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, except ValueError: pass - raise ValueError('Unable to parse {0}'.format(date_string)) + raise ValueError(f'Unable to parse {date_string}') -cdef dateutil_parse(object timestr, object default, ignoretz=False, +cdef dateutil_parse(str timestr, object default, ignoretz=False, tzinfos=None, dayfirst=None, yearfirst=None): """ lifted from dateutil to get resolution""" cdef: - object fobj, res, attr, ret, tzdata + object res, attr, ret, tzdata object reso = None dict repl = {} - fobj = StringIO(str(timestr)) - res = DEFAULTPARSER._parse(fobj, dayfirst=dayfirst, yearfirst=yearfirst) - - # dateutil 2.2 compat - if isinstance(res, tuple): # PyTuple_Check - res, _ = res + res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst) if res is None: - msg = "Unknown datetime string format, unable to parse: {timestr}" - raise ValueError(msg.format(timestr=timestr)) + raise ValueError(f"Unknown datetime string format, unable to parse: {timestr}") for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]: @@ -495,8 +490,7 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False, reso = attr if reso is None: - msg = "Unable to parse datetime string: {timestr}" - raise ValueError(msg.format(timestr=timestr)) + raise ValueError(f"Unable to parse datetime string: {timestr}") if reso == 'microsecond': if repl['microsecond'] == 0: @@ -509,20 +503,22 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False, ret = ret + relativedelta.relativedelta(weekday=res.weekday) if not ignoretz: if callable(tzinfos) or tzinfos and res.tzname in tzinfos: + # Note: as of 1.0 this is not reached because + # we never pass tzinfos, see GH#22234 if callable(tzinfos): tzdata = tzinfos(res.tzname, res.tzoffset) else: tzdata = tzinfos.get(res.tzname) - if isinstance(tzdata, datetime.tzinfo): - tzinfo = tzdata + if isinstance(tzdata, tzinfo): + new_tzinfo = tzdata elif isinstance(tzdata, str): - tzinfo = _dateutil_tzstr(tzdata) + new_tzinfo = _dateutil_tzstr(tzdata) elif isinstance(tzdata, int): - tzinfo = tzoffset(res.tzname, tzdata) + new_tzinfo = tzoffset(res.tzname, tzdata) else: raise ValueError("offset must be tzinfo subclass, " "tz string, or int offset") - ret = ret.replace(tzinfo=tzinfo) + ret = ret.replace(tzinfo=new_tzinfo) elif res.tzname and res.tzname in time.tzname: ret = ret.replace(tzinfo=_dateutil_tzlocal()) elif res.tzoffset == 0: @@ -532,27 +528,6 @@ cdef dateutil_parse(object timestr, object default, ignoretz=False, return ret, reso 
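dateutil_parse above now calls DEFAULTPARSER._parse on the string directly (the StringIO wrapper and the dateutil 2.2 tuple-compat shim are gone) and still infers resolution by checking which components dateutil actually populated. A minimal Python sketch of that resolution scan, using dateutil's private _parse the same way the Cython code does:

from dateutil import parser as du_parser

res, _ = du_parser.parser()._parse("2019-06", dayfirst=False, yearfirst=False)

reso = None
for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]:
    if getattr(res, attr, None) is not None:
        reso = attr          # the last populated component wins

# reso == 'month': only year and month were given, so that is the resolution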
-cdef object _get_rule_month(object source, object default='DEC'): - """ - Return starting month of given freq, default is December. - - Example - ------- - >>> _get_rule_month('D') - 'DEC' - - >>> _get_rule_month('A-JAN') - 'JAN' - """ - if hasattr(source, 'freqstr'): - source = source.freqstr - source = source.upper() - if '-' not in source: - return default - else: - return source.split('-')[1] - - # ---------------------------------------------------------------------- # Parsing for type-inference @@ -710,7 +685,7 @@ class _timelex: elif getattr(instream, 'read', None) is None: raise TypeError( 'Parser must be a string or character stream, not ' - '{itype}'.format(itype=instream.__class__.__name__)) + f'{type(instream).__name__}') else: self.stream = instream.read() @@ -941,14 +916,14 @@ def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): Parameters ---------- - date_cols : tuple of numpy arrays + date_cols : tuple[ndarray] keep_trivial_numbers : bool, default True if True and len(date_cols) == 1, then conversion (to string from integer/float zero) is not performed Returns ------- - arr_of_rows : ndarray (dtype=object) + arr_of_rows : ndarray[object] Examples -------- @@ -1006,6 +981,6 @@ def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) list_to_join[col_idx] = convert_to_unicode(item, False) PyArray_ITER_NEXT(it) - result_view[row_idx] = PyUnicode_Join(' ', list_to_join) + result_view[row_idx] = " ".join(list_to_join) return result diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index aed64aff14e0a..bd57e75c72f19 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1191,12 +1191,15 @@ cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? 
-1: return dtstruct_to_dt64(&dts) -def period_format(int64_t value, int freq, object fmt=None): +cdef str period_format(int64_t value, int freq, object fmt=None): cdef: int freq_group if value == NPY_NAT: - return repr(NaT) + return "NaT" + + if isinstance(fmt, str): + fmt = fmt.encode("utf-8") if fmt is None: freq_group = get_freq_group(freq) @@ -1209,8 +1212,7 @@ def period_format(int64_t value, int freq, object fmt=None): elif freq_group == 4000: # WK left = period_asfreq(value, freq, 6000, 0) right = period_asfreq(value, freq, 6000, 1) - return '%s/%s' % (period_format(left, 6000), - period_format(right, 6000)) + return f"{period_format(left, 6000)}/{period_format(right, 6000)}" elif (freq_group == 5000 # BUS or freq_group == 6000): # DAY fmt = b'%Y-%m-%d' @@ -1227,7 +1229,7 @@ def period_format(int64_t value, int freq, object fmt=None): elif freq_group == 12000: # NANOSEC fmt = b'%Y-%m-%d %H:%M:%S.%n' else: - raise ValueError('Unknown freq: {freq}'.format(freq=freq)) + raise ValueError(f"Unknown freq: {freq}") return _period_strftime(value, freq, fmt) @@ -1242,24 +1244,22 @@ cdef list extra_fmts = [(b"%q", b"^`AB`^"), cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^", "^`GH`^", "^`IJ`^", "^`KL`^"] -cdef object _period_strftime(int64_t value, int freq, object fmt): +cdef str _period_strftime(int64_t value, int freq, bytes fmt): cdef: Py_ssize_t i npy_datetimestruct dts char *formatted - object pat, repl, result + bytes pat, brepl list found_pat = [False] * len(extra_fmts) int year, quarter - - if isinstance(fmt, unicode): - fmt = fmt.encode('utf-8') + str result, repl get_date_info(value, freq, &dts) for i in range(len(extra_fmts)): pat = extra_fmts[i][0] - repl = extra_fmts[i][1] + brepl = extra_fmts[i][1] if pat in fmt: - fmt = fmt.replace(pat, repl) + fmt = fmt.replace(pat, brepl) found_pat[i] = True formatted = c_strftime(&dts, fmt) @@ -1273,17 +1273,17 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): raise ValueError('Unable to get quarter and year') if i == 0: - repl = '%d' % quarter + repl = str(quarter) elif i == 1: # %f, 2-digit year - repl = '%.2d' % (year % 100) + repl = f"{(year % 100):02d}" elif i == 2: - repl = '%d' % year + repl = str(year) elif i == 3: - repl = '%03d' % (value % 1000) + repl = f"{(value % 1_000):03d}" elif i == 4: - repl = '%06d' % (value % 1000000) + repl = f"{(value % 1_000_000):06d}" elif i == 5: - repl = '%09d' % (value % 1000000000) + repl = f"{(value % 1_000_000_000):09d}" result = result.replace(str_extra_fmts[i], repl) @@ -1391,7 +1391,7 @@ def get_period_field_arr(int code, int64_t[:] arr, int freq): func = _get_accessor_func(code) if func is NULL: - raise ValueError('Unrecognized period code: {code}'.format(code=code)) + raise ValueError(f"Unrecognized period code: {code}") sz = len(arr) out = np.empty(sz, dtype=np.int64) @@ -1578,9 +1578,8 @@ cdef class _Period: freq = to_offset(freq) if freq.n <= 0: - raise ValueError('Frequency must be positive, because it' - ' represents span: {freqstr}' - .format(freqstr=freq.freqstr)) + raise ValueError("Frequency must be positive, because it " + f"represents span: {freq.freqstr}") return freq @@ -1614,9 +1613,8 @@ cdef class _Period: return NotImplemented elif op == Py_NE: return NotImplemented - raise TypeError('Cannot compare type {cls} with type {typ}' - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError(f"Cannot compare type {type(self).__name__} " + f"with type {type(other).__name__}") def __hash__(self): return hash((self.ordinal, 
self.freqstr)) @@ -1634,8 +1632,8 @@ cdef class _Period: if nanos % offset_nanos == 0: ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) - msg = 'Input cannot be converted to Period(freq={0})' - raise IncompatibleFrequency(msg.format(self.freqstr)) + raise IncompatibleFrequency("Input cannot be converted to " + f"Period(freq={self.freqstr})") elif util.is_offset_object(other): freqstr = other.rule_code base = get_base_alias(freqstr) @@ -1665,9 +1663,8 @@ cdef class _Period: # GH#17983 sname = type(self).__name__ oname = type(other).__name__ - raise TypeError("unsupported operand type(s) for +: '{self}' " - "and '{other}'".format(self=sname, - other=oname)) + raise TypeError(f"unsupported operand type(s) for +: '{sname}' " + f"and '{oname}'") else: # pragma: no cover return NotImplemented elif is_period_object(other): @@ -2204,7 +2201,7 @@ cdef class _Period: return self.days_in_month @property - def is_leap_year(self): + def is_leap_year(self) -> bool: return bool(is_leapyear(self.year)) @classmethod @@ -2215,18 +2212,18 @@ cdef class _Period: def freqstr(self): return self.freq.freqstr - def __repr__(self): + def __repr__(self) -> str: base, mult = get_freq_code(self.freq) formatted = period_format(self.ordinal, base) - return "Period('%s', '%s')" % (formatted, self.freqstr) + return f"Period('{formatted}', '{self.freqstr}')" - def __str__(self): + def __str__(self) -> str: """ Return a string representation for a particular DataFrame """ base, mult = get_freq_code(self.freq) formatted = period_format(self.ordinal, base) - value = ("%s" % formatted) + value = str(formatted) return value def __setstate__(self, state): @@ -2237,14 +2234,14 @@ cdef class _Period: object_state = None, self.freq, self.ordinal return (Period, object_state) - def strftime(self, fmt): + def strftime(self, fmt: str) -> str: """ Returns the string representation of the :class:`Period`, depending on the selected ``fmt``. ``fmt`` must be a string containing one or several directives. The method recognizes the same directives as the :func:`time.strftime` function of the standard Python distribution, as well as the specific additional directives ``%f``, - ``%F``, ``%q``. (formatting & docs originally from scikits.timeries) + ``%F``, ``%q``. (formatting & docs originally from scikits.timeries). 
+-----------+--------------------------------+-------+ | Directive | Meaning | Notes | @@ -2477,9 +2474,8 @@ class Period(_Period): try: freq = Resolution.get_freq(reso) except KeyError: - raise ValueError( - "Invalid frequency or could not infer: {reso}" - .format(reso=reso)) + raise ValueError(f"Invalid frequency or could not " + f"infer: {reso}") elif PyDateTime_Check(value): dt = value diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index fbda5f178e164..5508b208de00a 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -106,11 +106,11 @@ def array_strptime(object[:] values, object fmt, if bad_directive == "\\": bad_directive = "%" del err - raise ValueError("'%s' is a bad directive in format '%s'" % - (bad_directive, fmt)) + raise ValueError(f"'{bad_directive}' is a bad directive " + f"in format '{fmt}'") # IndexError only occurs when the format string is "%" except IndexError: - raise ValueError("stray %% in format '%s'" % fmt) + raise ValueError(f"stray % in format '{fmt}'") _regex_cache[fmt] = format_regex result = np.empty(n, dtype='M8[ns]') @@ -139,14 +139,13 @@ def array_strptime(object[:] values, object fmt, if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match " - "format %r (match)" % (val, fmt)) + raise ValueError(f"time data '{val}' does not match " + f"format '{fmt}' (match)") if len(val) != found.end(): if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("unconverted data remains: %s" % - val[found.end():]) + raise ValueError(f"unconverted data remains: {val[found.end():]}") # search else: @@ -155,8 +154,8 @@ def array_strptime(object[:] values, object fmt, if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match format " - "%r (search)" % (val, fmt)) + raise ValueError(f"time data {repr(val)} does not match format " + f"{repr(fmt)} (search)") iso_year = -1 year = 1900 @@ -279,8 +278,8 @@ def array_strptime(object[:] values, object fmt, "the ISO year directive '%G' and a weekday " "directive '%A', '%a', '%w', or '%u'.") else: - raise ValueError("ISO week directive '%V' is incompatible with" - " the year directive '%Y'. Use the ISO year " + raise ValueError("ISO week directive '%V' is incompatible with " + "the year directive '%Y'. 
Use the ISO year " "'%G' instead.") # If we know the wk of the year and what day of that wk, we can figure @@ -589,8 +588,8 @@ class TimeRE(dict): else: return '' regex = '|'.join(re.escape(stuff) for stuff in to_convert) - regex = '(?P<%s>%s' % (directive, regex) - return '%s)' % regex + regex = f"(?P<{directive}>{regex})" + return regex def pattern(self, format): """ @@ -609,11 +608,11 @@ class TimeRE(dict): format = whitespace_replacement.sub(r'\\s+', format) while '%' in format: directive_index = format.index('%') +1 - processed_format = "%s%s%s" % (processed_format, - format[:directive_index -1], - self[format[directive_index]]) + processed_format = (f"{processed_format}" + f"{format[:directive_index -1]}" + f"{self[format[directive_index]]}") format = format[directive_index +1:] - return "%s%s" % (processed_format, format) + return f"{processed_format}{format}" def compile(self, format): """Return a compiled re object for the format string.""" @@ -737,8 +736,7 @@ cdef parse_timezone_directive(str z): z = z[:3] + z[4:] if len(z) > 5: if z[5] != ':': - msg = "Inconsistent use of : in {0}" - raise ValueError(msg.format(z)) + raise ValueError(f"Inconsistent use of : in {z}") z = z[:5] + z[6:] hours = int(z[1:3]) minutes = int(z[3:5]) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8435f1cd7d732..0a773b8a215ed 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,6 +1,5 @@ import collections import textwrap -import warnings import cython @@ -170,7 +169,7 @@ cdef convert_to_timedelta64(object ts, object unit): if ts.astype('int64') == NPY_NAT: return np.timedelta64(NPY_NAT) elif is_timedelta64_object(ts): - ts = ts.astype("m8[{unit}]".format(unit=unit.lower())) + ts = ts.astype(f"m8[{unit.lower()}]") elif is_integer_object(ts): if ts == NPY_NAT: return np.timedelta64(NPY_NAT) @@ -198,8 +197,7 @@ cdef convert_to_timedelta64(object ts, object unit): if PyDelta_Check(ts): ts = np.timedelta64(delta_to_nanoseconds(ts), 'ns') elif not is_timedelta64_object(ts): - raise ValueError("Invalid type for timedelta " - "scalar: {ts_type}".format(ts_type=type(ts))) + raise ValueError(f"Invalid type for timedelta scalar: {type(ts)}") return ts.astype('timedelta64[ns]') @@ -288,7 +286,7 @@ cpdef inline object precision_from_unit(object unit): m = 1L p = 0 else: - raise ValueError("cannot cast unit {unit}".format(unit=unit)) + raise ValueError(f"cannot cast unit {unit}") return m, p @@ -397,8 +395,7 @@ cdef inline int64_t parse_timedelta_string(str ts) except? -1: result += timedelta_as_neg(r, neg) have_hhmmss = 1 else: - raise ValueError("expecting hh:mm:ss format, " - "received: {ts}".format(ts=ts)) + raise ValueError(f"expecting hh:mm:ss format, received: {ts}") unit, number = [], [] @@ -511,7 +508,7 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): unit = 'm' unit = parse_timedelta_unit(unit) except KeyError: - raise ValueError("invalid abbreviation: {unit}".format(unit=unit)) + raise ValueError(f"invalid abbreviation: {unit}") n = ''.join(number) + '.' 
+ ''.join(frac) return cast_from_unit(float(n), unit) @@ -530,8 +527,7 @@ cpdef inline object parse_timedelta_unit(object unit): try: return timedelta_abbrevs[unit.lower()] except (KeyError, AttributeError): - raise ValueError("invalid unit abbreviation: {unit}" - .format(unit=unit)) + raise ValueError(f"invalid unit abbreviation: {unit}") # ---------------------------------------------------------------------- # Timedelta ops utilities @@ -727,8 +723,7 @@ cdef _to_py_int_float(v): return int(v) elif is_float_object(v): return float(v) - raise TypeError("Invalid type {typ}. Must be int or " - "float.".format(typ=type(v))) + raise TypeError(f"Invalid type {type(v)}. Must be int or float.") # Similar to Timestamp/datetime, this is a construction requirement for @@ -773,10 +768,9 @@ cdef class _Timedelta(timedelta): elif op == Py_NE: return True # only allow ==, != ops - raise TypeError('Cannot compare type {cls} with ' - 'type {other}' - .format(cls=type(self).__name__, - other=type(other).__name__)) + raise TypeError(f'Cannot compare type ' + f'{type(self).__name__} with ' + f'type {type(other).__name__}') if util.is_array(other): return PyObject_RichCompare(np.array([self]), other, op) return PyObject_RichCompare(other, self, reverse_ops[op]) @@ -787,10 +781,8 @@ cdef class _Timedelta(timedelta): return False elif op == Py_NE: return True - raise TypeError('Cannot compare type {cls} with ' - 'type {other}' - .format(cls=type(self).__name__, - other=type(other).__name__)) + raise TypeError(f'Cannot compare type {type(self).__name__} with ' + f'type {type(other).__name__}') return cmp_scalar(self.value, ots.value, op) @@ -841,15 +833,15 @@ cdef class _Timedelta(timedelta): """ return timedelta(microseconds=int(self.value) / 1000) - def to_timedelta64(self): + def to_timedelta64(self) -> np.timedelta64: """ Return a numpy.timedelta64 object with 'ns' precision. """ return np.timedelta64(self.value, 'ns') - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.timedelta64: """ - Convert the Timestamp to a NumPy timedelta64. + Convert the Timedelta to a NumPy timedelta64. .. versionadded:: 0.25.0 @@ -920,7 +912,7 @@ cdef class _Timedelta(timedelta): return self.value @property - def asm8(self): + def asm8(self) -> np.timedelta64: """ Return a numpy timedelta64 array scalar view. @@ -955,7 +947,7 @@ cdef class _Timedelta(timedelta): return np.int64(self.value).view('m8[ns]') @property - def resolution_string(self): + def resolution_string(self) -> str: """ Return a string representing the lowest timedelta resolution. @@ -1012,56 +1004,6 @@ cdef class _Timedelta(timedelta): else: return "D" - @property - def resolution(self): - """ - Return a string representing the lowest timedelta resolution. - - Each timedelta has a defined resolution that represents the lowest OR - most granular level of precision. Each level of resolution is - represented by a short string as defined below: - - Resolution: Return value - - * Days: 'D' - * Hours: 'H' - * Minutes: 'T' - * Seconds: 'S' - * Milliseconds: 'L' - * Microseconds: 'U' - * Nanoseconds: 'N' - - Returns - ------- - str - Timedelta resolution. 
- - Examples - -------- - >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns') - >>> td.resolution - 'N' - - >>> td = pd.Timedelta('1 days 2 min 3 us') - >>> td.resolution - 'U' - - >>> td = pd.Timedelta('2 min 3 s') - >>> td.resolution - 'S' - - >>> td = pd.Timedelta(36, unit='us') - >>> td.resolution - 'U' - """ - # See GH#21344 - warnings.warn("Timedelta.resolution is deprecated, in a future " - "version will behave like the standard library " - "datetime.timedelta.resolution attribute. " - "Use Timedelta.resolution_string instead.", - FutureWarning) - return self.resolution_string - @property def nanoseconds(self): """ @@ -1095,7 +1037,7 @@ cdef class _Timedelta(timedelta): self._ensure_components() return self._ns - def _repr_base(self, format=None): + def _repr_base(self, format=None) -> str: """ Parameters @@ -1142,16 +1084,17 @@ cdef class _Timedelta(timedelta): return fmt.format(**comp_dict) - def __repr__(self): - return "Timedelta('{val}')".format(val=self._repr_base(format='long')) + def __repr__(self) -> str: + repr_based = self._repr_base(format='long') + return f"Timedelta('{repr_based}')" - def __str__(self): + def __str__(self) -> str: return self._repr_base(format='long') - def __bool__(self): + def __bool__(self) -> bool: return self.value != 0 - def isoformat(self): + def isoformat(self) -> str: """ Format Timedelta as ISO 8601 Duration like ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the @@ -1189,14 +1132,14 @@ cdef class _Timedelta(timedelta): 'P500DT12H0MS' """ components = self.components - seconds = '{}.{:0>3}{:0>3}{:0>3}'.format(components.seconds, - components.milliseconds, - components.microseconds, - components.nanoseconds) + seconds = (f'{components.seconds}.' + f'{components.milliseconds:0>3}' + f'{components.microseconds:0>3}' + f'{components.nanoseconds:0>3}') # Trim unnecessary 0s, 1.000000000 -> 1 seconds = seconds.rstrip('0').rstrip('.') - tpl = ('P{td.days}DT{td.hours}H{td.minutes}M{seconds}S' - .format(td=components, seconds=seconds)) + tpl = (f'P{components.days}DT{components.hours}' + f'H{components.minutes}M{seconds}S') return tpl @@ -1260,9 +1203,10 @@ class Timedelta(_Timedelta): "milliseconds, microseconds, nanoseconds]") if unit in {'Y', 'y', 'M'}: - warnings.warn("M and Y units are deprecated and " - "will be removed in a future version.", - FutureWarning, stacklevel=1) + raise ValueError( + "Units 'M' and 'Y' are no longer supported, as they do not " + "represent unambiguous timedelta values durations." 
+ ) if isinstance(value, Timedelta): value = value.value @@ -1276,7 +1220,7 @@ class Timedelta(_Timedelta): value = convert_to_timedelta64(value, 'ns') elif is_timedelta64_object(value): if unit is not None: - value = value.astype('timedelta64[{0}]'.format(unit)) + value = value.astype(f'timedelta64[{unit}]') value = value.astype('timedelta64[ns]') elif hasattr(value, 'delta'): value = np.timedelta64(delta_to_nanoseconds(value.delta), 'ns') @@ -1288,9 +1232,8 @@ class Timedelta(_Timedelta): return NaT else: raise ValueError( - "Value must be Timedelta, string, integer, " - "float, timedelta or convertible, not {typ}" - .format(typ=type(value).__name__)) + f"Value must be Timedelta, string, integer, " + f"float, timedelta or convertible, not {type(value).__name__}") if is_timedelta64_object(value): value = value.view('i8') @@ -1485,9 +1428,7 @@ class Timedelta(_Timedelta): else: return self.to_timedelta64() // other - raise TypeError('Invalid dtype {dtype} for ' - '{op}'.format(dtype=other.dtype, - op='__floordiv__')) + raise TypeError(f'Invalid dtype {other.dtype} for __floordiv__') elif is_integer_object(other) or is_float_object(other): return Timedelta(self.value // other, unit='ns') @@ -1518,21 +1459,9 @@ class Timedelta(_Timedelta): if other.dtype.kind == 'm': # also timedelta-like return _broadcast_floordiv_td64(self.value, other, _rfloordiv) - elif other.dtype.kind == 'i': - # Backwards compatibility - # GH-19761 - msg = textwrap.dedent("""\ - Floor division between integer array and Timedelta is - deprecated. Use 'array // timedelta.value' instead. - If you want to obtain epochs from an array of timestamps, - you can rather use - '(array - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")'. - """) - warnings.warn(msg, FutureWarning) - return other // self.value - raise TypeError('Invalid dtype {dtype} for ' - '{op}'.format(dtype=other.dtype, - op='__floordiv__')) + + # Includes integer array // Timedelta, disallowed in GH#19761 + raise TypeError(f'Invalid dtype {other.dtype} for __floordiv__') elif is_float_object(other) and util.is_nan(other): # i.e. np.nan @@ -1555,8 +1484,7 @@ class Timedelta(_Timedelta): if hasattr(other, 'dtype') and other.dtype.kind == 'i': # TODO: Remove this check with backwards-compat shim # for integer / Timedelta is removed. - raise TypeError("Invalid type {dtype} for " - "{op}".format(dtype=other.dtype, op='__mod__')) + raise TypeError(f'Invalid dtype {other.dtype} for __mod__') return self.__rdivmod__(other)[1] def __divmod__(self, other): @@ -1569,8 +1497,7 @@ class Timedelta(_Timedelta): if hasattr(other, 'dtype') and other.dtype.kind == 'i': # TODO: Remove this check with backwards-compat shim # for integer / Timedelta is removed. 
- raise TypeError("Invalid type {dtype} for " - "{op}".format(dtype=other.dtype, op='__mod__')) + raise TypeError(f'Invalid dtype {other.dtype} for __mod__') div = other // self return div, other - div * self @@ -1625,3 +1552,4 @@ cdef _broadcast_floordiv_td64(int64_t value, object other, # resolution in ns Timedelta.min = Timedelta(np.iinfo(np.int64).min + 1) Timedelta.max = Timedelta(np.iinfo(np.int64).max) +Timedelta.resolution = Timedelta(nanoseconds=1) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 50a71d062c63f..abe7f9e5b4105 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -36,7 +36,6 @@ from pandas._libs.tslibs.tzconversion import ( # Constants _zero_time = datetime_time(0, 0) _no_input = object() -PY36 = sys.version_info >= (3, 6) # ---------------------------------------------------------------------- @@ -90,23 +89,23 @@ class RoundTo: https://en.wikipedia.org/wiki/Rounding#Round_half_to_even """ @property - def MINUS_INFTY(self): + def MINUS_INFTY(self) -> int: return 0 @property - def PLUS_INFTY(self): + def PLUS_INFTY(self) -> int: return 1 @property - def NEAREST_HALF_EVEN(self): + def NEAREST_HALF_EVEN(self) -> int: return 2 @property - def NEAREST_HALF_PLUS_INFTY(self): + def NEAREST_HALF_PLUS_INFTY(self) -> int: return 3 @property - def NEAREST_HALF_MINUS_INFTY(self): + def NEAREST_HALF_MINUS_INFTY(self) -> int: return 4 @@ -242,8 +241,8 @@ class Timestamp(_Timestamp): """ Timestamp.fromordinal(ordinal, freq=None, tz=None) - passed an ordinal, translate and convert to a ts - note: by definition there cannot be any tz info on the ordinal itself + Passed an ordinal, translate and convert to a ts. + Note: by definition there cannot be any tz info on the ordinal itself. Parameters ---------- @@ -325,7 +324,7 @@ class Timestamp(_Timestamp): Function is not implemented. Use pd.to_datetime(). """ - raise NotImplementedError("Timestamp.strptime() is not implmented." + raise NotImplementedError("Timestamp.strptime() is not implemented." "Use to_datetime() to parse date strings.") @classmethod @@ -333,15 +332,26 @@ class Timestamp(_Timestamp): """ Timestamp.combine(date, time) - date, time -> datetime with same date and time fields + date, time -> datetime with same date and time fields. """ return cls(datetime.combine(date, time)) - def __new__(cls, object ts_input=_no_input, - object freq=None, tz=None, unit=None, - year=None, month=None, day=None, - hour=None, minute=None, second=None, microsecond=None, - nanosecond=None, tzinfo=None): + def __new__( + cls, + object ts_input=_no_input, + object freq=None, + tz=None, + unit=None, + year=None, + month=None, + day=None, + hour=None, + minute=None, + second=None, + microsecond=None, + nanosecond=None, + tzinfo=None + ): # The parameter list folds together legacy parameter names (the first # four) and positional and keyword parameter names from pydatetime. 
# @@ -371,8 +381,8 @@ class Timestamp(_Timestamp): if tzinfo is not None: if not PyTZInfo_Check(tzinfo): # tzinfo must be a datetime.tzinfo object, GH#17690 - raise TypeError('tzinfo must be a datetime.tzinfo object, ' - 'not %s' % type(tzinfo)) + raise TypeError(f'tzinfo must be a datetime.tzinfo object, ' + f'not {type(tzinfo)}') elif tz is not None: raise ValueError('Can provide at most one of tz, tzinfo') @@ -402,9 +412,8 @@ class Timestamp(_Timestamp): freq = None if getattr(ts_input, 'tzinfo', None) is not None and tz is not None: - warnings.warn("Passing a datetime or Timestamp with tzinfo and the" - " tz parameter will raise in the future. Use" - " tz_convert instead.", FutureWarning) + raise ValueError("Cannot pass a datetime or Timestamp with tzinfo with " + "the tz parameter. Use tz_convert instead.") ts = convert_to_tsobject(ts_input, tz, unit, 0, 0, nanosecond or 0) @@ -601,11 +610,11 @@ timedelta}, default 'raise' @property def dayofweek(self): """ - Return day of whe week. + Return day of the week. """ return self.weekday() - def day_name(self, locale=None): + def day_name(self, locale=None) -> str: """ Return the day name of the Timestamp with specified locale. @@ -622,7 +631,7 @@ timedelta}, default 'raise' """ return self._get_date_name_field('day_name', locale) - def month_name(self, locale=None): + def month_name(self, locale=None) -> str: """ Return the month name of the Timestamp with specified locale. @@ -639,17 +648,6 @@ timedelta}, default 'raise' """ return self._get_date_name_field('month_name', locale) - @property - def weekday_name(self): - """ - .. deprecated:: 0.23.0 - Use ``Timestamp.day_name()`` instead - """ - warnings.warn("`weekday_name` is deprecated and will be removed in a " - "future version. Use `day_name` instead", - FutureWarning) - return self.day_name() - @property def dayofyear(self): """ @@ -658,7 +656,7 @@ timedelta}, default 'raise' return ccalendar.get_day_of_year(self.year, self.month, self.day) @property - def week(self): + def week(self) -> int: """ Return the week number of the year. """ @@ -667,7 +665,7 @@ timedelta}, default 'raise' weekofyear = week @property - def quarter(self): + def quarter(self) -> int: """ Return the quarter of the year. """ @@ -690,7 +688,7 @@ timedelta}, default 'raise' return getattr(self.freq, 'freqstr', self.freq) @property - def is_month_start(self): + def is_month_start(self) -> bool: """ Return True if date is first day of month. """ @@ -700,7 +698,7 @@ timedelta}, default 'raise' return self._get_start_end_field('is_month_start') @property - def is_month_end(self): + def is_month_end(self) -> bool: """ Return True if date is last day of month. """ @@ -710,7 +708,7 @@ timedelta}, default 'raise' return self._get_start_end_field('is_month_end') @property - def is_quarter_start(self): + def is_quarter_start(self) -> bool: """ Return True if date is first day of the quarter. """ @@ -720,7 +718,7 @@ timedelta}, default 'raise' return self._get_start_end_field('is_quarter_start') @property - def is_quarter_end(self): + def is_quarter_end(self) -> bool: """ Return True if date is last day of the quarter. """ @@ -730,7 +728,7 @@ timedelta}, default 'raise' return self._get_start_end_field('is_quarter_end') @property - def is_year_start(self): + def is_year_start(self) -> bool: """ Return True if date is first day of the year. 
""" @@ -740,7 +738,7 @@ timedelta}, default 'raise' return self._get_start_end_field('is_year_start') @property - def is_year_end(self): + def is_year_end(self) -> bool: """ Return True if date is last day of the year. """ @@ -750,23 +748,13 @@ timedelta}, default 'raise' return self._get_start_end_field('is_year_end') @property - def is_leap_year(self): + def is_leap_year(self) -> bool: """ Return True if year is a leap year. """ return bool(ccalendar.is_leapyear(self.year)) - @property - def resolution(self): - """ - Return resolution describing the smallest difference between two - times that can be represented by Timestamp object_state. - """ - # GH#21336, GH#21365 - return Timedelta(nanoseconds=1) - - def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', - errors=None): + def tz_localize(self, tz, ambiguous='raise', nonexistent='raise'): """ Convert naive Timestamp to local time zone, or remove timezone from tz-aware Timestamp. @@ -809,18 +797,6 @@ default 'raise' nonexistent times. .. versionadded:: 0.24.0 - errors : 'raise', 'coerce', default None - Determine how errors should be handled. - - The behavior is as follows: - - * 'raise' will raise a NonExistentTimeError if a timestamp is not - valid in the specified timezone (e.g. due to a transition from - or to DST time). Use ``nonexistent='raise'`` instead. - * 'coerce' will return NaT if the timestamp can not be converted - into the specified timezone. Use ``nonexistent='NaT'`` instead. - - .. deprecated:: 0.24.0 Returns ------- @@ -834,26 +810,13 @@ default 'raise' if ambiguous == 'infer': raise ValueError('Cannot infer offset with only one time.') - if errors is not None: - warnings.warn("The errors argument is deprecated and will be " - "removed in a future release. Use " - "nonexistent='NaT' or nonexistent='raise' " - "instead.", FutureWarning) - if errors == 'coerce': - nonexistent = 'NaT' - elif errors == 'raise': - nonexistent = 'raise' - else: - raise ValueError("The errors argument must be either 'coerce' " - "or 'raise'.") - nonexistent_options = ('raise', 'NaT', 'shift_forward', 'shift_backward') if nonexistent not in nonexistent_options and not isinstance( nonexistent, timedelta): - raise ValueError("The nonexistent argument must be one of 'raise'," - " 'NaT', 'shift_forward', 'shift_backward' or" - " a timedelta object") + raise ValueError("The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or " + "a timedelta object") if self.tzinfo is None: # tz naive, localize @@ -947,8 +910,8 @@ default 'raise' def validate(k, v): """ validate integers """ if not is_integer_object(v): - raise ValueError("value must be an integer, received " - "{v} for {k}".format(v=type(v), k=k)) + raise ValueError(f"value must be an integer, received " + f"{type(v)} for {k}") return v if year is not None: @@ -982,9 +945,8 @@ default 'raise' else: kwargs = {'year': dts.year, 'month': dts.month, 'day': dts.day, 'hour': dts.hour, 'minute': dts.min, 'second': dts.sec, - 'microsecond': dts.us, 'tzinfo': _tzinfo} - if PY36: - kwargs['fold'] = fold + 'microsecond': dts.us, 'tzinfo': _tzinfo, + 'fold': fold} ts_input = datetime(**kwargs) ts = convert_datetime_to_tsobject(ts_input, _tzinfo) @@ -1005,13 +967,13 @@ default 'raise' base1, base2 = base, "" if self.microsecond != 0: - base1 += "%.3d" % self.nanosecond + base1 += f"{self.nanosecond:03d}" else: - base1 += ".%.9d" % self.nanosecond + base1 += f".{self.nanosecond:09d}" return base1 + base2 - def _has_time_component(self): + def 
_has_time_component(self) -> bool: """ Returns if the Timestamp has a time component in addition to the date part @@ -1075,3 +1037,4 @@ cdef int64_t _NS_LOWER_BOUND = -9223372036854775000 # Resolution is in nanoseconds Timestamp.min = Timestamp(_NS_LOWER_BOUND) Timestamp.max = Timestamp(_NS_UPPER_BOUND) +Timestamp.resolution = Timedelta(nanoseconds=1) # GH#21336, GH#21365 diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index bc1fdfae99de9..35ee87e714fa8 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -280,8 +280,8 @@ def infer_tzinfo(start, end): if start is not None and end is not None: tz = start.tzinfo if not tz_compare(tz, end.tzinfo): - msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' - raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) + raise AssertionError(f'Inputs must both have the same timezone, ' + f'{tz} != {end.tzinfo}') elif start is not None: tz = start.tzinfo elif end is not None: diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index dd0c6fc75b06f..b368f0fde3edc 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -175,8 +175,8 @@ timedelta-like} if trans_idx.size == 1: stamp = _render_tstamp(vals[trans_idx]) raise pytz.AmbiguousTimeError( - "Cannot infer dst time from %s as there " - "are no repeated times".format(stamp)) + f"Cannot infer dst time from {stamp} as there " + f"are no repeated times") # Split the array into contiguous chunks (where the difference between # indices is 1). These are effectively dst transitions in different # years which is useful for checking that there is not an ambiguous @@ -200,8 +200,8 @@ timedelta-like} switch_idx = (delta <= 0).nonzero()[0] if switch_idx.size > 1: raise pytz.AmbiguousTimeError( - "There are %i dst switches when " - "there should only be 1.".format(switch_idx.size)) + f"There are {switch_idx.size} dst switches when " + f"there should only be 1.") switch_idx = switch_idx[0] + 1 # Pull the only index and adjust a_idx = grp[:switch_idx] @@ -230,8 +230,8 @@ timedelta-like} else: stamp = _render_tstamp(val) raise pytz.AmbiguousTimeError( - "Cannot infer dst time from %r, try using the " - "'ambiguous' argument".format(stamp)) + f"Cannot infer dst time from {stamp}, try using the " + f"'ambiguous' argument") elif left != NPY_NAT: result[i] = left elif right != NPY_NAT: @@ -246,8 +246,8 @@ timedelta-like} # time if -1 < shift_delta + remaining_mins < HOURS_NS: raise ValueError( - "The provided timedelta will relocalize on a " - "nonexistent time: {}".format(nonexistent) + f"The provided timedelta will relocalize on a " + f"nonexistent time: {nonexistent}" ) new_local = val + shift_delta elif shift_forward: diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 63cbd36f9cd1d..936532a81c6d6 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -218,7 +218,7 @@ cdef inline bint is_nan(object val): return is_complex_object(val) and val != val -cdef inline const char* get_c_string_buf_and_size(object py_string, +cdef inline const char* get_c_string_buf_and_size(str py_string, Py_ssize_t *length): """ Extract internal char* buffer of unicode or bytes object `py_string` with @@ -231,7 +231,7 @@ cdef inline const char* get_c_string_buf_and_size(object py_string, Parameters ---------- - py_string : object + py_string : str length : Py_ssize_t* Returns @@ -241,12 +241,9 @@ cdef inline const char* 
get_c_string_buf_and_size(object py_string, cdef: const char *buf - if PyUnicode_Check(py_string): - buf = PyUnicode_AsUTF8AndSize(py_string, length) - else: - PyBytes_AsStringAndSize(py_string, &buf, length) + buf = PyUnicode_AsUTF8AndSize(py_string, length) return buf -cdef inline const char* get_c_string(object py_string): +cdef inline const char* get_c_string(str py_string): return get_c_string_buf_and_size(py_string, NULL) diff --git a/pandas/tests/io/msgpack/__init__.py b/pandas/_libs/window/__init__.py similarity index 100% rename from pandas/tests/io/msgpack/__init__.py rename to pandas/_libs/window/__init__.py diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window/aggregations.pyx similarity index 53% rename from pandas/_libs/window.pyx rename to pandas/_libs/window/aggregations.pyx index a2096d389823f..0348843abc129 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -18,12 +18,31 @@ cdef extern from "src/headers/cmath" namespace "std": int signbit(float64_t) nogil float64_t sqrt(float64_t x) nogil -cimport pandas._libs.util as util +from pandas._libs.algos import is_monotonic + from pandas._libs.util cimport numeric -from pandas._libs.skiplist cimport ( - skiplist_t, skiplist_init, skiplist_destroy, skiplist_get, skiplist_insert, - skiplist_remove) +cdef extern from "../src/skiplist.h": + ctypedef struct node_t: + node_t **next + int *width + double value + int is_nil + int levels + int ref_count + + ctypedef struct skiplist_t: + node_t *head + node_t **tmp_chain + int *tmp_steps + int size + int maxlevels + + skiplist_t* skiplist_init(int) nogil + void skiplist_destroy(skiplist_t*) nogil + double skiplist_get(skiplist_t*, int, int*) nogil + int skiplist_insert(skiplist_t*, double) nogil + int skiplist_remove(skiplist_t*, double) nogil cdef: float32_t MINfloat32 = np.NINF @@ -37,6 +56,9 @@ cdef: cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b +cdef inline bint is_monotonic_start_end_bounds(ndarray[int64_t, ndim=1] start, + ndarray[int64_t, ndim=1] end): + return is_monotonic(start, False)[0] and is_monotonic(end, False)[0] # Cython implementations of rolling sum, mean, variance, skewness, # other statistical moment functions @@ -48,39 +70,6 @@ cdef inline int int_min(int a, int b): return a if a <= b else b # periodically revisited to see if it's still true. # - -def _check_minp(win, minp, N, floor=None): - """ - Parameters - ---------- - win: int - minp: int or None - N: len of window - floor: int, optional - default 1 - - Returns - ------- - minimum period - """ - - if minp is None: - minp = 1 - if not util.is_integer_object(minp): - raise ValueError("min_periods must be an integer") - if minp > win: - raise ValueError("min_periods (%d) must be <= " - "window (%d)" % (minp, win)) - elif minp > N: - minp = N + 1 - elif minp < 0: - raise ValueError('min_periods must be >= 0') - if floor is None: - floor = 1 - - return max(minp, floor) - - # original C implementation by N. Devillard. # This code in public domain. # Function : kth_smallest() @@ -96,280 +85,19 @@ def _check_minp(win, minp, N, floor=None): # Physical description: 366 p. 
# Series: Prentice-Hall Series in Automatic Computation -# ---------------------------------------------------------------------- -# The indexer objects for rolling -# These define start/end indexers to compute offsets - - -cdef class WindowIndexer: - - cdef: - ndarray start, end - int64_t N, minp, win - bint is_variable - - def get_data(self): - return (self.start, self.end, self.N, - self.win, self.minp, - self.is_variable) - - -cdef class MockFixedWindowIndexer(WindowIndexer): - """ - - We are just checking parameters of the indexer, - and returning a consistent API with fixed/variable - indexers. - - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - minp: int64_t - min number of obs in a window to consider non-NaN - index: object - index of the values - floor: optional - unit for flooring - left_closed: bint - left endpoint closedness - right_closed: bint - right endpoint closedness - - """ - def __init__(self, ndarray values, int64_t win, int64_t minp, - bint left_closed, bint right_closed, - object index=None, object floor=None): - - assert index is None - self.is_variable = 0 - self.N = len(values) - self.minp = _check_minp(win, minp, self.N, floor=floor) - self.start = np.empty(0, dtype='int64') - self.end = np.empty(0, dtype='int64') - self.win = win - - -cdef class FixedWindowIndexer(WindowIndexer): - """ - create a fixed length window indexer object - that has start & end, that point to offsets in - the index object; these are defined based on the win - arguments - - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - minp: int64_t - min number of obs in a window to consider non-NaN - index: object - index of the values - floor: optional - unit for flooring the unit - left_closed: bint - left endpoint closedness - right_closed: bint - right endpoint closedness - - """ - def __init__(self, ndarray values, int64_t win, int64_t minp, - bint left_closed, bint right_closed, - object index=None, object floor=None): - cdef ndarray start_s, start_e, end_s, end_e - - assert index is None - self.is_variable = 0 - self.N = len(values) - self.minp = _check_minp(win, minp, self.N, floor=floor) - - start_s = np.zeros(win, dtype='int64') - start_e = np.arange(win, self.N, dtype='int64') - win + 1 - self.start = np.concatenate([start_s, start_e]) - - end_s = np.arange(win, dtype='int64') + 1 - end_e = start_e + win - self.end = np.concatenate([end_s, end_e]) - self.win = win - - -cdef class VariableWindowIndexer(WindowIndexer): - """ - create a variable length window indexer object - that has start & end, that point to offsets in - the index object; these are defined based on the win - arguments - - Parameters - ---------- - values: ndarray - values data array - win: int64_t - window size - minp: int64_t - min number of obs in a window to consider non-NaN - index: ndarray - index of the values - left_closed: bint - left endpoint closedness - True if the left endpoint is closed, False if open - right_closed: bint - right endpoint closedness - True if the right endpoint is closed, False if open - floor: optional - unit for flooring the unit - """ - def __init__(self, ndarray values, int64_t win, int64_t minp, - bint left_closed, bint right_closed, ndarray index, - object floor=None): - - self.is_variable = 1 - self.N = len(index) - self.minp = _check_minp(win, minp, self.N, floor=floor) - - self.start = np.empty(self.N, dtype='int64') - self.start.fill(-1) - - self.end = np.empty(self.N, dtype='int64') - 
self.end.fill(-1) - - self.build(index, win, left_closed, right_closed) - - # max window size - self.win = (self.end - self.start).max() - - def build(self, const int64_t[:] index, int64_t win, bint left_closed, - bint right_closed): - - cdef: - ndarray[int64_t] start, end - int64_t start_bound, end_bound, N - Py_ssize_t i, j - - start = self.start - end = self.end - N = self.N - - start[0] = 0 - - # right endpoint is closed - if right_closed: - end[0] = 1 - # right endpoint is open - else: - end[0] = 0 - - with nogil: - - # start is start of slice interval (including) - # end is end of slice interval (not including) - for i in range(1, N): - end_bound = index[i] - start_bound = index[i] - win - - # left endpoint is closed - if left_closed: - start_bound -= 1 - - # advance the start bound until we are - # within the constraint - start[i] = i - for j in range(start[i - 1], i): - if index[j] > start_bound: - start[i] = j - break - - # end bound is previous end - # or current index - if index[end[i - 1]] <= end_bound: - end[i] = i + 1 - else: - end[i] = end[i - 1] - - # right endpoint is open - if not right_closed: - end[i] -= 1 - - -def get_window_indexer(values, win, minp, index, closed, - floor=None, use_mock=True): - """ - return the correct window indexer for the computation - - Parameters - ---------- - values: 1d ndarray - win: integer, window size - minp: integer, minimum periods - index: 1d ndarray, optional - index to the values array - closed: string, default None - {'right', 'left', 'both', 'neither'} - window endpoint closedness. Defaults to 'right' in - VariableWindowIndexer and to 'both' in FixedWindowIndexer - floor: optional - unit for flooring the unit - use_mock: boolean, default True - if we are a fixed indexer, return a mock indexer - instead of the FixedWindow Indexer. This is a type - compat Indexer that allows us to use a standard - code path with all of the indexers. 
- - - Returns - ------- - tuple of 1d int64 ndarrays of the offsets & data about the window - - """ - - cdef: - bint left_closed = False - bint right_closed = False - - assert closed is None or closed in ['right', 'left', 'both', 'neither'] - - # if windows is variable, default is 'right', otherwise default is 'both' - if closed is None: - closed = 'right' if index is not None else 'both' - - if closed in ['right', 'both']: - right_closed = True - - if closed in ['left', 'both']: - left_closed = True - - if index is not None: - indexer = VariableWindowIndexer(values, win, minp, left_closed, - right_closed, index, floor) - elif use_mock: - indexer = MockFixedWindowIndexer(values, win, minp, left_closed, - right_closed, index, floor) - else: - indexer = FixedWindowIndexer(values, win, minp, left_closed, - right_closed, index, floor) - return indexer.get_data() - - # ---------------------------------------------------------------------- # Rolling count # this is only an impl for index not None, IOW, freq aware -def roll_count(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_count(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp): cdef: float64_t val, count_x = 0.0 - int64_t s, e, nobs, N + int64_t s, e, nobs, N = len(values) Py_ssize_t i, j - ndarray[int64_t] start, end ndarray[float64_t] output - start, end, N, win, minp, _ = get_window_indexer(values, win, - minp, index, closed) output = np.empty(N, dtype=float) with nogil: @@ -413,8 +141,7 @@ def roll_count(ndarray[float64_t] values, int64_t win, int64_t minp, # Rolling sum -cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, - float64_t sum_x) nogil: +cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogil: cdef: float64_t result @@ -435,8 +162,7 @@ cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: sum_x[0] = sum_x[0] + val -cdef inline void remove_sum(float64_t val, - int64_t *nobs, float64_t *sum_x) nogil: +cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: """ remove a value from the sum calc """ if notnan(val): @@ -444,80 +170,80 @@ cdef inline void remove_sum(float64_t val, sum_x[0] = sum_x[0] - val -def roll_sum(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev_x, sum_x = 0 - int64_t s, e, range_endpoint - int64_t nobs = 0, i, j, N - bint is_variable - ndarray[int64_t] start, end + float64_t sum_x = 0 + int64_t s, e + int64_t nobs = 0, i, j, N = len(values) ndarray[float64_t] output + bint is_monotonic_bounds - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed, - floor=0) + is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) output = np.empty(N, dtype=float) - # for performance we are going to iterate - # fixed windows separately, makes the code more complex as we have 2 paths - # but is faster + with nogil: - if is_variable: + for i in range(0, N): + s = start[i] + e = end[i] - # variable window - with nogil: + if i == 0 or not is_monotonic_bounds: - for i in range(0, N): - s = start[i] - e = end[i] + # setup - if i == 0: + for j in range(s, e): + add_sum(values[j], &nobs, &sum_x) - # setup - sum_x = 0.0 - nobs = 0 - for j in range(s, e): - add_sum(values[j], &nobs, &sum_x) + else: - else: + # calculate deletes + for j in 
range(start[i - 1], s): + remove_sum(values[j], &nobs, &sum_x) - # calculate deletes - for j in range(start[i - 1], s): - remove_sum(values[j], &nobs, &sum_x) + # calculate adds + for j in range(end[i - 1], e): + add_sum(values[j], &nobs, &sum_x) - # calculate adds - for j in range(end[i - 1], e): - add_sum(values[j], &nobs, &sum_x) + output[i] = calc_sum(minp, nobs, sum_x) - output[i] = calc_sum(minp, nobs, sum_x) + if not is_monotonic_bounds: + for j in range(s, e): + remove_sum(values[j], &nobs, &sum_x) - else: + return output - # fixed window - range_endpoint = int_max(minp, 1) - 1 +def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): + cdef: + float64_t val, prev_x, sum_x = 0 + int64_t range_endpoint + int64_t nobs = 0, i, N = len(values) + ndarray[float64_t] output - with nogil: + output = np.empty(N, dtype=float) - for i in range(0, range_endpoint): - add_sum(values[i], &nobs, &sum_x) - output[i] = NaN + range_endpoint = int_max(minp, 1) - 1 + + with nogil: + + for i in range(0, range_endpoint): + add_sum(values[i], &nobs, &sum_x) + output[i] = NaN - for i in range(range_endpoint, N): - val = values[i] - add_sum(val, &nobs, &sum_x) + for i in range(range_endpoint, N): + val = values[i] + add_sum(val, &nobs, &sum_x) - if i > win - 1: - prev_x = values[i - win] - remove_sum(prev_x, &nobs, &sum_x) + if i > win - 1: + prev_x = values[i - win] + remove_sum(prev_x, &nobs, &sum_x) - output[i] = calc_sum(minp, nobs, sum_x) + output[i] = calc_sum(minp, nobs, sum_x) return output - # ---------------------------------------------------------------------- # Rolling mean @@ -565,76 +291,78 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, neg_ct[0] = neg_ct[0] - 1 -def roll_mean(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: - float64_t val, prev_x, result, sum_x = 0 - int64_t s, e - bint is_variable - Py_ssize_t nobs = 0, i, j, neg_ct = 0, N - ndarray[int64_t] start, end + float64_t val, prev_x, sum_x = 0 + Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values) ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) - # for performance we are going to iterate - # fixed windows separately, makes the code more complex as we have 2 paths - # but is faster + with nogil: + for i in range(minp - 1): + val = values[i] + add_mean(val, &nobs, &sum_x, &neg_ct) + output[i] = NaN - if is_variable: + for i in range(minp - 1, N): + val = values[i] + add_mean(val, &nobs, &sum_x, &neg_ct) - with nogil: + if i > win - 1: + prev_x = values[i - win] + remove_mean(prev_x, &nobs, &sum_x, &neg_ct) - for i in range(0, N): - s = start[i] - e = end[i] + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) - if i == 0: + return output - # setup - sum_x = 0.0 - nobs = 0 - for j in range(s, e): - val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) - else: +def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + cdef: + float64_t val, sum_x = 0 + int64_t s, e + Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values) + ndarray[float64_t] output + bint is_monotonic_bounds - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct) + 
is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + output = np.empty(N, dtype=float) - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) + with nogil: - output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + for i in range(0, N): + s = start[i] + e = end[i] - else: + if i == 0 or not is_monotonic_bounds: - with nogil: - for i in range(minp - 1): - val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) - output[i] = NaN + # setup + for j in range(s, e): + val = values[j] + add_mean(val, &nobs, &sum_x, &neg_ct) - for i in range(minp - 1, N): - val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) + else: - if i > win - 1: - prev_x = values[i - win] - remove_mean(prev_x, &nobs, &sum_x, &neg_ct) + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + remove_mean(val, &nobs, &sum_x, &neg_ct) - output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + add_mean(val, &nobs, &sum_x, &neg_ct) - return output + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + if not is_monotonic_bounds: + for j in range(s, e): + val = values[j] + remove_mean(val, &nobs, &sum_x, &neg_ct) + return output # ---------------------------------------------------------------------- # Rolling variance @@ -698,8 +426,8 @@ cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x, ssqdm_x[0] = 0 -def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed, int ddof=1): +def roll_var_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win, int ddof=1): """ Numerically stable implementation using Welford's method. """ @@ -707,98 +435,108 @@ def roll_var(ndarray[float64_t] values, int64_t win, int64_t minp, float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, float64_t val, prev, delta, mean_x_old int64_t s, e - bint is_variable - Py_ssize_t i, j, N - ndarray[int64_t] start, end + Py_ssize_t i, j, N = len(values) ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) # Check for windows larger than array, addresses #7297 win = min(win, N) - # for performance we are going to iterate - # fixed windows separately, makes the code more complex as we - # have 2 paths but is faster + with nogil: - if is_variable: + # Over the first window, observations can only be added, never + # removed + for i in range(win): + add_var(values[i], &nobs, &mean_x, &ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) - with nogil: + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - for i in range(0, N): + # After the first window, observations can both be added and + # removed + for i in range(win, N): + val = values[i] + prev = values[i - win] - s = start[i] - e = end[i] + if notnan(val): + if prev == prev: - # Over the first window, observations can only be added - # never removed - if i == 0: + # Adding one observation and removing another one + delta = val - prev + mean_x_old = mean_x - for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) + mean_x += delta / nobs + ssqdm_x += ((nobs - 1) * val + + (nobs + 1) * prev + - 2 * nobs * mean_x_old) * delta / nobs else: + add_var(val, &nobs, &mean_x, &ssqdm_x) + elif prev == prev: + remove_var(prev, &nobs, &mean_x, &ssqdm_x) - # After the first 
window, observations can both be added - # and removed + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) - # calculate adds - for j in range(end[i - 1], e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) + return output - # calculate deletes - for j in range(start[i - 1], s): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) +def roll_var_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int ddof=1): + """ + Numerically stable implementation using Welford's method. + """ + cdef: + float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, + float64_t val, prev, delta, mean_x_old + int64_t s, e + Py_ssize_t i, j, N = len(values) + ndarray[float64_t] output + bint is_monotonic_bounds - else: + is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + output = np.empty(N, dtype=float) - with nogil: + with nogil: - # Over the first window, observations can only be added, never - # removed - for i in range(win): - add_var(values[i], &nobs, &mean_x, &ssqdm_x) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + for i in range(0, N): - # a part of Welford's method for the online variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + s = start[i] + e = end[i] - # After the first window, observations can both be added and - # removed - for i in range(win, N): - val = values[i] - prev = values[i - win] + # Over the first window, observations can only be added + # never removed + if i == 0 or not is_monotonic_bounds: - if notnan(val): - if prev == prev: + for j in range(s, e): + add_var(values[j], &nobs, &mean_x, &ssqdm_x) - # Adding one observation and removing another one - delta = val - prev - mean_x_old = mean_x + else: - mean_x += delta / nobs - ssqdm_x += ((nobs - 1) * val - + (nobs + 1) * prev - - 2 * nobs * mean_x_old) * delta / nobs + # After the first window, observations can both be added + # and removed - else: - add_var(val, &nobs, &mean_x, &ssqdm_x) - elif prev == prev: - remove_var(prev, &nobs, &mean_x, &ssqdm_x) + # calculate adds + for j in range(end[i - 1], e): + add_var(values[j], &nobs, &mean_x, &ssqdm_x) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + # calculate deletes + for j in range(start[i - 1], s): + remove_var(values[j], &nobs, &mean_x, &ssqdm_x) - return output + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + if not is_monotonic_bounds: + for j in range(s, e): + remove_var(values[j], &nobs, &mean_x, &ssqdm_x) + + return output # ---------------------------------------------------------------------- # Rolling skewness + cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, float64_t x, float64_t xx, float64_t xxx) nogil: @@ -863,76 +601,87 @@ cdef inline void remove_skew(float64_t val, int64_t *nobs, xxx[0] = xxx[0] - val * val * val -def roll_skew(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_skew_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, prev float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N + int64_t nobs = 0, i, j, N = len(values) int64_t s, e - bint is_variable - ndarray[int64_t] start, end ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) - if is_variable: + with nogil: + for i in range(minp - 1): + val = values[i] + add_skew(val, &nobs, &x, &xx, &xxx) + output[i] = NaN - 
with nogil: + for i in range(minp - 1, N): + val = values[i] + add_skew(val, &nobs, &x, &xx, &xxx) - for i in range(0, N): + if i > win - 1: + prev = values[i - win] + remove_skew(prev, &nobs, &x, &xx, &xxx) - s = start[i] - e = end[i] + output[i] = calc_skew(minp, nobs, x, xx, xxx) - # Over the first window, observations can only be added - # never removed - if i == 0: + return output - for j in range(s, e): - val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) - else: +def roll_skew_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + cdef: + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0 + int64_t nobs = 0, i, j, N = len(values) + int64_t s, e + ndarray[float64_t] output + bint is_monotonic_bounds - # After the first window, observations can both be added - # and removed + is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + output = np.empty(N, dtype=float) - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) + with nogil: - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - remove_skew(val, &nobs, &x, &xx, &xxx) + for i in range(0, N): - output[i] = calc_skew(minp, nobs, x, xx, xxx) + s = start[i] + e = end[i] - else: + # Over the first window, observations can only be added + # never removed + if i == 0 or not is_monotonic_bounds: - with nogil: - for i in range(minp - 1): - val = values[i] - add_skew(val, &nobs, &x, &xx, &xxx) - output[i] = NaN + for j in range(s, e): + val = values[j] + add_skew(val, &nobs, &x, &xx, &xxx) - for i in range(minp - 1, N): - val = values[i] - add_skew(val, &nobs, &x, &xx, &xxx) + else: - if i > win - 1: - prev = values[i - win] - remove_skew(prev, &nobs, &x, &xx, &xxx) + # After the first window, observations can both be added + # and removed - output[i] = calc_skew(minp, nobs, x, xx, xxx) + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + add_skew(val, &nobs, &x, &xx, &xxx) - return output + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + remove_skew(val, &nobs, &x, &xx, &xxx) + + output[i] = calc_skew(minp, nobs, x, xx, xxx) + + if not is_monotonic_bounds: + for j in range(s, e): + val = values[j] + remove_skew(val, &nobs, &x, &xx, &xxx) + return output # ---------------------------------------------------------------------- # Rolling kurtosis @@ -1007,69 +756,79 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, xxxx[0] = xxxx[0] - val * val * val * val -def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_kurt_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, prev float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, N + int64_t nobs = 0, i, j, N = len(values) int64_t s, e - bint is_variable - ndarray[int64_t] start, end ndarray[float64_t] output - start, end, N, win, minp, is_variable = get_window_indexer(values, win, - minp, index, - closed) output = np.empty(N, dtype=float) - if is_variable: + with nogil: - with nogil: + for i in range(minp - 1): + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) + output[i] = NaN - for i in range(0, N): + for i in range(minp - 1, N): + add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) - s = start[i] - e = end[i] + if i > win - 1: + prev = values[i - win] + remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) - # Over the first window, observations can only be added 
- # never removed - if i == 0: + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) - for j in range(s, e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + return output - else: - # After the first window, observations can both be added - # and removed +def roll_kurt_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + cdef: + float64_t val, prev + float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 + int64_t nobs = 0, i, j, s, e, N = len(values) + ndarray[float64_t] output + bint is_monotonic_bounds - # calculate adds - for j in range(end[i - 1], e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + output = np.empty(N, dtype=float) - # calculate deletes - for j in range(start[i - 1], s): - remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + with nogil: - output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + for i in range(0, N): - else: + s = start[i] + e = end[i] - with nogil: + # Over the first window, observations can only be added + # never removed + if i == 0 or not is_monotonic_bounds: - for i in range(minp - 1): - add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) - output[i] = NaN + for j in range(s, e): + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) - for i in range(minp - 1, N): - add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) + else: - if i > win - 1: - prev = values[i - win] - remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) + # After the first window, observations can both be added + # and removed - output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + # calculate adds + for j in range(end[i - 1], e): + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + + if not is_monotonic_bounds: + for j in range(s, e): + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) return output @@ -1078,31 +837,26 @@ def roll_kurt(ndarray[float64_t] values, int64_t win, int64_t minp, # Rolling median, min, max -def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, - object index, object closed): +def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): cdef: float64_t val, res, prev - bint err = 0, is_variable + bint err = 0 int ret = 0 skiplist_t *sl Py_ssize_t i, j - int64_t nobs = 0, N, s, e + int64_t nobs = 0, N = len(values), s, e int midpoint - ndarray[int64_t] start, end ndarray[float64_t] output # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs - start, end, N, win, minp, is_variable = get_window_indexer( - values, win, - minp, index, closed, - use_mock=False) output = np.empty(N, dtype=float) - if win == 0: + if win == 0 or (end - start).max() == 0: output[:] = NaN return output - + win = (end - start).max() sl = skiplist_init(win) if sl == NULL: raise MemoryError("skiplist_init failed") @@ -1164,7 +918,7 @@ def roll_median_c(ndarray[float64_t] values, int64_t win, int64_t minp, # Moving maximum / minimum code taken from Bottleneck under the terms # of its Simplified BSD license -# https://github.com/kwgoodman/bottleneck +# https://github.com/pydata/bottleneck cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: @@ -1211,76 +965,89 @@ cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, return result -def roll_max(ndarray[numeric] values, 
int64_t win, int64_t minp, - object index, object closed): +def roll_max_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - values: numpy array - window: int, size of rolling window - minp: if number of observations in window + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window is below this, output a NaN - index: ndarray, optional + index : ndarray, optional index for window computation - closed: 'right', 'left', 'both', 'neither' + closed : 'right', 'left', 'both', 'neither' make the interval closed on the right, left, both or neither endpoints """ - return _roll_min_max(values, win, minp, index, closed=closed, is_max=1) + return _roll_min_max_fixed(values, start, end, minp, win, is_max=1) -def roll_min(ndarray[numeric] values, int64_t win, int64_t minp, - object index, object closed): +def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. Parameters ---------- - values: numpy array - window: int, size of rolling window - minp: if number of observations in window + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window is below this, output a NaN - index: ndarray, optional + index : ndarray, optional index for window computation + closed : 'right', 'left', 'both', 'neither' + make the interval closed on the right, left, + both or neither endpoints """ - return _roll_min_max(values, win, minp, index, is_max=0, closed=closed) + return _roll_min_max_variable(values, start, end, minp, is_max=1) -cdef _roll_min_max(ndarray[numeric] values, int64_t win, int64_t minp, - object index, object closed, bint is_max): +def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win): """ - Moving min/max of 1d array of any numeric type along axis=0 - ignoring NaNs. + Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. + + Parameters + ---------- + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window + is below this, output a NaN + index : ndarray, optional + index for window computation """ - cdef: - ndarray[int64_t] starti, endi - int64_t N - bint is_variable + return _roll_min_max_fixed(values, start, end, minp, win, is_max=0) - starti, endi, N, win, minp, is_variable = get_window_indexer( - values, win, - minp, index, closed) - if is_variable: - return _roll_min_max_variable(values, starti, endi, N, win, minp, - is_max) - else: - return _roll_min_max_fixed(values, N, win, minp, is_max) +def roll_min_variable(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): + """ + Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. 
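The `_roll_min_max_variable` kernel that these wrappers dispatch to keeps a monotonic deque whose front always holds the current extremum (the `deque Q[int64_t]` declared further down). A minimal pure-Python sketch of that idea, simplified to a fixed window size and without the `minp` handling, offered only as an illustration rather than the patch's actual code path:

from collections import deque

def rolling_max(values, window):
    # Monotonic deque of indices: values[q[0]] is always the current maximum.
    q = deque()
    out = []
    for i, v in enumerate(values):
        # Drop indices that have fallen out of the window.
        while q and q[0] <= i - window:
            q.popleft()
        # Drop smaller values from the back; they can never become the maximum.
        while q and values[q[-1]] <= v:
            q.pop()
        q.append(i)
        out.append(values[q[0]] if i >= window - 1 else float("nan"))
    return out

rolling_max([3.0, 1.0, 4.0, 1.0, 5.0], window=3)  # [nan, nan, 4.0, 4.0, 5.0]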
+ + Parameters + ---------- + values : np.ndarray[np.float64] + window : int, size of rolling window + minp : if number of observations in window + is below this, output a NaN + index : ndarray, optional + index for window computation + """ + return _roll_min_max_variable(values, start, end, minp, is_max=0) cdef _roll_min_max_variable(ndarray[numeric] values, ndarray[int64_t] starti, ndarray[int64_t] endi, - int64_t N, - int64_t win, int64_t minp, bint is_max): cdef: numeric ai int64_t i, close_offset, curr_win_size - Py_ssize_t nobs = 0 + Py_ssize_t nobs = 0, N = len(values) deque Q[int64_t] # min/max always the front deque W[int64_t] # track the whole window for nobs compute ndarray[float64_t, ndim=1] output @@ -1355,15 +1122,16 @@ cdef _roll_min_max_variable(ndarray[numeric] values, cdef _roll_min_max_fixed(ndarray[numeric] values, - int64_t N, - int64_t win, + ndarray[int64_t] starti, + ndarray[int64_t] endi, int64_t minp, + int64_t win, bint is_max): cdef: numeric ai bint should_replace int64_t i, removed, window_i, - Py_ssize_t nobs = 0 + Py_ssize_t nobs = 0, N = len(values) int64_t* death numeric* ring numeric* minvalue @@ -1459,8 +1227,8 @@ interpolation_types = { } -def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, - int64_t minp, object index, object closed, +def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int64_t win, float64_t quantile, str interpolation): """ O(N log(window)) implementation using skip list @@ -1468,36 +1236,29 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, cdef: float64_t val, prev, midpoint, idx_with_fraction skiplist_t *skiplist - int64_t nobs = 0, i, j, s, e, N + int64_t nobs = 0, i, j, s, e, N = len(values) Py_ssize_t idx - bint is_variable - ndarray[int64_t] start, end ndarray[float64_t] output float64_t vlow, vhigh InterpolationType interpolation_type int ret = 0 if quantile <= 0.0 or quantile >= 1.0: - raise ValueError("quantile value {0} not in [0, 1]".format(quantile)) + raise ValueError(f"quantile value {quantile} not in [0, 1]") try: interpolation_type = interpolation_types[interpolation] except KeyError: - raise ValueError("Interpolation '{interp}' is not supported" - .format(interp=interpolation)) + raise ValueError(f"Interpolation '{interpolation}' is not supported") # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs - start, end, N, win, minp, is_variable = get_window_indexer( - values, win, - minp, index, closed, - use_mock=False) output = np.empty(N, dtype=float) - if win == 0: + if win == 0 or (end - start).max() == 0: output[:] = NaN return output - + win = (end - start).max() skiplist = skiplist_init(win) if skiplist == NULL: raise MemoryError("skiplist_init failed") @@ -1578,18 +1339,17 @@ def roll_quantile(ndarray[float64_t, cast=True] values, int64_t win, return output -def roll_generic(object obj, - int64_t win, int64_t minp, object index, object closed, - int offset, object func, bint raw, - object args, object kwargs): +def roll_generic_fixed(object obj, + ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp, int64_t win, + int offset, object func, bint raw, + object args, object kwargs): cdef: ndarray[float64_t] output, counts, bufarr ndarray[float64_t, cast=True] arr float64_t *buf float64_t *oldbuf - int64_t nobs = 0, i, j, s, e, N - bint is_variable - ndarray[int64_t] start, end + int64_t nobs = 0, i, j, s, e, N = len(start) n = len(obj) if n == 0: @@ 
-1602,36 +1362,13 @@ def roll_generic(object obj, if not arr.flags.c_contiguous: arr = arr.copy('C') - counts = roll_sum(np.concatenate([np.isfinite(arr).astype(float), - np.array([0.] * offset)]), - win, minp, index, closed)[offset:] - - start, end, N, win, minp, is_variable = get_window_indexer(arr, win, - minp, index, - closed, - floor=0) + counts = roll_sum_fixed(np.concatenate([np.isfinite(arr).astype(float), + np.array([0.] * offset)]), + start, end, minp, win)[offset:] output = np.empty(N, dtype=float) - if is_variable: - # variable window arr or series - - if offset != 0: - raise ValueError("unable to roll_generic with a non-zero offset") - - for i in range(0, N): - s = start[i] - e = end[i] - - if counts[i] >= minp: - if raw: - output[i] = func(arr[s:e], *args, **kwargs) - else: - output[i] = func(obj.iloc[s:e], *args, **kwargs) - else: - output[i] = NaN - - elif not raw: + if not raw: # series for i in range(N): if counts[i] >= minp: @@ -1675,22 +1412,68 @@ def roll_generic(object obj, return output +def roll_generic_variable(object obj, + ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp, + int offset, object func, bint raw, + object args, object kwargs): + cdef: + ndarray[float64_t] output, counts, bufarr + ndarray[float64_t, cast=True] arr + float64_t *buf + float64_t *oldbuf + int64_t nobs = 0, i, j, s, e, N = len(start) + + n = len(obj) + if n == 0: + return obj + + arr = np.asarray(obj) + + # ndarray input + if raw: + if not arr.flags.c_contiguous: + arr = arr.copy('C') + + counts = roll_sum_variable(np.concatenate([np.isfinite(arr).astype(float), + np.array([0.] * offset)]), + start, end, minp)[offset:] + + output = np.empty(N, dtype=float) + + if offset != 0: + raise ValueError("unable to roll_generic with a non-zero offset") + + for i in range(0, N): + s = start[i] + e = end[i] + + if counts[i] >= minp: + if raw: + output[i] = func(arr[s:e], *args, **kwargs) + else: + output[i] = func(obj.iloc[s:e], *args, **kwargs) + else: + output[i] = NaN + + return output + + # ---------------------------------------------------------------------- # Rolling sum and mean for weighted window -def roll_weighted_sum(float64_t[:] values, float64_t[:] weights, - int minp): +def roll_weighted_sum(float64_t[:] values, float64_t[:] weights, int minp): return _roll_weighted_sum_mean(values, weights, minp, avg=0) -def roll_weighted_mean(float64_t[:] values, float64_t[:] weights, - int minp): +def roll_weighted_mean(float64_t[:] values, float64_t[:] weights, int minp): return _roll_weighted_sum_mean(values, weights, minp, avg=1) -def _roll_weighted_sum_mean(float64_t[:] values, float64_t[:] weights, - int minp, bint avg): +cdef ndarray[float64_t] _roll_weighted_sum_mean(float64_t[:] values, + float64_t[:] weights, + int minp, bint avg): """ Assume len(weights) << len(values) """ @@ -1702,54 +1485,283 @@ def _roll_weighted_sum_mean(float64_t[:] values, float64_t[:] weights, in_n = len(values) win_n = len(weights) - output = np.zeros(in_n, dtype=float) - counts = np.zeros(in_n, dtype=float) + output = np.zeros(in_n, dtype=np.float64) + counts = np.zeros(in_n, dtype=np.float64) if avg: - tot_wgt = np.zeros(in_n, dtype=float) + tot_wgt = np.zeros(in_n, dtype=np.float64) + + if minp > win_n: + raise ValueError(f"min_periods (minp) must be <= " + f"window (win)") + elif minp > in_n: + minp = in_n + 1 + elif minp < 0: + raise ValueError('min_periods must be >= 0') - minp = _check_minp(len(weights), minp, in_n) + minp = max(minp, 1) - if avg: - for win_i in range(win_n): - val_win = 
weights[win_i] - if val_win != val_win: - continue - - for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = values[in_i] - if val_in == val_in: - output[in_i + (win_n - win_i) - 1] += val_in * val_win - counts[in_i + (win_n - win_i) - 1] += 1 - tot_wgt[in_i + (win_n - win_i) - 1] += val_win - - for in_i in range(in_n): - c = counts[in_i] - if c < minp: - output[in_i] = NaN - else: - w = tot_wgt[in_i] - if w == 0: + with nogil: + if avg: + for win_i in range(win_n): + val_win = weights[win_i] + if val_win != val_win: + continue + + for in_i in range(in_n - (win_n - win_i) + 1): + val_in = values[in_i] + if val_in == val_in: + output[in_i + (win_n - win_i) - 1] += val_in * val_win + counts[in_i + (win_n - win_i) - 1] += 1 + tot_wgt[in_i + (win_n - win_i) - 1] += val_win + + for in_i in range(in_n): + c = counts[in_i] + if c < minp: output[in_i] = NaN else: - output[in_i] /= tot_wgt[in_i] + w = tot_wgt[in_i] + if w == 0: + output[in_i] = NaN + else: + output[in_i] /= tot_wgt[in_i] + + else: + for win_i in range(win_n): + val_win = weights[win_i] + if val_win != val_win: + continue + + for in_i in range(in_n - (win_n - win_i) + 1): + val_in = values[in_i] + + if val_in == val_in: + output[in_i + (win_n - win_i) - 1] += val_in * val_win + counts[in_i + (win_n - win_i) - 1] += 1 + + for in_i in range(in_n): + c = counts[in_i] + if c < minp: + output[in_i] = NaN + + return np.asarray(output) + +# ---------------------------------------------------------------------- +# Rolling var for weighted window + + +cdef inline float64_t calc_weighted_var(float64_t t, + float64_t sum_w, + Py_ssize_t win_n, + unsigned int ddof, + float64_t nobs, + int64_t minp) nogil: + """ + Calculate weighted variance for a window using West's method. + + Paper: https://dl.acm.org/citation.cfm?id=359153 + + Parameters + ---------- + t: float64_t + sum of weighted squared differences + sum_w: float64_t + sum of weights + win_n: Py_ssize_t + window size + ddof: unsigned int + delta degrees of freedom + nobs: float64_t + number of observations + minp: int64_t + minimum number of observations + + Returns + ------- + result : float64_t + weighted variance of the window + """ + + cdef: + float64_t result + + # Variance is unchanged if no observation is added or removed + if (nobs >= minp) and (nobs > ddof): + + # pathological case + if nobs == 1: + result = 0 + else: + result = t * win_n / ((win_n - ddof) * sum_w) + if result < 0: + result = 0 else: - for win_i in range(win_n): - val_win = weights[win_i] - if val_win != val_win: - continue + result = NaN + + return result + + +cdef inline void add_weighted_var(float64_t val, + float64_t w, + float64_t *t, + float64_t *sum_w, + float64_t *mean, + float64_t *nobs) nogil: + """ + Update weighted mean, sum of weights and sum of weighted squared + differences to include value and weight pair in weighted variance + calculation using West's method. 
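`add_weighted_var` applies West's recurrence to fold one (value, weight) pair into the running weighted mean and the weighted sum of squared deviations. A pure-Python sketch of the same update, carrying the state as a tuple instead of C pointers (illustration only):

import math

def add_weighted(val, w, t, sum_w, mean, nobs):
    # Fold (val, w) into the weighted sum of squared deviations `t`, the total
    # weight `sum_w`, the running weighted mean `mean` and the observation count.
    if math.isnan(val):
        return t, sum_w, mean, nobs
    nobs += 1
    q = val - mean
    temp = sum_w + w
    r = q * w / temp
    mean += r
    t += r * sum_w * q
    sum_w = temp
    return t, sum_w, mean, nobs

state = (0.0, 0.0, 0.0, 0)
for v, w in [(1.0, 0.5), (2.0, 1.0), (4.0, 0.5)]:
    state = add_weighted(v, w, *state)
t, sum_w, mean, nobs = state  # mean == 2.25, t ~= 2.375 == sum(w * (x - mean) ** 2)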
+ + Paper: https://dl.acm.org/citation.cfm?id=359153 + + Parameters + ---------- + val: float64_t + window values + w: float64_t + window weights + t: float64_t + sum of weighted squared differences + sum_w: float64_t + sum of weights + mean: float64_t + weighted mean + nobs: float64_t + number of observations + """ + + cdef: + float64_t temp, q, r + + if isnan(val): + return + + nobs[0] = nobs[0] + 1 + + q = val - mean[0] + temp = sum_w[0] + w + r = q * w / temp - for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = values[in_i] + mean[0] = mean[0] + r + t[0] = t[0] + r * sum_w[0] * q + sum_w[0] = temp - if val_in == val_in: - output[in_i + (win_n - win_i) - 1] += val_in * val_win - counts[in_i + (win_n - win_i) - 1] += 1 - for in_i in range(in_n): - c = counts[in_i] - if c < minp: - output[in_i] = NaN +cdef inline void remove_weighted_var(float64_t val, + float64_t w, + float64_t *t, + float64_t *sum_w, + float64_t *mean, + float64_t *nobs) nogil: + """ + Update weighted mean, sum of weights and sum of weighted squared + differences to remove value and weight pair from weighted variance + calculation using West's method. + + Paper: https://dl.acm.org/citation.cfm?id=359153 + + Parameters + ---------- + val: float64_t + window values + w: float64_t + window weights + t: float64_t + sum of weighted squared differences + sum_w: float64_t + sum of weights + mean: float64_t + weighted mean + nobs: float64_t + number of observations + """ + + cdef: + float64_t temp, q, r + + if notnan(val): + nobs[0] = nobs[0] - 1 + + if nobs[0]: + q = val - mean[0] + temp = sum_w[0] - w + r = q * w / temp + + mean[0] = mean[0] - r + t[0] = t[0] - r * sum_w[0] * q + sum_w[0] = temp + + else: + t[0] = 0 + sum_w[0] = 0 + mean[0] = 0 + + +def roll_weighted_var(float64_t[:] values, float64_t[:] weights, + int64_t minp, unsigned int ddof): + """ + Calculates weighted rolling variance using West's online algorithm. 
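For a single full window, the incremental state that `calc_weighted_var` consumes reduces to the closed form below; a brute-force NumPy check (the helper name is hypothetical, and min_periods/NaN handling is ignored):

import numpy as np

def weighted_var_reference(x, w, ddof=1):
    # t is the weighted sum of squared deviations from the weighted mean;
    # calc_weighted_var scales it by win_n / ((win_n - ddof) * sum_w).
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)
    win_n = len(w)
    sum_w = w.sum()
    mean = (w * x).sum() / sum_w
    t = (w * (x - mean) ** 2).sum()
    return t * win_n / ((win_n - ddof) * sum_w)

weighted_var_reference([1.0, 2.0, 4.0], [0.5, 1.0, 0.5])  # ~1.78125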
+ + Paper: https://dl.acm.org/citation.cfm?id=359153 + + Parameters + ---------- + values: float64_t[:] + values to roll window over + weights: float64_t[:] + array of weights whose length is window size + minp: int64_t + minimum number of observations to calculate + variance of a window + ddof: unsigned int + the divisor used in variance calculations + is the window size - ddof + + Returns + ------- + output: float64_t[:] + weighted variances of windows + """ + + cdef: + float64_t t = 0, sum_w = 0, mean = 0, nobs = 0 + float64_t val, pre_val, w, pre_w + Py_ssize_t i, n, win_n + float64_t[:] output + + n = len(values) + win_n = len(weights) + output = np.empty(n, dtype=float) + + with nogil: + + for i in range(win_n): + add_weighted_var(values[i], weights[i], &t, + &sum_w, &mean, &nobs) + + output[i] = calc_weighted_var(t, sum_w, win_n, + ddof, nobs, minp) + + for i in range(win_n, n): + val = values[i] + pre_val = values[i - win_n] + + w = weights[i % win_n] + pre_w = weights[(i - win_n) % win_n] + + if notnan(val): + if pre_val == pre_val: + remove_weighted_var(pre_val, pre_w, &t, + &sum_w, &mean, &nobs) + + add_weighted_var(val, w, &t, &sum_w, &mean, &nobs) + + elif pre_val == pre_val: + remove_weighted_var(pre_val, pre_w, &t, + &sum_w, &mean, &nobs) + + output[i] = calc_weighted_var(t, sum_w, win_n, + ddof, nobs, minp) return output @@ -1758,8 +1770,7 @@ def _roll_weighted_sum_mean(float64_t[:] values, float64_t[:] weights, # Exponentially weighted moving average -def ewma(float64_t[:] vals, float64_t com, - int adjust, int ignore_na, int minp): +def ewma(float64_t[:] vals, float64_t com, int adjust, bint ignore_na, int minp): """ Compute exponentially-weighted moving average using center-of-mass. @@ -1768,12 +1779,12 @@ def ewma(float64_t[:] vals, float64_t com, vals : ndarray (float64 type) com : float64 adjust: int - ignore_na: int + ignore_na: bool minp: int Returns ------- - y : ndarray + ndarray """ cdef: @@ -1781,6 +1792,7 @@ def ewma(float64_t[:] vals, float64_t com, ndarray[float64_t] output = np.empty(N, dtype=float) float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur Py_ssize_t i, nobs + bint is_observation if N == 0: return output @@ -1797,29 +1809,30 @@ def ewma(float64_t[:] vals, float64_t com, output[0] = weighted_avg if (nobs >= minp) else NaN old_wt = 1. - for i in range(1, N): - cur = vals[i] - is_observation = (cur == cur) - nobs += int(is_observation) - if weighted_avg == weighted_avg: - - if is_observation or (not ignore_na): - - old_wt *= old_wt_factor - if is_observation: - - # avoid numerical errors on constant series - if weighted_avg != cur: - weighted_avg = ((old_wt * weighted_avg) + - (new_wt * cur)) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1. - elif is_observation: - weighted_avg = cur + with nogil: + for i in range(1, N): + cur = vals[i] + is_observation = (cur == cur) + nobs += is_observation + if weighted_avg == weighted_avg: + + if is_observation or (not ignore_na): + + old_wt *= old_wt_factor + if is_observation: + + # avoid numerical errors on constant series + if weighted_avg != cur: + weighted_avg = ((old_wt * weighted_avg) + + (new_wt * cur)) / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1. 
+ elif is_observation: + weighted_avg = cur - output[i] = weighted_avg if (nobs >= minp) else NaN + output[i] = weighted_avg if (nobs >= minp) else NaN return output @@ -1829,7 +1842,7 @@ def ewma(float64_t[:] vals, float64_t com, def ewmcov(float64_t[:] input_x, float64_t[:] input_y, - float64_t com, int adjust, int ignore_na, int minp, int bias): + float64_t com, int adjust, bint ignore_na, int minp, int bias): """ Compute exponentially-weighted moving variance using center-of-mass. @@ -1839,25 +1852,27 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, input_y : ndarray (float64 type) com : float64 adjust: int - ignore_na: int + ignore_na: bool minp: int bias: int Returns ------- - y : ndarray + ndarray """ cdef: Py_ssize_t N = len(input_x) float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y + float64_t numerator, denominator Py_ssize_t i, nobs ndarray[float64_t] output + bint is_observation if len(input_y) != N: - raise ValueError("arrays are of different lengths " - "({N} and {len_y})".format(N=N, len_y=len(input_y))) + raise ValueError(f"arrays are of different lengths " + f"({N} and {len(input_y)})") output = np.empty(N, dtype=float) if N == 0: @@ -1882,55 +1897,57 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, sum_wt2 = 1. old_wt = 1. - for i in range(1, N): - cur_x = input_x[i] - cur_y = input_y[i] - is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) - nobs += int(is_observation) - if mean_x == mean_x: - if is_observation or (not ignore_na): - sum_wt *= old_wt_factor - sum_wt2 *= (old_wt_factor * old_wt_factor) - old_wt *= old_wt_factor - if is_observation: - old_mean_x = mean_x - old_mean_y = mean_y - - # avoid numerical errors on constant series - if mean_x != cur_x: - mean_x = ((old_wt * old_mean_x) + - (new_wt * cur_x)) / (old_wt + new_wt) - - # avoid numerical errors on constant series - if mean_y != cur_y: - mean_y = ((old_wt * old_mean_y) + - (new_wt * cur_y)) / (old_wt + new_wt) - cov = ((old_wt * (cov + ((old_mean_x - mean_x) * - (old_mean_y - mean_y)))) + - (new_wt * ((cur_x - mean_x) * - (cur_y - mean_y)))) / (old_wt + new_wt) - sum_wt += new_wt - sum_wt2 += (new_wt * new_wt) - old_wt += new_wt - if not adjust: - sum_wt /= old_wt - sum_wt2 /= (old_wt * old_wt) - old_wt = 1. - elif is_observation: - mean_x = cur_x - mean_y = cur_y + with nogil: - if nobs >= minp: - if not bias: - numerator = sum_wt * sum_wt - denominator = numerator - sum_wt2 - if (denominator > 0.): - output[i] = ((numerator / denominator) * cov) + for i in range(1, N): + cur_x = input_x[i] + cur_y = input_y[i] + is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) + nobs += is_observation + if mean_x == mean_x: + if is_observation or (not ignore_na): + sum_wt *= old_wt_factor + sum_wt2 *= (old_wt_factor * old_wt_factor) + old_wt *= old_wt_factor + if is_observation: + old_mean_x = mean_x + old_mean_y = mean_y + + # avoid numerical errors on constant series + if mean_x != cur_x: + mean_x = ((old_wt * old_mean_x) + + (new_wt * cur_x)) / (old_wt + new_wt) + + # avoid numerical errors on constant series + if mean_y != cur_y: + mean_y = ((old_wt * old_mean_y) + + (new_wt * cur_y)) / (old_wt + new_wt) + cov = ((old_wt * (cov + ((old_mean_x - mean_x) * + (old_mean_y - mean_y)))) + + (new_wt * ((cur_x - mean_x) * + (cur_y - mean_y)))) / (old_wt + new_wt) + sum_wt += new_wt + sum_wt2 += (new_wt * new_wt) + old_wt += new_wt + if not adjust: + sum_wt /= old_wt + sum_wt2 /= (old_wt * old_wt) + old_wt = 1. 
+ elif is_observation: + mean_x = cur_x + mean_y = cur_y + + if nobs >= minp: + if not bias: + numerator = sum_wt * sum_wt + denominator = numerator - sum_wt2 + if (denominator > 0.): + output[i] = ((numerator / denominator) * cov) + else: + output[i] = NaN else: - output[i] = NaN + output[i] = cov else: - output[i] = cov - else: - output[i] = NaN + output[i] = NaN return output diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx new file mode 100644 index 0000000000000..2d01d1964c043 --- /dev/null +++ b/pandas/_libs/window/indexers.pyx @@ -0,0 +1,105 @@ +# cython: boundscheck=False, wraparound=False, cdivision=True + +import numpy as np +from numpy cimport ndarray, int64_t + +# Cython routines for window indexers + + +def calculate_variable_window_bounds( + int64_t num_values, + int64_t window_size, + object min_periods, # unused but here to match get_window_bounds signature + object center, # unused but here to match get_window_bounds signature + object closed, + const int64_t[:] index +): + """ + Calculate window boundaries for rolling windows from a time offset. + + Parameters + ---------- + num_values : int64 + total number of values + + window_size : int64 + window size calculated from the offset + + min_periods : object + ignored, exists for compatibility + + center : object + ignored, exists for compatibility + + closed : str + string of side of the window that should be closed + + index : ndarray[int64] + time series index to roll over + + Returns + ------- + (ndarray[int64], ndarray[int64]) + """ + cdef: + bint left_closed = False + bint right_closed = False + ndarray[int64_t, ndim=1] start, end + int64_t start_bound, end_bound + Py_ssize_t i, j + + # if windows is variable, default is 'right', otherwise default is 'both' + if closed is None: + closed = 'right' if index is not None else 'both' + + if closed in ['right', 'both']: + right_closed = True + + if closed in ['left', 'both']: + left_closed = True + + start = np.empty(num_values, dtype='int64') + start.fill(-1) + end = np.empty(num_values, dtype='int64') + end.fill(-1) + + start[0] = 0 + + # right endpoint is closed + if right_closed: + end[0] = 1 + # right endpoint is open + else: + end[0] = 0 + + with nogil: + + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, num_values): + end_bound = index[i] + start_bound = index[i] - window_size + + # left endpoint is closed + if left_closed: + start_bound -= 1 + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + if index[j] > start_bound: + start[i] = j + break + + # end bound is previous end + # or current index + if index[end[i - 1]] <= end_bound: + end[i] = i + 1 + else: + end[i] = end[i - 1] + + # right endpoint is open + if not right_closed: + end[i] -= 1 + return start, end diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 1775893b9f2bf..73201e75c3c88 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -70,7 +70,7 @@ def write_csv_rows(list data, ndarray data_index, @cython.boundscheck(False) @cython.wraparound(False) -def convert_json_to_lines(object arr): +def convert_json_to_lines(arr: object) -> str: """ replace comma separated json with line feeds, paying special attention to quotes & brackets diff --git a/pandas/_testing.py b/pandas/_testing.py new file mode 100644 index 0000000000000..0b81fb0f7a8d5 --- /dev/null +++ b/pandas/_testing.py @@ -0,0 +1,2745 @@ +import bz2 
+from collections import Counter +from contextlib import contextmanager +from datetime import datetime +from functools import wraps +import gzip +import os +from shutil import rmtree +import string +import tempfile +from typing import Any, List, Optional, Union, cast +import warnings +import zipfile + +import numpy as np +from numpy.random import rand, randn + +from pandas._config.localization import ( # noqa:F401 + can_set_locale, + get_locales, + set_locale, +) + +import pandas._libs.testing as _testing +from pandas._typing import FilePathOrBuffer, FrameOrSeries +from pandas.compat import _get_lzma_file, _import_lzma + +from pandas.core.dtypes.common import ( + is_bool, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_interval_dtype, + is_list_like, + is_number, + is_period_dtype, + is_sequence, + is_timedelta64_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.missing import array_equivalent + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + bdate_range, +) +from pandas.core.algorithms import take_1d +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + IntervalArray, + PeriodArray, + TimedeltaArray, + period_array, +) + +from pandas.io.common import urlopen +from pandas.io.formats.printing import pprint_thing + +lzma = _import_lzma() + +N = 30 +K = 4 +_RAISE_NETWORK_ERROR_DEFAULT = False + +# set testing_mode +_testing_mode_warnings = (DeprecationWarning, ResourceWarning) + + +def set_testing_mode(): + # set the testing mode filters + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("always", _testing_mode_warnings) + + +def reset_testing_mode(): + # reset the testing mode filters + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("ignore", _testing_mode_warnings) + + +set_testing_mode() + + +def reset_display_options(): + """ + Reset the display options for printing and representing objects. + """ + pd.reset_option("^display.", silent=True) + + +def round_trip_pickle( + obj: Any, path: Optional[FilePathOrBuffer] = None +) -> FrameOrSeries: + """ + Pickle an object and then read it again. + + Parameters + ---------- + obj : any object + The object to pickle and then re-read. + path : str, path object or file-like object, default None + The path where the pickled object is written and then read. + + Returns + ------- + pandas object + The original object that was pickled and then re-read. + """ + _path = path + if _path is None: + _path = f"__{rands(10)}__.pickle" + with ensure_clean(_path) as path: + pd.to_pickle(obj, _path) + return pd.read_pickle(_path) + + +def round_trip_pathlib(writer, reader, path: Optional[str] = None): + """ + Write an object to file specified by a pathlib.Path and read it back + + Parameters + ---------- + writer : callable bound to pandas object + IO writing function (e.g. DataFrame.to_csv ) + reader : callable + IO reading function (e.g. pd.read_csv ) + path : str, default None + The path where the object is written and then read. + + Returns + ------- + pandas object + The original object that was serialized and then re-read. 
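The round-trip helpers are thin wrappers intended for tests. A short usage sketch, assuming `pandas._testing` is importable as added by this patch:

import pandas as pd
import pandas._testing as tm

s = pd.Series([1, 2, 3], name="x")
result = tm.round_trip_pickle(s)   # pickle to a temporary file and read it back
tm.assert_series_equal(result, s)  # the round trip should be lossless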
+ """ + import pytest + + Path = pytest.importorskip("pathlib").Path + if path is None: + path = "___pathlib___" + with ensure_clean(path) as path: + writer(Path(path)) + obj = reader(Path(path)) + return obj + + +def round_trip_localpath(writer, reader, path: Optional[str] = None): + """ + Write an object to file specified by a py.path LocalPath and read it back. + + Parameters + ---------- + writer : callable bound to pandas object + IO writing function (e.g. DataFrame.to_csv ) + reader : callable + IO reading function (e.g. pd.read_csv ) + path : str, default None + The path where the object is written and then read. + + Returns + ------- + pandas object + The original object that was serialized and then re-read. + """ + import pytest + + LocalPath = pytest.importorskip("py.path").local + if path is None: + path = "___localpath___" + with ensure_clean(path) as path: + writer(LocalPath(path)) + obj = reader(LocalPath(path)) + return obj + + +@contextmanager +def decompress_file(path, compression): + """ + Open a compressed file and return a file object. + + Parameters + ---------- + path : str + The path where the file is read from. + + compression : {'gzip', 'bz2', 'zip', 'xz', None} + Name of the decompression to use + + Returns + ------- + file object + """ + if compression is None: + f = open(path, "rb") + elif compression == "gzip": + f = gzip.open(path, "rb") + elif compression == "bz2": + f = bz2.BZ2File(path, "rb") + elif compression == "xz": + f = _get_lzma_file(lzma)(path, "rb") + elif compression == "zip": + zip_file = zipfile.ZipFile(path) + zip_names = zip_file.namelist() + if len(zip_names) == 1: + f = zip_file.open(zip_names.pop()) + else: + raise ValueError(f"ZIP file {path} error. Only one file per ZIP.") + else: + raise ValueError(f"Unrecognized compression type: {compression}") + + try: + yield f + finally: + f.close() + if compression == "zip": + zip_file.close() + + +def write_to_compressed(compression, path, data, dest="test"): + """ + Write data to a compressed file. + + Parameters + ---------- + compression : {'gzip', 'bz2', 'zip', 'xz'} + The compression type to use. + path : str + The file path to write the data. + data : str + The data to write. + dest : str, default "test" + The destination file (for ZIP only) + + Raises + ------ + ValueError : An invalid compression value was passed in. + """ + if compression == "zip": + import zipfile + + compress_method = zipfile.ZipFile + elif compression == "gzip": + import gzip + + compress_method = gzip.GzipFile + elif compression == "bz2": + import bz2 + + compress_method = bz2.BZ2File + elif compression == "xz": + compress_method = _get_lzma_file(lzma) + else: + raise ValueError(f"Unrecognized compression type: {compression}") + + if compression == "zip": + mode = "w" + args = (dest, data) + method = "writestr" + else: + mode = "wb" + args = (data,) + method = "write" + + with compress_method(path, mode=mode) as f: + getattr(f, method)(*args) + + +def assert_almost_equal( + left, + right, + check_dtype: Union[bool, str] = "equiv", + check_less_precise: Union[bool, int] = False, + **kwargs, +): + """ + Check that the left and right objects are approximately equal. + + By approximately equal, we refer to objects that are numbers or that + contain numbers which may be equivalent to specific levels of precision. + + Parameters + ---------- + left : object + right : object + check_dtype : bool or {'equiv'}, default 'equiv' + Check dtype if both a and b are the same type. 
If 'equiv' is passed in, + then `RangeIndex` and `Int64Index` are also considered equivalent + when doing type checking. + check_less_precise : bool or int, default False + Specify comparison precision. 5 digits (False) or 3 digits (True) + after decimal points are compared. If int, then specify the number + of digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + """ + if isinstance(left, pd.Index): + assert_index_equal( + left, + right, + check_exact=False, + exact=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + elif isinstance(left, pd.Series): + assert_series_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + elif isinstance(left, pd.DataFrame): + assert_frame_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + else: + # Other sequences. + if check_dtype: + if is_number(left) and is_number(right): + # Do not compare numeric classes, like np.float64 and float. + pass + elif is_bool(left) and is_bool(right): + # Do not compare bool classes, like np.bool_ and bool. + pass + else: + if isinstance(left, np.ndarray) or isinstance(right, np.ndarray): + obj = "numpy array" + else: + obj = "Input" + assert_class_equal(left, right, obj=obj) + _testing.assert_almost_equal( + left, + right, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + +def _check_isinstance(left, right, cls): + """ + Helper method for our assert_* methods that ensures that + the two objects being compared have the right type before + proceeding with the comparison. + + Parameters + ---------- + left : The first object being compared. + right : The second object being compared. + cls : The class type to check against. + + Raises + ------ + AssertionError : Either `left` or `right` is not an instance of `cls`. + """ + cls_name = cls.__name__ + + if not isinstance(left, cls): + raise AssertionError( + f"{cls_name} Expected type {cls}, found {type(left)} instead" + ) + if not isinstance(right, cls): + raise AssertionError( + f"{cls_name} Expected type {cls}, found {type(right)} instead" + ) + + +def assert_dict_equal(left, right, compare_keys: bool = True): + + _check_isinstance(left, right, dict) + _testing.assert_dict_equal(left, right, compare_keys=compare_keys) + + +def randbool(size=(), p: float = 0.5): + return rand(*size) <= p + + +RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) +RANDU_CHARS = np.array( + list("".join(map(chr, range(1488, 1488 + 26))) + string.digits), + dtype=(np.unicode_, 1), +) + + +def rands_array(nchars, size, dtype="O"): + """ + Generate an array of byte strings. + """ + retval = ( + np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) + .view((np.str_, nchars)) + .reshape(size) + ) + if dtype is None: + return retval + else: + return retval.astype(dtype) + + +def randu_array(nchars, size, dtype="O"): + """ + Generate an array of unicode strings. 
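`rands_array`/`randu_array` build fixed-width string arrays by drawing single characters and reinterpreting the buffer. A standalone sketch of the same trick with hypothetical sizes (illustration only):

import string
import numpy as np

chars = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1))
nchars, size = 5, (3,)
# Draw nchars * prod(size) single characters, then view the buffer as
# fixed-width strings of length nchars and reshape to the requested shape.
flat = np.random.choice(chars, size=nchars * int(np.prod(size)))
arr = flat.view((np.str_, nchars)).reshape(size)
arr.shape, arr.dtype  # ((3,), dtype('<U5'))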
+ """ + retval = ( + np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) + .view((np.unicode_, nchars)) + .reshape(size) + ) + if dtype is None: + return retval + else: + return retval.astype(dtype) + + +def rands(nchars): + """ + Generate one random byte string. + + See `rands_array` if you want to create an array of random strings. + + """ + return "".join(np.random.choice(RANDS_CHARS, nchars)) + + +def randu(nchars): + """ + Generate one random unicode string. + + See `randu_array` if you want to create an array of random unicode strings. + + """ + return "".join(np.random.choice(RANDU_CHARS, nchars)) + + +def close(fignum=None): + from matplotlib.pyplot import get_fignums, close as _close + + if fignum is None: + for fignum in get_fignums(): + _close(fignum) + else: + _close(fignum) + + +# ----------------------------------------------------------------------------- +# contextmanager to ensure the file cleanup + + +@contextmanager +def ensure_clean(filename=None, return_filelike=False): + """ + Gets a temporary path and agrees to remove on close. + + Parameters + ---------- + filename : str (optional) + if None, creates a temporary file which is then removed when out of + scope. if passed, creates temporary file with filename as ending. + return_filelike : bool (default False) + if True, returns a file-like which is *always* cleaned. Necessary for + savefig and other functions which want to append extensions. + """ + filename = filename or "" + fd = None + + if return_filelike: + f = tempfile.TemporaryFile(suffix=filename) + try: + yield f + finally: + f.close() + else: + # don't generate tempfile if using a path with directory specified + if len(os.path.dirname(filename)): + raise ValueError("Can't pass a qualified name to ensure_clean()") + + try: + fd, filename = tempfile.mkstemp(suffix=filename) + except UnicodeEncodeError: + import pytest + + pytest.skip("no unicode file names on this system") + + try: + yield filename + finally: + try: + os.close(fd) + except OSError: + print(f"Couldn't close file descriptor: {fd} (file: {filename})") + try: + if os.path.exists(filename): + os.remove(filename) + except OSError as e: + print(f"Exception on removing file: {e}") + + +@contextmanager +def ensure_clean_dir(): + """ + Get a temporary directory path and agrees to remove on close. + + Yields + ------ + Temporary directory path + """ + directory_name = tempfile.mkdtemp(suffix="") + try: + yield directory_name + finally: + try: + rmtree(directory_name) + except OSError: + pass + + +@contextmanager +def ensure_safe_environment_variables(): + """ + Get a context manager to safely set environment variables + + All changes will be undone on close, hence environment variables set + within this contextmanager will neither persist nor change global state. + """ + saved_environ = dict(os.environ) + try: + yield + finally: + os.environ.clear() + os.environ.update(saved_environ) + + +# ----------------------------------------------------------------------------- +# Comparators + + +def equalContents(arr1, arr2) -> bool: + """ + Checks if the set of unique elements of arr1 and arr2 are equivalent. + """ + return frozenset(arr1) == frozenset(arr2) + + +def assert_index_equal( + left: Index, + right: Index, + exact: Union[bool, str] = "equiv", + check_names: bool = True, + check_less_precise: Union[bool, int] = False, + check_exact: bool = True, + check_categorical: bool = True, + obj: str = "Index", +) -> None: + """ + Check that left and right Index are equal. 
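`assert_index_equal` with `exact="equiv"` (the default) deliberately treats `RangeIndex` and `Int64Index` as interchangeable, as the parameter description below spells out. A small usage sketch, assuming a pandas build where `Int64Index` is still public:

import pandas as pd
import pandas._testing as tm

left = pd.RangeIndex(3)
right = pd.Int64Index([0, 1, 2])
tm.assert_index_equal(left, right, exact="equiv")  # passes: classes are "equivalent"
# tm.assert_index_equal(left, right, exact=True)   # would raise: classes differ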
+ + Parameters + ---------- + left : Index + right : Index + exact : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substituted for + Int64Index as well. + check_names : bool, default True + Whether to check the names attribute. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + check_exact : bool, default True + Whether to compare number exactly. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + obj : str, default 'Index' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + def _check_types(l, r, obj="Index"): + if exact: + assert_class_equal(l, r, exact=exact, obj=obj) + + # Skip exact dtype checking when `check_categorical` is False + if check_categorical: + assert_attr_equal("dtype", l, r, obj=obj) + + # allow string-like to have different inferred_types + if l.inferred_type in ("string", "unicode"): + assert r.inferred_type in ("string", "unicode") + else: + assert_attr_equal("inferred_type", l, r, obj=obj) + + def _get_ilevel_values(index, level): + # accept level number only + unique = index.levels[level] + level_codes = index.codes[level] + filled = take_1d(unique._values, level_codes, fill_value=unique._na_value) + values = unique._shallow_copy(filled, name=index.names[level]) + return values + + # instance validation + _check_isinstance(left, right, Index) + + # class / dtype comparison + _check_types(left, right, obj=obj) + + # level comparison + if left.nlevels != right.nlevels: + msg1 = f"{obj} levels are different" + msg2 = f"{left.nlevels}, {left}" + msg3 = f"{right.nlevels}, {right}" + raise_assert_detail(obj, msg1, msg2, msg3) + + # length comparison + if len(left) != len(right): + msg1 = f"{obj} length are different" + msg2 = f"{len(left)}, {left}" + msg3 = f"{len(right)}, {right}" + raise_assert_detail(obj, msg1, msg2, msg3) + + # MultiIndex special comparison for little-friendly error messages + if left.nlevels > 1: + left = cast(MultiIndex, left) + right = cast(MultiIndex, right) + + for level in range(left.nlevels): + # cannot use get_level_values here because it can change dtype + llevel = _get_ilevel_values(left, level) + rlevel = _get_ilevel_values(right, level) + + lobj = f"MultiIndex level [{level}]" + assert_index_equal( + llevel, + rlevel, + exact=exact, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + obj=lobj, + ) + # get_level_values may change dtype + _check_types(left.levels[level], right.levels[level], obj=obj) + + # skip exact index checking when `check_categorical` is False + if check_exact and check_categorical: + if not left.equals(right): + diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) + msg = f"{obj} values are different ({np.round(diff, 5)} %)" + raise_assert_detail(obj, msg, left, right) + else: + _testing.assert_almost_equal( + left.values, + right.values, + check_less_precise=check_less_precise, + check_dtype=exact, + obj=obj, + lobj=left, + robj=right, + ) + + # metadata comparison + if check_names: + assert_attr_equal("names", left, right, obj=obj) + if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): + assert_attr_equal("freq", left, 
right, obj=obj) + if isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex): + assert_interval_array_equal(left.values, right.values) + + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, obj=f"{obj} category") + + +def assert_class_equal(left, right, exact: Union[bool, str] = True, obj="Input"): + """ + Checks classes are equal. + """ + __tracebackhide__ = True + + def repr_class(x): + if isinstance(x, Index): + # return Index as it is to include values in the error message + return x + + try: + return type(x).__name__ + except AttributeError: + return repr(type(x)) + + if exact == "equiv": + if type(left) != type(right): + # allow equivalence of Int64Index/RangeIndex + types = {type(left).__name__, type(right).__name__} + if len(types - {"Int64Index", "RangeIndex"}): + msg = f"{obj} classes are not equivalent" + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) + elif exact: + if type(left) != type(right): + msg = f"{obj} classes are different" + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) + + +def assert_attr_equal(attr, left, right, obj="Attributes"): + """checks attributes are equal. Both objects must have attribute. + + Parameters + ---------- + attr : str + Attribute name being compared. + left : object + right : object + obj : str, default 'Attributes' + Specify object name being compared, internally used to show appropriate + assertion message + """ + __tracebackhide__ = True + + left_attr = getattr(left, attr) + right_attr = getattr(right, attr) + + if left_attr is right_attr: + return True + elif ( + is_number(left_attr) + and np.isnan(left_attr) + and is_number(right_attr) + and np.isnan(right_attr) + ): + # np.nan + return True + + try: + result = left_attr == right_attr + except TypeError: + # datetimetz on rhs may raise TypeError + result = False + if not isinstance(result, bool): + result = result.all() + + if result: + return True + else: + msg = f'Attribute "{attr}" are different' + raise_assert_detail(obj, msg, left_attr, right_attr) + + +def assert_is_valid_plot_return_object(objs): + import matplotlib.pyplot as plt + + if isinstance(objs, (pd.Series, np.ndarray)): + for el in objs.ravel(): + msg = ( + "one of 'objs' is not a matplotlib Axes instance, " + f"type encountered {repr(type(el).__name__)}" + ) + assert isinstance(el, (plt.Axes, dict)), msg + else: + msg = ( + "objs is neither an ndarray of Artist instances nor a single " + "ArtistArtist instance, tuple, or dict, 'objs' is a " + f"{repr(type(objs).__name__)}" + ) + assert isinstance(objs, (plt.Artist, tuple, dict)), msg + + +def isiterable(obj): + return hasattr(obj, "__iter__") + + +def assert_is_sorted(seq): + """Assert that the sequence is sorted.""" + if isinstance(seq, (Index, Series)): + seq = seq.values + # sorting does not change precisions + assert_numpy_array_equal(seq, np.sort(np.array(seq))) + + +def assert_categorical_equal( + left, right, check_dtype=True, check_category_order=True, obj="Categorical" +): + """Test that Categoricals are equivalent. + + Parameters + ---------- + left : Categorical + right : Categorical + check_dtype : bool, default True + Check that integer dtype of the codes are the same + check_category_order : bool, default True + Whether the order of the categories should be compared, which + implies identical integer codes. If False, only the resulting + values are compared. The ordered attribute is + checked regardless. 
+ obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message + """ + _check_isinstance(left, right, Categorical) + + if check_category_order: + assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories") + assert_numpy_array_equal( + left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", + ) + else: + assert_index_equal( + left.categories.sort_values(), + right.categories.sort_values(), + obj=f"{obj}.categories", + ) + assert_index_equal( + left.categories.take(left.codes), + right.categories.take(right.codes), + obj=f"{obj}.values", + ) + + assert_attr_equal("ordered", left, right, obj=obj) + + +def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray"): + """Test that two IntervalArrays are equivalent. + + Parameters + ---------- + left, right : IntervalArray + The IntervalArrays to compare. + exact : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substituted for + Int64Index as well. + obj : str, default 'IntervalArray' + Specify object name being compared, internally used to show appropriate + assertion message + """ + _check_isinstance(left, right, IntervalArray) + + assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left") + assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.left") + assert_attr_equal("closed", left, right, obj=obj) + + +def assert_period_array_equal(left, right, obj="PeriodArray"): + _check_isinstance(left, right, PeriodArray) + + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}.values") + assert_attr_equal("freq", left, right, obj=obj) + + +def assert_datetime_array_equal(left, right, obj="DatetimeArray"): + __tracebackhide__ = True + _check_isinstance(left, right, DatetimeArray) + + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") + assert_attr_equal("freq", left, right, obj=obj) + assert_attr_equal("tz", left, right, obj=obj) + + +def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): + __tracebackhide__ = True + _check_isinstance(left, right, TimedeltaArray) + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") + assert_attr_equal("freq", left, right, obj=obj) + + +def raise_assert_detail(obj, message, left, right, diff=None): + __tracebackhide__ = True + + if isinstance(left, np.ndarray): + left = pprint_thing(left) + elif is_categorical_dtype(left): + left = repr(left) + + if isinstance(right, np.ndarray): + right = pprint_thing(right) + elif is_categorical_dtype(right): + right = repr(right) + + msg = f"""{obj} are different + +{message} +[left]: {left} +[right]: {right}""" + + if diff is not None: + msg += f"\n[diff]: {diff}" + + raise AssertionError(msg) + + +def assert_numpy_array_equal( + left, + right, + strict_nan=False, + check_dtype=True, + err_msg=None, + check_same=None, + obj="numpy array", +): + """ + Check that 'np.ndarray' is equivalent. + + Parameters + ---------- + left, right : numpy.ndarray or iterable + The two arrays to be compared. + strict_nan : bool, default False + If True, consider NaN and None to be different. + check_dtype : bool, default True + Check dtype if both a and b are np.ndarray. + err_msg : str, default None + If provided, used as assertion message. + check_same : None|'copy'|'same', default None + Ensure left and right refer/do not refer to the same memory area. 
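`check_same` in `assert_numpy_array_equal` compares the arrays' base buffers rather than their values. A short sketch of both modes (illustration only, assuming `pandas._testing` as added by this patch):

import numpy as np
import pandas._testing as tm

a = np.arange(5)
tm.assert_numpy_array_equal(a, a.view(), check_same="same")  # views share one buffer
tm.assert_numpy_array_equal(a, a.copy(), check_same="copy")  # a copy must not share it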
+ obj : str, default 'numpy array' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + # instance validation + # Show a detailed error message when classes are different + assert_class_equal(left, right, obj=obj) + # both classes must be an np.ndarray + _check_isinstance(left, right, np.ndarray) + + def _get_base(obj): + return obj.base if getattr(obj, "base", None) is not None else obj + + left_base = _get_base(left) + right_base = _get_base(right) + + if check_same == "same": + if left_base is not right_base: + raise AssertionError(f"{repr(left_base)} is not {repr(right_base)}") + elif check_same == "copy": + if left_base is right_base: + raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") + + def _raise(left, right, err_msg): + if err_msg is None: + if left.shape != right.shape: + raise_assert_detail( + obj, f"{obj} shapes are different", left.shape, right.shape, + ) + + diff = 0 + for l, r in zip(left, right): + # count up differences + if not array_equivalent(l, r, strict_nan=strict_nan): + diff += 1 + + diff = diff * 100.0 / left.size + msg = f"{obj} values are different ({np.round(diff, 5)} %)" + raise_assert_detail(obj, msg, left, right) + + raise AssertionError(err_msg) + + # compare shape and values + if not array_equivalent(left, right, strict_nan=strict_nan): + _raise(left, right, err_msg) + + if check_dtype: + if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): + assert_attr_equal("dtype", left, right, obj=obj) + + +def assert_extension_array_equal( + left, right, check_dtype=True, check_less_precise=False, check_exact=False +): + """Check that left and right ExtensionArrays are equal. + + Parameters + ---------- + left, right : ExtensionArray + The two arrays to compare + check_dtype : bool, default True + Whether to check if the ExtensionArray dtypes are identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + check_exact : bool, default False + Whether to compare number exactly. + + Notes + ----- + Missing values are checked separately from valid values. + A mask of missing values is computed for each and checked to match. + The remaining all-valid values are cast to object dtype and checked. 
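A quick usage sketch of the mask-then-values comparison described in the Notes above, assuming the nullable "Int64" extension dtype is available:

import pandas as pd
import pandas._testing as tm

left = pd.array([1, 2, None], dtype="Int64")
right = pd.array([1, 2, None], dtype="Int64")
tm.assert_extension_array_equal(left, right)  # NA masks match, then valid values match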
+ """ + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" + assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" + if check_dtype: + assert_attr_equal("dtype", left, right, obj="ExtensionArray") + + if hasattr(left, "asi8") and type(right) == type(left): + # Avoid slow object-dtype comparisons + assert_numpy_array_equal(left.asi8, right.asi8) + return + + left_na = np.asarray(left.isna()) + right_na = np.asarray(right.isna()) + assert_numpy_array_equal(left_na, right_na, obj="ExtensionArray NA mask") + + left_valid = np.asarray(left[~left_na].astype(object)) + right_valid = np.asarray(right[~right_na].astype(object)) + if check_exact: + assert_numpy_array_equal(left_valid, right_valid, obj="ExtensionArray") + else: + _testing.assert_almost_equal( + left_valid, + right_valid, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + obj="ExtensionArray", + ) + + +# This could be refactored to use the NDFrame.equals method +def assert_series_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_series_type=True, + check_less_precise=False, + check_names=True, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + obj="Series", +): + """ + Check that left and right Series are equal. + + Parameters + ---------- + left : Series + right : Series + check_dtype : bool, default True + Whether to check the Series dtype is identical. + check_index_type : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. + check_series_type : bool, default True + Whether to check the Series class is identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + check_names : bool, default True + Whether to check the Series and Index names attribute. + check_exact : bool, default False + Whether to compare number exactly. + check_datetimelike_compat : bool, default False + Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + obj : str, default 'Series' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + # instance validation + _check_isinstance(left, right, Series) + + if check_series_type: + # ToDo: There are some tests using rhs is sparse + # lhs is dense. 
Should use assert_class_equal in future + assert isinstance(left, type(right)) + # assert_class_equal(left, right, obj=obj) + + # length comparison + if len(left) != len(right): + msg1 = f"{len(left)}, {left.index}" + msg2 = f"{len(right)}, {right.index}" + raise_assert_detail(obj, "Series length are different", msg1, msg2) + + # index comparison + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.index", + ) + + if check_dtype: + # We want to skip exact dtype checking when `check_categorical` + # is False. We'll still raise if only one is a `Categorical`, + # regardless of `check_categorical` + if ( + is_categorical_dtype(left) + and is_categorical_dtype(right) + and not check_categorical + ): + pass + else: + assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") + + if check_exact: + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + obj=str(obj), + ) + elif check_datetimelike_compat: + # we want to check only if we have compat dtypes + # e.g. integer and M|m are NOT compat, but we can simply check + # the values in that case + if needs_i8_conversion(left) or needs_i8_conversion(right): + + # datetimelike may have different objects (e.g. datetime.datetime + # vs Timestamp) but will compare equal + if not Index(left.values).equals(Index(right.values)): + msg = ( + f"[datetimelike_compat=True] {left.values} " + f"is not equal to {right.values}." + ) + raise AssertionError(msg) + else: + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + ) + elif is_interval_dtype(left) or is_interval_dtype(right): + assert_interval_array_equal(left.array, right.array) + elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype): + # .values is an ndarray, but ._values is the ExtensionArray. + # TODO: Use .array + assert is_extension_array_dtype(right.dtype) + assert_extension_array_equal(left._values, right._values) + elif ( + is_extension_array_dtype(left) + and not is_categorical_dtype(left) + and is_extension_array_dtype(right) + and not is_categorical_dtype(right) + ): + assert_extension_array_equal(left.array, right.array) + else: + _testing.assert_almost_equal( + left._internal_get_values(), + right._internal_get_values(), + check_less_precise=check_less_precise, + check_dtype=check_dtype, + obj=str(obj), + ) + + # metadata comparison + if check_names: + assert_attr_equal("name", left, right, obj=obj) + + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, obj=f"{obj} category") + + +# This could be refactored to use the NDFrame.equals method +def assert_frame_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_column_type="equiv", + check_frame_type=True, + check_less_precise=False, + check_names=True, + by_blocks=False, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + check_like=False, + obj="DataFrame", +): + """ + Check that left and right DataFrame are equal. + + This function is intended to compare two DataFrames and output any + differences. Is is mostly intended for use in unit tests. + Additional parameters allow varying the strictness of the + equality checks performed. 
+ + Parameters + ---------- + left : DataFrame + First DataFrame to compare. + right : DataFrame + Second DataFrame to compare. + check_dtype : bool, default True + Whether to check the DataFrame dtype is identical. + check_index_type : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. + check_column_type : bool or {'equiv'}, default 'equiv' + Whether to check the columns class, dtype and inferred_type + are identical. Is passed as the ``exact`` argument of + :func:`assert_index_equal`. + check_frame_type : bool, default True + Whether to check the DataFrame class is identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + check_names : bool, default True + Whether to check that the `names` attribute for both the `index` + and `column` attributes of the DataFrame is identical. + by_blocks : bool, default False + Specify how to compare internal data. If False, compare by columns. + If True, compare by blocks. + check_exact : bool, default False + Whether to compare number exactly. + check_datetimelike_compat : bool, default False + Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + check_like : bool, default False + If True, ignore the order of index & columns. + Note: index labels must match their respective rows + (same as in columns) - same labels must be with the same data. + obj : str, default 'DataFrame' + Specify object name being compared, internally used to show appropriate + assertion message. + + See Also + -------- + assert_series_equal : Equivalent method for asserting Series equality. + DataFrame.equals : Check DataFrame equality. + + Examples + -------- + This example shows comparing two DataFrames that are equal + but with columns of differing dtypes. + + >>> from pandas._testing import assert_frame_equal + >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) + >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) + + df1 equals itself. + + >>> assert_frame_equal(df1, df1) + + df1 differs from df2 as column 'b' is of a different type. + + >>> assert_frame_equal(df1, df2) + Traceback (most recent call last): + ... + AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different + + Attribute "dtype" are different + [left]: int64 + [right]: float64 + + Ignore differing dtypes in columns with check_dtype. 
+ + >>> assert_frame_equal(df1, df2, check_dtype=False) + """ + __tracebackhide__ = True + + # instance validation + _check_isinstance(left, right, DataFrame) + + if check_frame_type: + assert isinstance(left, type(right)) + # assert_class_equal(left, right, obj=obj) + + # shape comparison + if left.shape != right.shape: + raise_assert_detail( + obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", + ) + + if check_like: + left, right = left.reindex_like(right), right + + # index comparison + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.index", + ) + + # column comparison + assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.columns", + ) + + # compare by blocks + if by_blocks: + rblocks = right._to_dict_of_blocks() + lblocks = left._to_dict_of_blocks() + for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): + assert dtype in lblocks + assert dtype in rblocks + assert_frame_equal( + lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj + ) + + # compare by columns + else: + for i, col in enumerate(left.columns): + assert col in right + lcol = left.iloc[:, i] + rcol = right.iloc[:, i] + assert_series_equal( + lcol, + rcol, + check_dtype=check_dtype, + check_index_type=check_index_type, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_names=check_names, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + obj=f'{obj}.iloc[:, {i}] (column name="{col}")', + ) + + +def assert_equal(left, right, **kwargs): + """ + Wrapper for tm.assert_*_equal to dispatch to the appropriate test function. + + Parameters + ---------- + left, right : Index, Series, DataFrame, ExtensionArray, or np.ndarray + The two items to be compared. + **kwargs + All keyword arguments are passed through to the underlying assert method. + """ + __tracebackhide__ = True + + if isinstance(left, pd.Index): + assert_index_equal(left, right, **kwargs) + elif isinstance(left, pd.Series): + assert_series_equal(left, right, **kwargs) + elif isinstance(left, pd.DataFrame): + assert_frame_equal(left, right, **kwargs) + elif isinstance(left, IntervalArray): + assert_interval_array_equal(left, right, **kwargs) + elif isinstance(left, PeriodArray): + assert_period_array_equal(left, right, **kwargs) + elif isinstance(left, DatetimeArray): + assert_datetime_array_equal(left, right, **kwargs) + elif isinstance(left, TimedeltaArray): + assert_timedelta_array_equal(left, right, **kwargs) + elif isinstance(left, ExtensionArray): + assert_extension_array_equal(left, right, **kwargs) + elif isinstance(left, np.ndarray): + assert_numpy_array_equal(left, right, **kwargs) + elif isinstance(left, str): + assert kwargs == {} + assert left == right + else: + raise NotImplementedError(type(left)) + + +def box_expected(expected, box_cls, transpose=True): + """ + Helper function to wrap the expected output of a test in a given box_class. 
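The assert_equal wrapper defined above dispatches on the type of its first argument, so tests can compare mixed fixtures with a single call; a short sketch under the same import assumption:

>>> import numpy as np
>>> import pandas as pd
>>> from pandas._testing import assert_equal
>>> assert_equal(pd.Index([1, 2, 3]), pd.Index([1, 2, 3]))   # assert_index_equal
>>> assert_equal(np.array([1, 2, 3]), np.array([1, 2, 3]))   # assert_numpy_array_equal
>>> assert_equal("a", "a")                                    # plain == for strings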
+ + Parameters + ---------- + expected : np.ndarray, Index, Series + box_cls : {Index, Series, DataFrame} + + Returns + ------- + subclass of box_cls + """ + if box_cls is pd.Index: + expected = pd.Index(expected) + elif box_cls is pd.Series: + expected = pd.Series(expected) + elif box_cls is pd.DataFrame: + expected = pd.Series(expected).to_frame() + if transpose: + # for vector operations, we we need a DataFrame to be a single-row, + # not a single-column, in order to operate against non-DataFrame + # vectors of the same length. + expected = expected.T + elif box_cls is PeriodArray: + # the PeriodArray constructor is not as flexible as period_array + expected = period_array(expected) + elif box_cls is DatetimeArray: + expected = DatetimeArray(expected) + elif box_cls is TimedeltaArray: + expected = TimedeltaArray(expected) + elif box_cls is np.ndarray: + expected = np.array(expected) + elif box_cls is to_array: + expected = to_array(expected) + else: + raise NotImplementedError(box_cls) + return expected + + +def to_array(obj): + # temporary implementation until we get pd.array in place + if is_period_dtype(obj): + return period_array(obj) + elif is_datetime64_dtype(obj) or is_datetime64tz_dtype(obj): + return DatetimeArray._from_sequence(obj) + elif is_timedelta64_dtype(obj): + return TimedeltaArray._from_sequence(obj) + else: + return np.array(obj) + + +# ----------------------------------------------------------------------------- +# Sparse + + +def assert_sp_array_equal( + left, + right, + check_dtype=True, + check_kind=True, + check_fill_value=True, + consolidate_block_indices=False, +): + """Check that the left and right SparseArray are equal. + + Parameters + ---------- + left : SparseArray + right : SparseArray + check_dtype : bool, default True + Whether to check the data dtype is identical. + check_kind : bool, default True + Whether to just the kind of the sparse index for each column. + check_fill_value : bool, default True + Whether to check that left.fill_value matches right.fill_value + consolidate_block_indices : bool, default False + Whether to consolidate contiguous blocks for sparse arrays with + a BlockIndex. Some operations, e.g. concat, will end up with + block indices that could be consolidated. Setting this to true will + create a new BlockIndex for that array, with consolidated + block indices. + """ + + _check_isinstance(left, right, pd.arrays.SparseArray) + + assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype) + + # SparseIndex comparison + assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex) + assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex) + + if not check_kind: + left_index = left.sp_index.to_block_index() + right_index = right.sp_index.to_block_index() + else: + left_index = left.sp_index + right_index = right.sp_index + + if consolidate_block_indices and left.kind == "block": + # we'll probably remove this hack... 
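+        # Round-tripping through an IntIndex rebuilds block boundaries from
+        # the raw positions, so two BlockIndex objects covering the same
+        # locations with differently split blocks compare equal afterwards.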
+ left_index = left_index.to_int_index().to_block_index() + right_index = right_index.to_int_index().to_block_index() + + if not left_index.equals(right_index): + raise_assert_detail( + "SparseArray.index", "index are not equal", left_index, right_index + ) + else: + # Just ensure a + pass + + if check_fill_value: + assert_attr_equal("fill_value", left, right) + if check_dtype: + assert_attr_equal("dtype", left, right) + assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) + + +# ----------------------------------------------------------------------------- +# Others + + +def assert_contains_all(iterable, dic): + for k in iterable: + assert k in dic, f"Did not contain item: {repr(k)}" + + +def assert_copy(iter1, iter2, **eql_kwargs): + """ + iter1, iter2: iterables that produce elements + comparable with assert_almost_equal + + Checks that the elements are equal, but not + the same object. (Does not check that items + in sequences are also not the same object) + """ + for elem1, elem2 in zip(iter1, iter2): + assert_almost_equal(elem1, elem2, **eql_kwargs) + msg = ( + f"Expected object {repr(type(elem1))} and object {repr(type(elem2))} to be " + "different objects, but they were the same object." + ) + assert elem1 is not elem2, msg + + +def getCols(k): + return string.ascii_uppercase[:k] + + +# make index +def makeStringIndex(k=10, name=None): + return Index(rands_array(nchars=10, size=k), name=name) + + +def makeUnicodeIndex(k=10, name=None): + return Index(randu_array(nchars=10, size=k), name=name) + + +def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): + """ make a length k index or n categories """ + x = rands_array(nchars=4, size=n) + return CategoricalIndex( + Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs + ) + + +def makeIntervalIndex(k=10, name=None, **kwargs): + """ make a length k IntervalIndex """ + x = np.linspace(0, 100, num=(k + 1)) + return IntervalIndex.from_breaks(x, name=name, **kwargs) + + +def makeBoolIndex(k=10, name=None): + if k == 1: + return Index([True], name=name) + elif k == 2: + return Index([False, True], name=name) + return Index([False, True] + [False] * (k - 2), name=name) + + +def makeIntIndex(k=10, name=None): + return Index(list(range(k)), name=name) + + +def makeUIntIndex(k=10, name=None): + return Index([2 ** 63 + i for i in range(k)], name=name) + + +def makeRangeIndex(k=10, name=None, **kwargs): + return RangeIndex(0, k, 1, name=name, **kwargs) + + +def makeFloatIndex(k=10, name=None): + values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) + return Index(values * (10 ** np.random.randint(0, 9)), name=name) + + +def makeDateIndex(k=10, freq="B", name=None, **kwargs): + dt = datetime(2000, 1, 1) + dr = bdate_range(dt, periods=k, freq=freq, name=name) + return DatetimeIndex(dr, name=name, **kwargs) + + +def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): + return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs) + + +def makePeriodIndex(k=10, name=None, **kwargs): + dt = datetime(2000, 1, 1) + dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) + return dr + + +def makeMultiIndex(k=10, names=None, **kwargs): + return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs) + + +_names = [ + "Alice", + "Bob", + "Charlie", + "Dan", + "Edith", + "Frank", + "George", + "Hannah", + "Ingrid", + "Jerry", + "Kevin", + "Laura", + "Michael", + "Norbert", + "Oliver", + "Patricia", + "Quinn", + "Ray", + 
"Sarah", + "Tim", + "Ursula", + "Victor", + "Wendy", + "Xavier", + "Yvonne", + "Zelda", +] + + +def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None): + """ + Make a DataFrame with a DatetimeIndex + + Parameters + ---------- + start : str or Timestamp, default "2000-01-01" + The start of the index. Passed to date_range with `freq`. + end : str or Timestamp, default "2000-12-31" + The end of the index. Passed to date_range with `freq`. + freq : str or Freq + The frequency to use for the DatetimeIndex + seed : int, optional + The random state seed. + + * name : object dtype with string names + * id : int dtype with + * x, y : float dtype + + Examples + -------- + >>> _make_timeseries() + id name x y + timestamp + 2000-01-01 982 Frank 0.031261 0.986727 + 2000-01-02 1025 Edith -0.086358 -0.032920 + 2000-01-03 982 Edith 0.473177 0.298654 + 2000-01-04 1009 Sarah 0.534344 -0.750377 + 2000-01-05 963 Zelda -0.271573 0.054424 + ... ... ... ... ... + 2000-12-27 980 Ingrid -0.132333 -0.422195 + 2000-12-28 972 Frank -0.376007 -0.298687 + 2000-12-29 1009 Ursula -0.865047 -0.503133 + 2000-12-30 1000 Hannah -0.063757 -0.507336 + 2000-12-31 972 Tim -0.869120 0.531685 + """ + index = pd.date_range(start=start, end=end, freq=freq, name="timestamp") + n = len(index) + state = np.random.RandomState(seed) + columns = { + "name": state.choice(_names, size=n), + "id": state.poisson(1000, size=n), + "x": state.rand(n) * 2 - 1, + "y": state.rand(n) * 2 - 1, + } + df = pd.DataFrame(columns, index=index, columns=sorted(columns)) + if df.index[-1] == end: + df = df.iloc[:-1] + return df + + +def all_index_generator(k=10): + """Generator which can be iterated over to get instances of all the various + index classes. + + Parameters + ---------- + k: length of each of the index instances + """ + all_make_index_funcs = [ + makeIntIndex, + makeFloatIndex, + makeStringIndex, + makeUnicodeIndex, + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeBoolIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + ] + for make_index_func in all_make_index_funcs: + yield make_index_func(k=k) + + +def index_subclass_makers_generator(): + make_index_funcs = [ + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + makeMultiIndex, + ] + for make_index_func in make_index_funcs: + yield make_index_func + + +def all_timeseries_index_generator(k=10): + """Generator which can be iterated over to get instances of all the classes + which represent time-series. 
+ + Parameters + ---------- + k: length of each of the index instances + """ + make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex] + for make_index_func in make_index_funcs: + yield make_index_func(k=k) + + +# make series +def makeFloatSeries(name=None): + index = makeStringIndex(N) + return Series(randn(N), index=index, name=name) + + +def makeStringSeries(name=None): + index = makeStringIndex(N) + return Series(randn(N), index=index, name=name) + + +def makeObjectSeries(name=None): + data = makeStringIndex(N) + data = Index(data, dtype=object) + index = makeStringIndex(N) + return Series(data, index=index, name=name) + + +def getSeriesData(): + index = makeStringIndex(N) + return {c: Series(randn(N), index=index) for c in getCols(K)} + + +def makeTimeSeries(nper=None, freq="B", name=None): + if nper is None: + nper = N + return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name) + + +def makePeriodSeries(nper=None, name=None): + if nper is None: + nper = N + return Series(randn(nper), index=makePeriodIndex(nper), name=name) + + +def getTimeSeriesData(nper=None, freq="B"): + return {c: makeTimeSeries(nper, freq) for c in getCols(K)} + + +def getPeriodData(nper=None): + return {c: makePeriodSeries(nper) for c in getCols(K)} + + +# make frame +def makeTimeDataFrame(nper=None, freq="B"): + data = getTimeSeriesData(nper, freq) + return DataFrame(data) + + +def makeDataFrame(): + data = getSeriesData() + return DataFrame(data) + + +def getMixedTypeDict(): + index = Index(["a", "b", "c", "d", "e"]) + + data = { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } + + return index, data + + +def makeMixedDataFrame(): + return DataFrame(getMixedTypeDict()[1]) + + +def makePeriodFrame(nper=None): + data = getPeriodData(nper) + return DataFrame(data) + + +def makeCustomIndex( + nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None +): + """Create an index/multindex with given dimensions, levels, names, etc' + + nentries - number of entries in index + nlevels - number of levels (> 1 produces multindex) + prefix - a string prefix for labels + names - (Optional), bool or list of strings. if True will use default + names, if false will use no names, if a list is given, the name of + each level in the index will be taken from the list. + ndupe_l - (Optional), list of ints, the number of rows for which the + label will repeated at the corresponding level, you can specify just + the first few, the rest will use the default ndupe_l of 1. + len(ndupe_l) <= nlevels. + idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td". + If idx_type is not None, `idx_nlevels` must be 1. + "i"/"f" creates an integer/float index, + "s"/"u" creates a string/unicode index + "dt" create a datetime index. + "td" create a datetime index. + + if unspecified, string labels will be generated. 
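A small, hedged example of what the generated labels look like for a flat index, assuming the helper is reachable from pandas._testing:

>>> from pandas import _testing as tm
>>> list(tm.makeCustomIndex(nentries=4, nlevels=1, prefix="C"))
['C_l0_g0', 'C_l0_g1', 'C_l0_g2', 'C_l0_g3']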
+ """ + + if ndupe_l is None: + ndupe_l = [1] * nlevels + assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels + assert names is None or names is False or names is True or len(names) is nlevels + assert idx_type is None or ( + idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 + ) + + if names is True: + # build default names + names = [prefix + str(i) for i in range(nlevels)] + if names is False: + # pass None to index constructor for no name + names = None + + # make singleton case uniform + if isinstance(names, str) and nlevels == 1: + names = [names] + + # specific 1D index type requested? + idx_func = dict( + i=makeIntIndex, + f=makeFloatIndex, + s=makeStringIndex, + u=makeUnicodeIndex, + dt=makeDateIndex, + td=makeTimedeltaIndex, + p=makePeriodIndex, + ).get(idx_type) + if idx_func: + idx = idx_func(nentries) + # but we need to fill in the name + if names: + idx.name = names[0] + return idx + elif idx_type is not None: + raise ValueError( + f"{repr(idx_type)} is not a legal value for `idx_type`, " + "use 'i'/'f'/'s'/'u'/'dt'/'p'/'td'." + ) + + if len(ndupe_l) < nlevels: + ndupe_l.extend([1] * (nlevels - len(ndupe_l))) + assert len(ndupe_l) == nlevels + + assert all(x > 0 for x in ndupe_l) + + tuples = [] + for i in range(nlevels): + + def keyfunc(x): + import re + + numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") + return [int(num) for num in numeric_tuple] + + # build a list of lists to create the index from + div_factor = nentries // ndupe_l[i] + 1 + cnt = Counter() + for j in range(div_factor): + label = f"{prefix}_l{i}_g{j}" + cnt[label] = ndupe_l[i] + # cute Counter trick + result = sorted(cnt.elements(), key=keyfunc)[:nentries] + tuples.append(result) + + tuples = list(zip(*tuples)) + + # convert tuples to index + if nentries == 1: + # we have a single level of tuples, i.e. a regular Index + index = Index(tuples[0], name=names[0]) + elif nlevels == 1: + name = None if names is None else names[0] + index = Index((x[0] for x in tuples), name=name) + else: + index = MultiIndex.from_tuples(tuples, names=names) + return index + + +def makeCustomDataframe( + nrows, + ncols, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): + """ + nrows, ncols - number of data rows/cols + c_idx_names, idx_names - False/True/list of strings, yields No names , + default names or uses the provided names for the levels of the + corresponding index. You can provide a single string when + c_idx_nlevels ==1. + c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex + r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex + data_gen_f - a function f(row,col) which return the data value + at that position, the default generator used yields values of the form + "RxCy" based on position. + c_ndupe_l, r_ndupe_l - list of integers, determines the number + of duplicates for each label at a given level of the corresponding + index. The default `None` value produces a multiplicity of 1 across + all levels, i.e. a unique index. Will accept a partial list of length + N < idx_nlevels, for just the first N levels. If ndupe doesn't divide + nrows/ncol, the last label might have lower multiplicity. + dtype - passed to the DataFrame constructor as is, in case you wish to + have more control in conjunction with a custom `data_gen_f` + r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td". 
+ If idx_type is not None, `idx_nlevels` must be 1. + "i"/"f" creates an integer/float index, + "s"/"u" creates a string/unicode index + "dt" create a datetime index. + "td" create a timedelta index. + + if unspecified, string labels will be generated. + + Examples: + + # 5 row, 3 columns, default names on both, single index on both axis + >> makeCustomDataframe(5,3) + + # make the data a random int between 1 and 100 + >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100)) + + # 2-level multiindex on rows with each label duplicated + # twice on first level, default names on both axis, single + # index on both axis + >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2]) + + # DatetimeIndex on row, index with unicode labels on columns + # no names on either axis + >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False, + r_idx_type="dt",c_idx_type="u") + + # 4-level multindex on rows with names provided, 2-level multindex + # on columns with default labels and default names. + >> a=makeCustomDataframe(5,3,r_idx_nlevels=4, + r_idx_names=["FEE","FI","FO","FAM"], + c_idx_nlevels=2) + + >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + """ + + assert c_idx_nlevels > 0 + assert r_idx_nlevels > 0 + assert r_idx_type is None or ( + r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1 + ) + assert c_idx_type is None or ( + c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1 + ) + + columns = makeCustomIndex( + ncols, + nlevels=c_idx_nlevels, + prefix="C", + names=c_idx_names, + ndupe_l=c_ndupe_l, + idx_type=c_idx_type, + ) + index = makeCustomIndex( + nrows, + nlevels=r_idx_nlevels, + prefix="R", + names=r_idx_names, + ndupe_l=r_ndupe_l, + idx_type=r_idx_type, + ) + + # by default, generate data based on location + if data_gen_f is None: + data_gen_f = lambda r, c: f"R{r}C{c}" + + data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)] + + return DataFrame(data, index, columns, dtype=dtype) + + +def _create_missing_idx(nrows, ncols, density, random_state=None): + if random_state is None: + random_state = np.random + else: + random_state = np.random.RandomState(random_state) + + # below is cribbed from scipy.sparse + size = int(np.round((1 - density) * nrows * ncols)) + # generate a few more to ensure unique values + min_rows = 5 + fac = 1.02 + extra_size = min(size + min_rows, fac * size) + + def _gen_unique_rand(rng, _extra_size): + ind = rng.rand(int(_extra_size)) + return np.unique(np.floor(ind * nrows * ncols))[:size] + + ind = _gen_unique_rand(random_state, extra_size) + while ind.size < size: + extra_size *= 1.05 + ind = _gen_unique_rand(random_state, extra_size) + + j = np.floor(ind * 1.0 / nrows).astype(int) + i = (ind - j * nrows).astype(int) + return i.tolist(), j.tolist() + + +def makeMissingCustomDataframe( + nrows, + ncols, + density=0.9, + random_state=None, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): + """ + Parameters + ---------- + Density : float, optional + Float in (0, 1) that gives the percentage of non-missing numbers in + the DataFrame. + random_state : {np.random.RandomState, int}, optional + Random number generator or random seed. + + See makeCustomDataframe for descriptions of the rest of the parameters. 
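For orientation, the default data_gen_f fills each cell with its own coordinates, which makes the generated frames easy to eyeball in tests; a brief sketch, assuming the helper is reachable from pandas._testing:

>>> from pandas import _testing as tm
>>> df = tm.makeCustomDataframe(3, 2)
>>> df.iloc[0, 0], df.iloc[2, 1]
('R0C0', 'R2C1')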
+ """ + df = makeCustomDataframe( + nrows, + ncols, + c_idx_names=c_idx_names, + r_idx_names=r_idx_names, + c_idx_nlevels=c_idx_nlevels, + r_idx_nlevels=r_idx_nlevels, + data_gen_f=data_gen_f, + c_ndupe_l=c_ndupe_l, + r_ndupe_l=r_ndupe_l, + dtype=dtype, + c_idx_type=c_idx_type, + r_idx_type=r_idx_type, + ) + + i, j = _create_missing_idx(nrows, ncols, density, random_state) + df.values[i, j] = np.nan + return df + + +def makeMissingDataframe(density=0.9, random_state=None): + df = makeDataFrame() + i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) + df.values[i, j] = np.nan + return df + + +def optional_args(decorator): + """allows a decorator to take optional positional and keyword arguments. + Assumes that taking a single, callable, positional argument means that + it is decorating a function, i.e. something like this:: + + @my_decorator + def function(): pass + + Calls decorator with decorator(f, *args, **kwargs)""" + + @wraps(decorator) + def wrapper(*args, **kwargs): + def dec(f): + return decorator(f, *args, **kwargs) + + is_decorating = not kwargs and len(args) == 1 and callable(args[0]) + if is_decorating: + f = args[0] + args = [] + return dec(f) + else: + return dec + + return wrapper + + +# skip tests on exceptions with this message +_network_error_messages = ( + # 'urlopen error timed out', + # 'timeout: timed out', + # 'socket.timeout: timed out', + "timed out", + "Server Hangup", + "HTTP Error 503: Service Unavailable", + "502: Proxy Error", + "HTTP Error 502: internal error", + "HTTP Error 502", + "HTTP Error 503", + "HTTP Error 403", + "HTTP Error 400", + "Temporary failure in name resolution", + "Name or service not known", + "Connection refused", + "certificate verify", +) + +# or this e.errno/e.reason.errno +_network_errno_vals = ( + 101, # Network is unreachable + 111, # Connection refused + 110, # Connection timed out + 104, # Connection reset Error + 54, # Connection reset by peer + 60, # urllib.error.URLError: [Errno 60] Connection timed out +) + +# Both of the above shouldn't mask real issues such as 404's +# or refused connections (changed DNS). +# But some tests (test_data yahoo) contact incredibly flakey +# servers. + +# and conditionally raise on exception types in _get_default_network_errors + + +def _get_default_network_errors(): + # Lazy import for http.client because it imports many things from the stdlib + import http.client + + return (IOError, http.client.HTTPException, TimeoutError) + + +def can_connect(url, error_classes=None): + """Try to connect to the given url. True if succeeds, False if IOError + raised + + Parameters + ---------- + url : basestring + The URL to try to connect to + + Returns + ------- + connectable : bool + Return True if no IOError (unable to connect) or URLError (bad url) was + raised + """ + + if error_classes is None: + error_classes = _get_default_network_errors() + + try: + with urlopen(url): + pass + except error_classes: + return False + else: + return True + + +@optional_args +def network( + t, + url="http://www.google.com", + raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, + check_before_test=False, + error_classes=None, + skip_errnos=_network_errno_vals, + _skip_on_messages=_network_error_messages, +): + """ + Label a test as requiring network connection and, if an error is + encountered, only raise if it does not find a network connection. 
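The optional_args helper above lets a decorator be applied either bare or with keyword arguments; a compact sketch using a hypothetical tag decorator (tag, label, f and g are illustrative names only):

>>> from pandas._testing import optional_args
>>> @optional_args
... def tag(func, label="x"):
...     func.label = label
...     return func
>>> @tag
... def f():
...     pass
>>> f.label
'x'
>>> @tag(label="y")
... def g():
...     pass
>>> g.label
'y'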
+ + In comparison to ``network``, this assumes an added contract to your test: + you must assert that, under normal conditions, your test will ONLY fail if + it does not have network connectivity. + + You can call this in 3 ways: as a standard decorator, with keyword + arguments, or with a positional argument that is the url to check. + + Parameters + ---------- + t : callable + The test requiring network connectivity. + url : path + The url to test via ``pandas.io.common.urlopen`` to check + for connectivity. Defaults to 'http://www.google.com'. + raise_on_error : bool + If True, never catches errors. + check_before_test : bool + If True, checks connectivity before running the test case. + error_classes : tuple or Exception + error classes to ignore. If not in ``error_classes``, raises the error. + defaults to IOError. Be careful about changing the error classes here. + skip_errnos : iterable of int + Any exception that has .errno or .reason.erno set to one + of these values will be skipped with an appropriate + message. + _skip_on_messages: iterable of string + any exception e for which one of the strings is + a substring of str(e) will be skipped with an appropriate + message. Intended to suppress errors where an errno isn't available. + + Notes + ----- + * ``raise_on_error`` supercedes ``check_before_test`` + + Returns + ------- + t : callable + The decorated test ``t``, with checks for connectivity errors. + + Example + ------- + + Tests decorated with @network will fail if it's possible to make a network + connection to another URL (defaults to google.com):: + + >>> from pandas._testing import network + >>> from pandas.io.common import urlopen + >>> @network + ... def test_network(): + ... with urlopen("rabbit://bonanza.com"): + ... pass + Traceback + ... + URLError: + + You can specify alternative URLs:: + + >>> @network("http://www.yahoo.com") + ... def test_something_with_yahoo(): + ... raise IOError("Failure Message") + >>> test_something_with_yahoo() + Traceback (most recent call last): + ... + IOError: Failure Message + + If you set check_before_test, it will check the url first and not run the + test on failure:: + + >>> @network("failing://url.blaher", check_before_test=True) + ... def test_something(): + ... print("I ran!") + ... raise ValueError("Failure") + >>> test_something() + Traceback (most recent call last): + ... + + Errors not related to networking will always be raised. 
+ """ + from pytest import skip + + if error_classes is None: + error_classes = _get_default_network_errors() + + t.network = True + + @wraps(t) + def wrapper(*args, **kwargs): + if check_before_test and not raise_on_error: + if not can_connect(url, error_classes): + skip() + try: + return t(*args, **kwargs) + except Exception as err: + errno = getattr(err, "errno", None) + if not errno and hasattr(errno, "reason"): + errno = getattr(err.reason, "errno", None) + + if errno in skip_errnos: + skip(f"Skipping test due to known errno and error {err}") + + e_str = str(err) + + if any(m.lower() in e_str.lower() for m in _skip_on_messages): + skip( + f"Skipping test because exception message is known and error {err}" + ) + + if not isinstance(err, error_classes): + raise + + if raise_on_error or can_connect(url, error_classes): + raise + else: + skip(f"Skipping test due to lack of connectivity and error {err}") + + return wrapper + + +with_connectivity_check = network + + +@contextmanager +def assert_produces_warning( + expected_warning=Warning, + filter_level="always", + clear=None, + check_stacklevel=True, + raise_on_extra_warnings=True, +): + """ + Context manager for running code expected to either raise a specific + warning, or not raise any warnings. Verifies that the code raises the + expected warning, and that it does not raise any other unexpected + warnings. It is basically a wrapper around ``warnings.catch_warnings``. + + Parameters + ---------- + expected_warning : {Warning, False, None}, default Warning + The type of Exception raised. ``exception.Warning`` is the base + class for all warnings. To check that no warning is returned, + specify ``False`` or ``None``. + filter_level : str or None, default "always" + Specifies whether warnings are ignored, displayed, or turned + into errors. + Valid values are: + + * "error" - turns matching warnings into exceptions + * "ignore" - discard the warning + * "always" - always emit a warning + * "default" - print the warning the first time it is generated + from each location + * "module" - print the warning the first time it is generated + from each module + * "once" - print the warning the first time it is generated + + clear : str, default None + If not ``None`` then remove any previously raised warnings from + the ``__warningsregistry__`` to ensure that no warning messages are + suppressed by this context manager. If ``None`` is specified, + the ``__warningsregistry__`` keeps track of which warnings have been + shown, and does not show them again. + check_stacklevel : bool, default True + If True, displays the line that called the function containing + the warning to show were the function is called. Otherwise, the + line that implements the function is displayed. + raise_on_extra_warnings : bool, default True + Whether extra warnings not of the type `expected_warning` should + cause the test to fail. + + Examples + -------- + >>> import warnings + >>> with assert_produces_warning(): + ... warnings.warn(UserWarning()) + ... + >>> with assert_produces_warning(False): + ... warnings.warn(RuntimeWarning()) + ... + Traceback (most recent call last): + ... + AssertionError: Caused unexpected warning(s): ['RuntimeWarning']. + >>> with assert_produces_warning(UserWarning): + ... warnings.warn(RuntimeWarning()) + Traceback (most recent call last): + ... + AssertionError: Did not see expected warning of class 'UserWarning'. + + ..warn:: This is *not* thread-safe. 
+ """ + __tracebackhide__ = True + + with warnings.catch_warnings(record=True) as w: + + if clear is not None: + # make sure that we are clearing these warnings + # if they have happened before + # to guarantee that we will catch them + if not is_list_like(clear): + clear = [clear] + for m in clear: + try: + m.__warningregistry__.clear() + except AttributeError: + # module may not have __warningregistry__ + pass + + saw_warning = False + warnings.simplefilter(filter_level) + yield w + extra_warnings = [] + + for actual_warning in w: + if expected_warning and issubclass( + actual_warning.category, expected_warning + ): + saw_warning = True + + if check_stacklevel and issubclass( + actual_warning.category, (FutureWarning, DeprecationWarning) + ): + from inspect import getframeinfo, stack + + caller = getframeinfo(stack()[2][0]) + msg = ( + "Warning not set with correct stacklevel. " + f"File where warning is raised: {actual_warning.filename} != " + f"{caller.filename}. Warning message: {actual_warning.message}" + ) + assert actual_warning.filename == caller.filename, msg + else: + extra_warnings.append( + ( + actual_warning.category.__name__, + actual_warning.message, + actual_warning.filename, + actual_warning.lineno, + ) + ) + if expected_warning: + msg = ( + f"Did not see expected warning of class " + f"{repr(expected_warning.__name__)}" + ) + assert saw_warning, msg + if raise_on_extra_warnings and extra_warnings: + raise AssertionError( + f"Caused unexpected warning(s): {repr(extra_warnings)}" + ) + + +class RNGContext: + """ + Context manager to set the numpy random number generator speed. Returns + to the original value upon exiting the context manager. + + Parameters + ---------- + seed : int + Seed for numpy.random.seed + + Examples + -------- + + with RNGContext(42): + np.random.randn() + """ + + def __init__(self, seed): + self.seed = seed + + def __enter__(self): + + self.start_state = np.random.get_state() + np.random.seed(self.seed) + + def __exit__(self, exc_type, exc_value, traceback): + + np.random.set_state(self.start_state) + + +@contextmanager +def with_csv_dialect(name, **kwargs): + """ + Context manager to temporarily register a CSV dialect for parsing CSV. + + Parameters + ---------- + name : str + The name of the dialect. + kwargs : mapping + The parameters for the dialect. + + Raises + ------ + ValueError : the name of the dialect conflicts with a builtin one. + + See Also + -------- + csv : Python's CSV library. + """ + import csv + + _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"} + + if name in _BUILTIN_DIALECTS: + raise ValueError("Cannot override builtin dialect.") + + csv.register_dialect(name, **kwargs) + yield + csv.unregister_dialect(name) + + +@contextmanager +def use_numexpr(use, min_elements=None): + from pandas.core.computation import expressions as expr + + if min_elements is None: + min_elements = expr._MIN_ELEMENTS + + olduse = expr._USE_NUMEXPR + oldmin = expr._MIN_ELEMENTS + expr.set_use_numexpr(use) + expr._MIN_ELEMENTS = min_elements + yield + expr._MIN_ELEMENTS = oldmin + expr.set_use_numexpr(olduse) + + +def test_parallel(num_threads=2, kwargs_list=None): + """Decorator to run the same function multiple times in parallel. + + Parameters + ---------- + num_threads : int, optional + The number of times the function is run in parallel. + kwargs_list : list of dicts, optional + The list of kwargs to update original + function kwargs on different threads. + Notes + ----- + This decorator does not pass the return value of the decorated function. 
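RNGContext above saves and restores the global NumPy random state, so a seeded block is repeatable without leaking the seed into later code; a minimal check:

>>> import numpy as np
>>> from pandas._testing import RNGContext
>>> with RNGContext(42):
...     a = np.random.randn(3)
>>> with RNGContext(42):
...     b = np.random.randn(3)
>>> bool((a == b).all())
True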
+ + Original from scikit-image: + + https://github.com/scikit-image/scikit-image/pull/1519 + + """ + + assert num_threads > 0 + has_kwargs_list = kwargs_list is not None + if has_kwargs_list: + assert len(kwargs_list) == num_threads + import threading + + def wrapper(func): + @wraps(func) + def inner(*args, **kwargs): + if has_kwargs_list: + update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) + else: + update_kwargs = lambda i: kwargs + threads = [] + for i in range(num_threads): + updated_kwargs = update_kwargs(i) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return inner + + return wrapper + + +class SubclassedSeries(Series): + _metadata = ["testattr", "name"] + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + +class SubclassedDataFrame(DataFrame): + _metadata = ["testattr"] + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + + +class SubclassedCategorical(Categorical): + @property + def _constructor(self): + return SubclassedCategorical + + +@contextmanager +def set_timezone(tz: str): + """ + Context manager for temporarily setting a timezone. + + Parameters + ---------- + tz : str + A string representing a valid timezone. + + Examples + -------- + + >>> from datetime import datetime + >>> from dateutil.tz import tzlocal + >>> tzlocal().tzname(datetime.now()) + 'IST' + + >>> with set_timezone('US/Eastern'): + ... tzlocal().tzname(datetime.now()) + ... + 'EDT' + """ + + import os + import time + + def setTZ(tz): + if tz is None: + try: + del os.environ["TZ"] + except KeyError: + pass + else: + os.environ["TZ"] = tz + time.tzset() + + orig_tz = os.environ.get("TZ") + setTZ(tz) + try: + yield + finally: + setTZ(orig_tz) + + +def _make_skipna_wrapper(alternative, skipna_alternative=None): + """ + Create a function for calling on an array. + + Parameters + ---------- + alternative : function + The function to be called on the array with no NaNs. + Only used when 'skipna_alternative' is None. + skipna_alternative : function + The function to be called on the original array + + Returns + ------- + function + """ + if skipna_alternative: + + def skipna_wrapper(x): + return skipna_alternative(x.values) + + else: + + def skipna_wrapper(x): + nona = x.dropna() + if len(nona) == 0: + return np.nan + return alternative(nona) + + return skipna_wrapper + + +def convert_rows_list_to_csv_str(rows_list: List[str]): + """ + Convert list of CSV rows to single CSV-formatted string for current OS. + + This method is used for creating expected value of to_csv() method. + + Parameters + ---------- + rows_list : List[str] + Each element represents the row of csv. + + Returns + ------- + str + Expected output of to_csv() in current OS. 
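Concretely, this helper joins the rows with os.linesep and appends one trailing separator, so on a POSIX system (where os.linesep is '\n') one would expect:

>>> from pandas._testing import convert_rows_list_to_csv_str
>>> convert_rows_list_to_csv_str(["a,b", "1,2"])
'a,b\n1,2\n'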
+ """ + sep = os.linesep + expected = sep.join(rows_list) + sep + return expected diff --git a/pandas/_typing.py b/pandas/_typing.py index 445eff9e19e47..171b76b4d2c4b 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -2,10 +2,14 @@ from typing import ( IO, TYPE_CHECKING, + Any, AnyStr, + Callable, + Collection, Dict, - Iterable, + Hashable, List, + Mapping, Optional, TypeVar, Union, @@ -21,24 +25,49 @@ from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.series import Series # noqa: F401 from pandas.core.generic import NDFrame # noqa: F401 + from pandas import Interval # noqa: F401 + from pandas.core.series import Series # noqa: F401 + from pandas.core.frame import DataFrame # noqa: F401 +# array-like AnyArrayLike = TypeVar("AnyArrayLike", "ExtensionArray", "Index", "Series", np.ndarray) ArrayLike = TypeVar("ArrayLike", "ExtensionArray", np.ndarray) + +# scalars + +PythonScalar = Union[str, int, float, bool] DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", "Period", "Timestamp", "Timedelta") +PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] +Scalar = Union[PythonScalar, PandasScalar] + +# other + Dtype = Union[str, np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] +# FrameOrSeriesUnion means either a DataFrame or a Series. E.g. +# `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series +# is passed in, either a Series or DataFrame is returned, and if a DataFrame is passed +# in, either a DataFrame or a Series is returned. +FrameOrSeriesUnion = Union["DataFrame", "Series"] + +# FrameOrSeries is stricter and ensures that the same subclass of NDFrame always is +# used. E.g. `def func(a: FrameOrSeries) -> FrameOrSeries: ...` means that if a +# Series is passed into a function, a Series is always returned and if a DataFrame is +# passed in, a DataFrame is always returned. 
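+# For example, a hypothetical ``def fillna_self(obj: FrameOrSeries) -> FrameOrSeries``
+# is checked to return a Series for a Series and a DataFrame for a DataFrame,
+# whereas ``def maybe_reduce(obj: FrameOrSeriesUnion) -> FrameOrSeriesUnion``
+# may return either for either input.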
FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") -Scalar = Union[str, int, float, bool] + Axis = Union[str, int] +Label = Optional[Hashable] +Level = Union[Label, int] Ordered = Optional[bool] -JSONSerializable = Union[Scalar, List, Dict] +JSONSerializable = Union[PythonScalar, List, Dict] +Axes = Collection -# use Collection after we drop support for py35 -Axes = Iterable +# For functions like rename that convert one label to another +Renamer = Union[Mapping[Label, Any], Callable[[Label], Label]] # to maintain type information across generic functions and parametrization -_T = TypeVar("_T") +T = TypeVar("T") diff --git a/pandas/_version.py b/pandas/_version.py index 0cdedf3da3ea7..66e756a4744c8 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -47,7 +47,7 @@ class NotThisMethod(Exception): pass -HANDLERS = {} # type: Dict[str, Dict[str, Callable]] +HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator @@ -79,17 +79,17 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): if e.errno == errno.ENOENT: continue if verbose: - print("unable to run {dispcmd}".format(dispcmd=dispcmd)) + print(f"unable to run {dispcmd}") print(e) return None else: if verbose: - print("unable to find command, tried %s" % (commands,)) + print(f"unable to find command, tried {commands}") return None stdout = p.communicate()[0].strip().decode() if p.returncode != 0: if verbose: - print("unable to run {dispcmd} (error)".format(dispcmd=dispcmd)) + print(f"unable to run {dispcmd} (error)") return None return stdout @@ -101,10 +101,8 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): if not dirname.startswith(parentdir_prefix): if verbose: print( - "guessing rootdir is '{root}', but '{dirname}' " - "doesn't start with prefix '{parentdir_prefix}'".format( - root=root, dirname=dirname, parentdir_prefix=parentdir_prefix - ) + f"guessing rootdir is '{root}', but '{dirname}' " + f"doesn't start with prefix '{parentdir_prefix}'" ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") return { @@ -163,15 +161,15 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r"\d", r)} if verbose: - print("discarding '{}', no digits".format(",".join(refs - tags))) + print(f"discarding '{','.join(refs - tags)}', no digits") if verbose: - print("likely tags: {}".format(",".join(sorted(tags)))) + print(f"likely tags: {','.join(sorted(tags))}") for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] if verbose: - print("picking {r}".format(r=r)) + print(f"picking {r}") return { "version": r, "full-revisionid": keywords["full"].strip(), @@ -198,7 +196,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if not os.path.exists(os.path.join(root, ".git")): if verbose: - print("no .git in {root}".format(root=root)) + print(f"no .git in {root}") raise NotThisMethod("no .git directory") GITS = ["git"] @@ -240,17 +238,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ( - "unable to parse git-describe output: " - "'{describe_out}'".format(describe_out=describe_out) - ) + pieces["error"] = f"unable to parse git-describe output: '{describe_out}'" return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): - fmt = "tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" - msg = fmt.format(full_tag=full_tag, tag_prefix=tag_prefix) + msg = f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" if verbose: print(msg) pieces["error"] = msg @@ -291,12 +285,12 @@ def render_pep440(pieces): rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) - rendered += "{:d}.g{}".format(pieces["distance"], pieces["short"]) + rendered += f"{pieces['distance']:d}.g{pieces['short']}" if pieces["dirty"]: rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.{:d}.g{}".format(pieces["distance"], pieces["short"]) + rendered = f"0+untagged.{pieces['distance']:d}.g{pieces['short']}" if pieces["dirty"]: rendered += ".dirty" return rendered @@ -311,10 +305,10 @@ def render_pep440_pre(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] + rendered += f".post.dev{pieces['distance']:d}" else: # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] + rendered = f"0.post.dev{pieces['distance']:d}" return rendered @@ -330,17 +324,17 @@ def render_pep440_post(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += ".post{:d}".format(pieces["distance"]) + rendered += f".post{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) - rendered += "g{}".format(pieces["short"]) + rendered += f"g{pieces['short']}" else: # exception #1 - rendered = "0.post%d" % pieces["distance"] + rendered = f"0.pos{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" - rendered += "+g{}".format(pieces["short"]) + rendered += f"+g{pieces['short']}" return rendered @@ -353,12 +347,12 @@ def render_pep440_old(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] + rendered += f".post{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" else: # exception #1 - rendered = "0.post%d" % pieces["distance"] + rendered = f"0.post{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" return rendered @@ -374,7 +368,7 @@ def render_git_describe(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += "-{:d}-g{}".format(pieces["distance"], pieces["short"]) + rendered += f"-{pieces['distance']:d}-g{pieces['short']}" else: # exception #1 rendered = pieces["short"] @@ -392,7 +386,7 @@ def render_git_describe_long(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] - rendered += "-{:d}-g{}".format(pieces["distance"], pieces["short"]) + rendered += f"-{pieces['distance']:d}-g{pieces['short']}" else: # exception #1 rendered = pieces["short"] @@ -426,7 +420,7 @@ def render(pieces, style): elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: - raise ValueError("unknown style '{style}'".format(style=style)) + raise ValueError(f"unknown style '{style}'") return { "version": rendered, diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index 58422811990c4..bebbb38b4aefa 100644 --- 
a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,2 +1,2 @@ """ public toolkit API """ -from . import extensions, types # noqa +from pandas.api import extensions, indexers, types # noqa diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 573d700dac43d..3019dd0e9b371 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -1,13 +1,27 @@ -"""Public API for extending pandas objects.""" -from pandas.core.dtypes.dtypes import ( # noqa: F401 - ExtensionDtype, - register_extension_dtype, -) +""" +Public API for extending pandas objects. +""" + +from pandas._libs.lib import no_default -from pandas.core.accessor import ( # noqa: F401 +from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype + +from pandas.core.accessor import ( register_dataframe_accessor, register_index_accessor, register_series_accessor, ) -from pandas.core.algorithms import take # noqa: F401 -from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401 +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin + +__all__ = [ + "no_default", + "ExtensionDtype", + "register_extension_dtype", + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", + "take", + "ExtensionArray", + "ExtensionScalarOpsMixin", +] diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py new file mode 100644 index 0000000000000..10654eb0888ee --- /dev/null +++ b/pandas/api/indexers/__init__.py @@ -0,0 +1,8 @@ +""" +Public API for Rolling Window Indexers. +""" + +from pandas.core.indexers import check_bool_array_indexer +from pandas.core.window.indexers import BaseIndexer + +__all__ = ["check_bool_array_indexer", "BaseIndexer"] diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index f32e1abe28cc1..3495b493707c2 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -1,12 +1,23 @@ -""" public toolkit API """ +""" +Public toolkit API. +""" -from pandas._libs.lib import infer_dtype # noqa: F401 +from pandas._libs.lib import infer_dtype from pandas.core.dtypes.api import * # noqa: F403, F401 -from pandas.core.dtypes.concat import union_categoricals # noqa: F401 -from pandas.core.dtypes.dtypes import ( # noqa: F401 +from pandas.core.dtypes.concat import union_categoricals +from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, ) + +__all__ = [ + "infer_dtype", + "union_categoricals", + "CategoricalDtype", + "DatetimeTZDtype", + "IntervalDtype", + "PeriodDtype", +] diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 9870b5bed076d..61832a8b6d621 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,6 +4,7 @@ See :ref:`extending.extension-types` for more. 
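This hunk widens the public pandas.arrays namespace to include the new nullable BooleanArray, the array type backing the "boolean" extension dtype, e.g.:

>>> import pandas as pd
>>> arr = pd.array([True, False, None], dtype="boolean")
>>> type(arr).__name__
'BooleanArray'
>>> str(arr.dtype)
'boolean'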
""" from pandas.core.arrays import ( + BooleanArray, Categorical, DatetimeArray, IntegerArray, @@ -16,6 +17,7 @@ ) __all__ = [ + "BooleanArray", "Categorical", "DatetimeArray", "IntegerArray", diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 81431db5b867c..60cfecd5804ac 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -12,8 +12,6 @@ import sys import warnings -PY35 = sys.version_info[:2] == (3, 5) -PY36 = sys.version_info >= (3, 6) PY37 = sys.version_info >= (3, 7) PY38 = sys.version_info >= (3, 8) PYPY = platform.python_implementation() == "PyPy" @@ -29,38 +27,82 @@ def set_function_name(f, name, cls): """ - Bind the name/qualname attributes of the function + Bind the name/qualname attributes of the function. """ f.__name__ = name - f.__qualname__ = "{klass}.{name}".format(klass=cls.__name__, name=name) + f.__qualname__ = f"{cls.__name__}.{name}" f.__module__ = cls.__module__ return f # https://github.com/pandas-dev/pandas/pull/9123 -def is_platform_little_endian(): - """ am I little endian """ +def is_platform_little_endian() -> bool: + """ + Checking if the running platform is little endian. + + Returns + ------- + bool + True if the running platform is little endian. + """ return sys.byteorder == "little" -def is_platform_windows(): +def is_platform_windows() -> bool: + """ + Checking if the running platform is windows. + + Returns + ------- + bool + True if the running platform is windows. + """ return sys.platform == "win32" or sys.platform == "cygwin" -def is_platform_linux(): +def is_platform_linux() -> bool: + """ + Checking if the running platform is linux. + + Returns + ------- + bool + True if the running platform is linux. + """ return sys.platform == "linux2" -def is_platform_mac(): +def is_platform_mac() -> bool: + """ + Checking if the running platform is mac. + + Returns + ------- + bool + True if the running platform is mac. + """ return sys.platform == "darwin" -def is_platform_32bit(): +def is_platform_32bit() -> bool: + """ + Checking if the running platform is 32-bit. + + Returns + ------- + bool + True if the running platform is 32-bit. + """ return struct.calcsize("P") * 8 < 64 def _import_lzma(): - """Attempts to import lzma, warning the user when lzma is not available. + """ + Importing the `lzma` module. + + Warns + ----- + When the `lzma` module is not available. """ try: import lzma @@ -76,13 +118,23 @@ def _import_lzma(): def _get_lzma_file(lzma): - """Returns the lzma method LZMAFile when the module was correctly imported. - Otherwise, raises a RuntimeError. + """ + Importing the `LZMAFile` class from the `lzma` module. + + Returns + ------- + class + The `LZMAFile` class from the `lzma` module. + + Raises + ------ + RuntimeError + If the `lzma` module was not imported correctly, or didn't exist. """ if lzma is None: raise RuntimeError( "lzma module not available. " - "A Python re-install with the proper " - "dependencies might be required to solve this issue." + "A Python re-install with the proper dependencies, " + "might be required to solve this issue." 
) return lzma.LZMAFile diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index cd4e1b7e8aa4d..7aeb0327139f1 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -8,35 +8,29 @@ VERSIONS = { "bs4": "4.6.0", "bottleneck": "1.2.1", - "fastparquet": "0.2.1", + "fastparquet": "0.3.2", "gcsfs": "0.2.2", "lxml.etree": "3.8.0", "matplotlib": "2.2.2", "numexpr": "2.6.2", "odfpy": "1.3.0", - "openpyxl": "2.4.8", + "openpyxl": "2.5.7", "pandas_gbq": "0.8.0", - "pyarrow": "0.9.0", + "pyarrow": "0.13.0", "pytables": "3.4.2", - "s3fs": "0.0.8", + "pytest": "5.0.1", + "s3fs": "0.3.0", "scipy": "0.19.0", "sqlalchemy": "1.1.4", "tables": "3.4.2", + "tabulate": "0.8.3", "xarray": "0.8.2", "xlrd": "1.1.0", "xlwt": "1.2.0", "xlsxwriter": "0.9.8", + "numba": "0.46.0", } -message = ( - "Missing optional dependency '{name}'. {extra} " - "Use pip or conda to install {name}." -) -version_message = ( - "Pandas requires version '{minimum_version}' or newer of '{name}' " - "(version '{actual_version}' currently installed)." -) - def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) @@ -45,7 +39,7 @@ def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__VERSION__", None) if version is None: - raise ImportError("Can't determine version for {}".format(module.__name__)) + raise ImportError(f"Can't determine version for {module.__name__}") return version @@ -86,11 +80,15 @@ def import_optional_dependency( is False, or when the package's version is too old and `on_version` is ``'warn'``. """ + msg = ( + f"Missing optional dependency '{name}'. {extra} " + f"Use pip or conda to install {name}." + ) try: module = importlib.import_module(name) except ImportError: if raise_on_missing: - raise ImportError(message.format(name=name, extra=extra)) from None + raise ImportError(msg) from None else: return None @@ -99,8 +97,9 @@ def import_optional_dependency( version = _get_version(module) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} - msg = version_message.format( - minimum_version=minimum_version, name=name, actual_version=version + msg = ( + f"Pandas requires version '{minimum_version}' or newer of '{name}' " + f"(version '{version}' currently installed)." ) if on_version == "warn": warnings.warn(msg, UserWarning) diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py index 84824207de2a9..588bd24ddf797 100644 --- a/pandas/compat/chainmap.py +++ b/pandas/compat/chainmap.py @@ -1,17 +1,33 @@ -from collections import ChainMap +from typing import ChainMap, MutableMapping, TypeVar, cast +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") -class DeepChainMap(ChainMap): - def __setitem__(self, key, value): + +class DeepChainMap(ChainMap[_KT, _VT]): + """Variant of ChainMap that allows direct updates to inner scopes. + + Only works when all passed mapping are mutable. + """ + + def __setitem__(self, key: _KT, value: _VT) -> None: for mapping in self.maps: - if key in mapping: - mapping[key] = value + mutable_mapping = cast(MutableMapping[_KT, _VT], mapping) + if key in mutable_mapping: + mutable_mapping[key] = value return - self.maps[0][key] = value + cast(MutableMapping[_KT, _VT], self.maps[0])[key] = value - def __delitem__(self, key): + def __delitem__(self, key: _KT) -> None: + """ + Raises + ------ + KeyError + If `key` doesn't exist. 
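A short sketch of the scope-aware update behavior described above, assuming DeepChainMap is imported from pandas.compat.chainmap:

>>> from pandas.compat.chainmap import DeepChainMap
>>> inner, outer = {"a": 1}, {"b": 2}
>>> scope = DeepChainMap(inner, outer)
>>> scope["b"] = 20   # updates the mapping that already holds "b"
>>> outer
{'b': 20}
>>> scope["c"] = 3    # unknown keys go to the first mapping
>>> inner
{'a': 1, 'c': 3}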
+ """ for mapping in self.maps: + mutable_mapping = cast(MutableMapping[_KT, _VT], mapping) if key in mapping: - del mapping[key] + del mutable_mapping[key] return raise KeyError(key) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 402ed62f2df65..27f1c32058941 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -18,11 +18,11 @@ if _nlv < "1.13.3": raise ImportError( - "this version of pandas is incompatible with " - "numpy < 1.13.3\n" - "your numpy version is {0}.\n" - "Please upgrade numpy to >= 1.13.3 to use " - "this pandas version".format(_np_version) + f"this version of pandas is incompatible with " + f"numpy < 1.13.3\n" + f"your numpy version is {_np_version}.\n" + f"Please upgrade numpy to >= 1.13.3 to use " + f"this pandas version" ) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index c2fe7d1dd12f4..7158f251ad805 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -58,9 +58,7 @@ def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=No fname, args, kwargs, max_fname_arg_count, self.defaults ) else: - raise ValueError( - "invalid validation method '{method}'".format(method=method) - ) + raise ValueError(f"invalid validation method '{method}'") ARGMINMAX_DEFAULTS = dict(out=None) @@ -108,7 +106,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): return skipna -ARGSORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]] +ARGSORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None @@ -124,7 +122,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): # two different signatures of argsort, this second validation # for when the `kind` param is supported -ARGSORT_DEFAULTS_KIND = OrderedDict() # type: OrderedDict[str, Optional[int]] +ARGSORT_DEFAULTS_KIND: "OrderedDict[str, Optional[int]]" = OrderedDict() ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None validate_argsort_kind = CompatValidator( @@ -171,14 +169,7 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -COMPRESS_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] -COMPRESS_DEFAULTS["axis"] = None -COMPRESS_DEFAULTS["out"] = None -validate_compress = CompatValidator( - COMPRESS_DEFAULTS, fname="compress", method="both", max_fname_arg_count=1 -) - -CUM_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] +CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None validate_cum_func = CompatValidator( @@ -204,7 +195,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): return skipna -ALLANY_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]] +ALLANY_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() ALLANY_DEFAULTS["dtype"] = None ALLANY_DEFAULTS["out"] = None ALLANY_DEFAULTS["keepdims"] = False @@ -226,28 +217,28 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 ) -RESHAPE_DEFAULTS = dict(order="C") # type: Dict[str, str] +RESHAPE_DEFAULTS: Dict[str, str] = dict(order="C") validate_reshape = CompatValidator( RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 ) -REPEAT_DEFAULTS = dict(axis=None) # type: Dict[str, Any] +REPEAT_DEFAULTS: Dict[str, Any] = dict(axis=None) 
validate_repeat = CompatValidator( REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 ) -ROUND_DEFAULTS = dict(out=None) # type: Dict[str, Any] +ROUND_DEFAULTS: Dict[str, Any] = dict(out=None) validate_round = CompatValidator( ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) -SORT_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Union[int, str]]] +SORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() SORT_DEFAULTS["axis"] = -1 SORT_DEFAULTS["kind"] = "quicksort" SORT_DEFAULTS["order"] = None validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") -STAT_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Any]] +STAT_FUNC_DEFAULTS: "OrderedDict[str, Optional[Any]]" = OrderedDict() STAT_FUNC_DEFAULTS["dtype"] = None STAT_FUNC_DEFAULTS["out"] = None @@ -275,13 +266,13 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 ) -STAT_DDOF_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]] +STAT_DDOF_FUNC_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() STAT_DDOF_FUNC_DEFAULTS["dtype"] = None STAT_DDOF_FUNC_DEFAULTS["out"] = None STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") -TAKE_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[str]] +TAKE_DEFAULTS: "OrderedDict[str, Optional[str]]" = OrderedDict() TAKE_DEFAULTS["out"] = None TAKE_DEFAULTS["mode"] = "raise" validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") @@ -312,9 +303,8 @@ def validate_take_with_convert(convert, args, kwargs): def validate_window_func(name, args, kwargs): numpy_args = ("axis", "dtype", "out") msg = ( - "numpy operations are not " - "valid with window objects. " - "Use .{func}() directly instead ".format(func=name) + f"numpy operations are not valid with window objects. " + f"Use .{name}() directly instead " ) if len(args) > 0: @@ -328,9 +318,8 @@ def validate_window_func(name, args, kwargs): def validate_rolling_func(name, args, kwargs): numpy_args = ("axis", "dtype", "out") msg = ( - "numpy operations are not " - "valid with window objects. " - "Use .rolling(...).{func}() instead ".format(func=name) + f"numpy operations are not valid with window objects. " + f"Use .rolling(...).{name}() instead " ) if len(args) > 0: @@ -344,9 +333,8 @@ def validate_rolling_func(name, args, kwargs): def validate_expanding_func(name, args, kwargs): numpy_args = ("axis", "dtype", "out") msg = ( - "numpy operations are not " - "valid with window objects. " - "Use .expanding(...).{func}() instead ".format(func=name) + f"numpy operations are not valid with window objects. " + f"Use .expanding(...).{name}() instead " ) if len(args) > 0: @@ -371,11 +359,9 @@ def validate_groupby_func(name, args, kwargs, allowed=None): if len(args) + len(kwargs) > 0: raise UnsupportedFunctionCall( - ( - "numpy operations are not valid " - "with groupby. Use .groupby(...)." - "{func}() instead".format(func=name) - ) + f"numpy operations are not valid with " + f"groupby. Use .groupby(...).{name}() " + f"instead" ) @@ -391,11 +377,9 @@ def validate_resampler_func(method, args, kwargs): if len(args) + len(kwargs) > 0: if method in RESAMPLER_NUMPY_OPS: raise UnsupportedFunctionCall( - ( - "numpy operations are not valid " - "with resample. Use .resample(...)." 
- "{func}() instead".format(func=method) - ) + f"numpy operations are not " + f"valid with resample. Use " + f".resample(...).{method}() instead" ) else: raise TypeError("too many arguments passed in") @@ -418,7 +402,4 @@ def validate_minmax_axis(axis): if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): - raise ValueError( - "`axis` must be fewer than the number of " - "dimensions ({ndim})".format(ndim=ndim) - ) + raise ValueError(f"`axis` must be fewer than the number of dimensions ({ndim})") diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 3a36713ccdbda..0a1a1376bfc8d 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -4,8 +4,7 @@ import copy import pickle as pkl -import sys -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import warnings from pandas import Index @@ -25,14 +24,14 @@ def load_reduce(self): try: stack[-1] = func(*args) return - except Exception as e: + except TypeError as err: # If we have a deprecated function, # try to replace and try again. msg = "_reconstruct: First argument must be a sub-type of ndarray" - if msg in str(e): + if msg in str(err): try: cls = args[0] stack[-1] = object.__new__(cls) @@ -40,22 +39,6 @@ def load_reduce(self): except TypeError: pass - # try to re-encode the arguments - if getattr(self, "encoding", None) is not None: - args = tuple( - arg.encode(self.encoding) if isinstance(arg, str) else arg - for arg in args - ) - try: - stack[-1] = func(*args) - return - except TypeError: - pass - - # unknown exception, re-raise - if getattr(self, "is_verbose", None): - print(sys.exc_info()) - print(func, args) raise @@ -81,7 +64,7 @@ def __new__(cls) -> "Series": # type: ignore stacklevel=6, ) - return Series() + return Series(dtype=object) class _LoadSparseFrame: @@ -106,21 +89,8 @@ def __new__(cls) -> "DataFrame": # type: ignore _class_locations_map = { ("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"), # 15477 - # - # TODO: When FrozenNDArray is removed, add - # the following lines for compat: - # - # ('pandas.core.base', 'FrozenNDArray'): - # ('numpy', 'ndarray'), - # ('pandas.core.indexes.frozen', 'FrozenNDArray'): - # ('numpy', 'ndarray'), - # - # Afterwards, remove the current entry - # for `pandas.core.base.FrozenNDArray`. - ("pandas.core.base", "FrozenNDArray"): ( - "pandas.core.indexes.frozen", - "FrozenNDArray", - ), + ("pandas.core.base", "FrozenNDArray"): ("numpy", "ndarray"), + ("pandas.core.indexes.frozen", "FrozenNDArray"): ("numpy", "ndarray"), ("pandas.core.base", "FrozenList"): ("pandas.core.indexes.frozen", "FrozenList"), # 10890 ("pandas.core.series", "TimeSeries"): ("pandas.core.series", "Series"), @@ -199,9 +169,9 @@ def __new__(cls) -> "DataFrame": # type: ignore # our Unpickler sub-class to override methods and some dispatcher -# functions for compat - +# functions for compat and uses a non-public class of the pickle module. 
+# error: Name 'pkl._Unpickler' is not defined class Unpickler(pkl._Unpickler): # type: ignore def find_class(self, module, name): # override superclass @@ -249,8 +219,9 @@ def load_newobj_ex(self): pass -def load(fh, encoding=None, is_verbose=False): - """load a pickle, with a provided encoding +def load(fh, encoding: Optional[str] = None, is_verbose: bool = False): + """ + Load a pickle, with a provided encoding, Parameters ---------- diff --git a/pandas/conftest.py b/pandas/conftest.py index b032e14d8f7e1..3eab2186ccb94 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,3 +1,4 @@ +from collections import abc from datetime import date, time, timedelta, timezone from decimal import Decimal import operator @@ -14,8 +15,8 @@ import pandas as pd from pandas import DataFrame +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm hypothesis.settings.register_profile( "ci", @@ -88,7 +89,7 @@ def spmatrix(request): return getattr(sparse, request.param + "_matrix") -@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: "axis {!r}".format(x)) +@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis {repr(x)}") def axis(request): """ Fixture for returning the axis numbers of a DataFrame. @@ -99,7 +100,7 @@ def axis(request): axis_frame = axis -@pytest.fixture(params=[0, "index"], ids=lambda x: "axis {!r}".format(x)) +@pytest.fixture(params=[0, "index"], ids=lambda x: f"axis {repr(x)}") def axis_series(request): """ Fixture for returning the axis numbers of a Series. @@ -163,7 +164,7 @@ def ordered_fixture(request): @pytest.fixture(params=_all_arithmetic_operators) def all_arithmetic_operators(request): """ - Fixture for dunder names for common arithmetic operations + Fixture for dunder names for common arithmetic operations. """ return request.param @@ -190,7 +191,9 @@ def all_arithmetic_functions(request): """ Fixture for operator and roperator arithmetic functions. - Note: This includes divmod and rdivmod, whereas all_arithmetic_operators + Notes + ----- + This includes divmod and rdivmod, whereas all_arithmetic_operators does not. """ return request.param @@ -213,7 +216,7 @@ def all_arithmetic_functions(request): @pytest.fixture(params=_all_numeric_reductions) def all_numeric_reductions(request): """ - Fixture for numeric reduction names + Fixture for numeric reduction names. """ return request.param @@ -224,7 +227,7 @@ def all_numeric_reductions(request): @pytest.fixture(params=_all_boolean_reductions) def all_boolean_reductions(request): """ - Fixture for boolean reduction names + Fixture for boolean reduction names. """ return request.param @@ -251,7 +254,7 @@ def _get_cython_table_params(ndframe, func_names_and_expected): Returns ------- - results : list + list List of three items (DataFrame, function, expected result) """ results = [] @@ -293,10 +296,24 @@ def compare_operators_no_eq_ne(request): return request.param +@pytest.fixture( + params=["__and__", "__rand__", "__or__", "__ror__", "__xor__", "__rxor__"] +) +def all_logical_operators(request): + """ + Fixture for dunder names for common logical operations + + * | + * & + * ^ + """ + return request.param + + @pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) def compression(request): """ - Fixture for trying common compression types in compression tests + Fixture for trying common compression types in compression tests. 
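A hypothetical test showing how the new ``all_logical_operators`` fixture above would be consumed; the test body itself is illustrative, not from the patch.

import pandas as pd

def test_logical_dunders_return_series(all_logical_operators):
    # The fixture parametrizes over "__and__", "__rand__", "__or__",
    # "__ror__", "__xor__" and "__rxor__".
    s = pd.Series([True, False, True])
    result = getattr(s, all_logical_operators)(True)
    assert isinstance(result, pd.Series)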
""" return request.param @@ -305,7 +322,7 @@ def compression(request): def compression_only(request): """ Fixture for trying common compression types in compression tests excluding - uncompressed case + uncompressed case. """ return request.param @@ -313,7 +330,7 @@ def compression_only(request): @pytest.fixture(params=[True, False]) def writable(request): """ - Fixture that an array is writable + Fixture that an array is writable. """ return request.param @@ -326,7 +343,7 @@ def datetime_tz_utc(): @pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]) def utc_fixture(request): """ - Fixture to provide variants of UTC timezone strings and tzinfo objects + Fixture to provide variants of UTC timezone strings and tzinfo objects. """ return request.param @@ -334,7 +351,7 @@ def utc_fixture(request): @pytest.fixture(params=["inner", "outer", "left", "right"]) def join_type(request): """ - Fixture for trying all types of join operations + Fixture for trying all types of join operations. """ return request.param @@ -356,7 +373,7 @@ def datapath(strict_data_files): Returns ------- - path : path including ``pandas/tests``. + path including ``pandas/tests``. Raises ------ @@ -369,11 +386,11 @@ def deco(*args): path = os.path.join(BASE_PATH, *args) if not os.path.exists(path): if strict_data_files: - msg = "Could not find file {} and --strict-data-files is set." - raise ValueError(msg.format(path)) + raise ValueError( + f"Could not find file {path} and --strict-data-files is set." + ) else: - msg = "Could not find {}." - pytest.skip(msg.format(path)) + pytest.skip(f"Could not find {path}.") return path return deco @@ -390,7 +407,7 @@ def iris(datapath): @pytest.fixture(params=["nlargest", "nsmallest"]) def nselect_method(request): """ - Fixture for trying all nselect methods + Fixture for trying all nselect methods. """ return request.param @@ -398,7 +415,7 @@ def nselect_method(request): @pytest.fixture(params=["left", "right", "both", "neither"]) def closed(request): """ - Fixture for trying all interval closed parameters + Fixture for trying all interval closed parameters. """ return request.param @@ -406,7 +423,7 @@ def closed(request): @pytest.fixture(params=["left", "right", "both", "neither"]) def other_closed(request): """ - Secondary closed fixture to allow parametrizing over all pairs of closed + Secondary closed fixture to allow parametrizing over all pairs of closed. """ return request.param @@ -414,7 +431,7 @@ def other_closed(request): @pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN")]) def nulls_fixture(request): """ - Fixture for each null type in pandas + Fixture for each null type in pandas. """ return request.param @@ -425,7 +442,7 @@ def nulls_fixture(request): @pytest.fixture(params=[None, np.nan, pd.NaT]) def unique_nulls_fixture(request): """ - Fixture for each null type in pandas, each null type exactly once + Fixture for each null type in pandas, each null type exactly once. 
""" return request.param @@ -575,7 +592,6 @@ def float_dtype(request): * 'float32' * 'float64' """ - return request.param @@ -588,7 +604,6 @@ def complex_dtype(request): * 'complex64' * 'complex128' """ - return request.param @@ -603,7 +618,6 @@ def sint_dtype(request): * 'int32' * 'int64' """ - return request.param @@ -617,7 +631,6 @@ def uint_dtype(request): * 'uint32' * 'uint64' """ - return request.param @@ -636,6 +649,23 @@ def any_int_dtype(request): * 'int64' * 'uint64' """ + return request.param + + +@pytest.fixture(params=ALL_EA_INT_DTYPES) +def any_nullable_int_dtype(request): + """ + Parameterized fixture for any nullable integer dtype. + + * 'UInt8' + * 'Int8' + * 'UInt16' + * 'Int16' + * 'UInt32' + * 'Int32' + * 'UInt64' + * 'Int64' + """ return request.param @@ -658,7 +688,6 @@ def any_real_dtype(request): * 'float32' * 'float64' """ - return request.param @@ -696,7 +725,6 @@ def any_numpy_dtype(request): * object * 'object' """ - return request.param @@ -854,3 +882,51 @@ def float_frame(): [30 rows x 4 columns] """ return DataFrame(tm.getSeriesData()) + + +@pytest.fixture(params=[pd.Index, pd.Series], ids=["index", "series"]) +def index_or_series(request): + """ + Fixture to parametrize over Index and Series, made necessary by a mypy + bug, giving an error: + + List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]" + + See GH#29725 + """ + return request.param + + +@pytest.fixture +def dict_subclass(): + """ + Fixture for a dictionary subclass. + """ + + class TestSubDict(dict): + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) + + return TestSubDict + + +@pytest.fixture +def non_mapping_dict_subclass(): + """ + Fixture for a non-mapping dictionary subclass. + """ + + class TestNonDictMapping(abc.Mapping): + def __init__(self, underlying_dict): + self._data = underlying_dict + + def __getitem__(self, key): + return self._data.__getitem__(key) + + def __iter__(self): + return self._data.__iter__() + + def __len__(self): + return self._data.__len__() + + return TestNonDictMapping diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index fc60c01d7b808..3f1c7b1c049cf 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -11,8 +11,8 @@ class DirNamesMixin: - _accessors = set() # type: Set[str] - _deprecations = frozenset() # type: FrozenSet[str] + _accessors: Set[str] = set() + _deprecations: FrozenSet[str] = frozenset() def _dir_deletions(self): """ @@ -35,7 +35,10 @@ def _dir_additions(self): def __dir__(self): """ - Provide method name lookup and completion + Provide method name lookup and completion. + + Notes + ----- Only provide 'public' methods. """ rv = set(dir(type(self))) @@ -45,31 +48,36 @@ def __dir__(self): class PandasDelegate: """ - An abstract base class for delegating methods/properties. + Abstract base class for delegating methods/properties. 
""" def _delegate_property_get(self, name, *args, **kwargs): - raise TypeError("You cannot access the property {name}".format(name=name)) + raise TypeError(f"You cannot access the property {name}") def _delegate_property_set(self, name, value, *args, **kwargs): - raise TypeError("The property {name} cannot be set".format(name=name)) + raise TypeError(f"The property {name} cannot be set") def _delegate_method(self, name, *args, **kwargs): - raise TypeError("You cannot call method {name}".format(name=name)) + raise TypeError(f"You cannot call method {name}") @classmethod - def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False): + def _add_delegate_accessors( + cls, delegate, accessors, typ: str, overwrite: bool = False + ): """ Add accessors to cls from the delegate class. Parameters ---------- - cls : the class to add the methods/properties to - delegate : the class to get methods/properties & doc-strings - accessors : string list of accessors to add - typ : 'property' or 'method' - overwrite : boolean, default False - Overwrite the method/property in the target class if it exists. + cls + Class to add the methods/properties to. + delegate + Class to get methods/properties and doc-strings. + accessors : list of str + List of accessors to add. + typ : {'property', 'method'} + overwrite : bool, default False + Overwrite the method/property in the target class if it exists. """ def _create_delegator_property(name): @@ -107,7 +115,7 @@ def f(self, *args, **kwargs): setattr(cls, name, f) -def delegate_names(delegate, accessors, typ, overwrite=False): +def delegate_names(delegate, accessors, typ: str, overwrite: bool = False): """ Add delegated names to a class using a class decorator. This provides an alternative usage to directly calling `_add_delegate_accessors` @@ -120,7 +128,7 @@ def delegate_names(delegate, accessors, typ, overwrite=False): accessors : Sequence[str] List of accessor to add. typ : {'property', 'method'} - overwrite : boolean, default False + overwrite : bool, default False Overwrite the method/property in the target class if it exists. Returns @@ -150,19 +158,25 @@ def add_delegate_accessors(cls): class CachedAccessor: """ - Custom property-like object (descriptor) for caching accessors. + Custom property-like object. + + A descriptor for caching accessors. Parameters ---------- name : str - The namespace this will be accessed under, e.g. ``df.foo``. + Namespace that will be accessed under, e.g. ``df.foo``. accessor : cls - The class with the extension methods. The class' __init__ method - should expect one of a ``Series``, ``DataFrame`` or ``Index`` as - the single argument ``data``. + Class with the extension methods. + + Notes + ----- + For accessor, The class's __init__ method assumes that one of + ``Series``, ``DataFrame`` or ``Index`` as the + single argument ``data``. 
""" - def __init__(self, name, accessor): + def __init__(self, name: str, accessor) -> None: self._name = name self._accessor = accessor @@ -183,9 +197,9 @@ def _register_accessor(name, cls): def decorator(accessor): if hasattr(cls, name): warnings.warn( - "registration of accessor {!r} under name {!r} for type " - "{!r} is overriding a preexisting attribute with the same " - "name.".format(accessor, name, cls), + f"registration of accessor {repr(accessor)} under name " + f"{repr(name)} for type {repr(cls)} is overriding a preexisting" + f"attribute with the same name.", UserWarning, stacklevel=2, ) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 98a090ef26f2a..39e8e9008a844 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,17 +3,18 @@ intended for public consumption """ from textwrap import dedent -from typing import Dict +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union from warnings import catch_warnings, simplefilter, warn import numpy as np -from pandas._libs import algos, hashtable as htable, lib +from pandas._libs import Timestamp, algos, hashtable as htable, lib from pandas._libs.tslib import iNaT -from pandas.util._decorators import Appender, Substitution, deprecate_kwarg +from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, + infer_dtype_from_array, maybe_promote, ) from pandas.core.dtypes.common import ( @@ -28,7 +29,6 @@ is_complex_dtype, is_datetime64_any_dtype, is_datetime64_ns_dtype, - is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, @@ -46,11 +46,14 @@ from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype -from pandas.core import common as com +import pandas.core.common as com from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices -_shared_docs = {} # type: Dict[str, str] +if TYPE_CHECKING: + from pandas import Series + +_shared_docs: Dict[str, str] = {} # --------------- # @@ -109,7 +112,7 @@ def _ensure_data(values, dtype=None): except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype - # and it is incompat this will fall thru to here + # and it is incompat this will fall through to here return ensure_object(values), "object" # datetimelike @@ -391,20 +394,15 @@ def isin(comps, values) -> np.ndarray: ndarray[bool] Same length as `comps`. 
""" - if not is_list_like(comps): raise TypeError( - "only list-like objects are allowed to be passed" - " to isin(), you passed a [{comps_type}]".format( - comps_type=type(comps).__name__ - ) + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{type(comps).__name__}]" ) if not is_list_like(values): raise TypeError( - "only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]".format( - values_type=type(values).__name__ - ) + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{type(values).__name__}]" ) if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): @@ -425,7 +423,7 @@ def isin(comps, values) -> np.ndarray: # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception - if len(comps) > 1000000 and not is_object_dtype(comps): + if len(comps) > 1_000_000 and not is_object_dtype(comps): f = np.in1d elif is_integer_dtype(comps): try: @@ -448,9 +446,11 @@ def isin(comps, values) -> np.ndarray: return f(comps, values) -def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=None): +def _factorize_array( + values, na_sentinel: int = -1, size_hint=None, na_value=None +) -> Tuple[np.ndarray, np.ndarray]: """ - Factorize an array-like to labels and uniques. + Factorize an array-like to codes and uniques. This doesn't do any coercion of types or unboxing before factorization. @@ -468,18 +468,16 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non Returns ------- - labels : ndarray + codes : ndarray uniques : ndarray """ hash_klass, values = _get_data_algo(values) table = hash_klass(size_hint or len(values)) - uniques, labels = table.factorize( - values, na_sentinel=na_sentinel, na_value=na_value - ) + uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) - labels = ensure_platform_int(labels) - return labels, uniques + codes = ensure_platform_int(codes) + return codes, uniques _shared_docs[ @@ -494,16 +492,16 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non Parameters ---------- - %(values)s%(sort)s%(order)s + %(values)s%(sort)s na_sentinel : int, default -1 Value to mark "not found". %(size_hint)s\ Returns ------- - labels : ndarray + codes : ndarray An integer ndarray that's an indexer into `uniques`. - ``uniques.take(labels)`` will have the same values as `values`. + ``uniques.take(codes)`` will have the same values as `values`. uniques : ndarray, Index, or Categorical The unique valid values. When `values` is Categorical, `uniques` is a Categorical. When `values` is some other pandas object, an @@ -525,27 +523,27 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non ``pd.factorize(values)``. The results are identical for methods like :meth:`Series.factorize`. - >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) - >>> labels + >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) + >>> codes array([0, 0, 1, 2, 0]) >>> uniques array(['b', 'a', 'c'], dtype=object) - With ``sort=True``, the `uniques` will be sorted, and `labels` will be + With ``sort=True``, the `uniques` will be sorted, and `codes` will be shuffled so that the relationship is the maintained. 
- >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) - >>> labels + >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) + >>> codes array([1, 1, 0, 2, 1]) >>> uniques array(['a', 'b', 'c'], dtype=object) - Missing values are indicated in `labels` with `na_sentinel` + Missing values are indicated in `codes` with `na_sentinel` (``-1`` by default). Note that missing values are never included in `uniques`. - >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) - >>> labels + >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) + >>> codes array([ 0, -1, 1, 2, 0]) >>> uniques array(['b', 'a', 'c'], dtype=object) @@ -555,8 +553,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non will differ. For Categoricals, a `Categorical` is returned. >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) - >>> labels, uniques = pd.factorize(cat) - >>> labels + >>> codes, uniques = pd.factorize(cat) + >>> codes array([0, 0, 1]) >>> uniques [a, c] @@ -569,8 +567,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non returned. >>> cat = pd.Series(['a', 'a', 'c']) - >>> labels, uniques = pd.factorize(cat) - >>> labels + >>> codes, uniques = pd.factorize(cat) + >>> codes array([0, 0, 1]) >>> uniques Index(['a', 'c'], dtype='object') @@ -585,18 +583,10 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non coerced to ndarrays before factorization. """ ), - order=dedent( - """\ - order : None - .. deprecated:: 0.23.0 - - This parameter has no effect and is deprecated. - """ - ), sort=dedent( """\ sort : bool, default False - Sort `uniques` and shuffle `labels` to maintain the + Sort `uniques` and shuffle `codes` to maintain the relationship. """ ), @@ -608,12 +598,13 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non ), ) @Appender(_shared_docs["factorize"]) -@deprecate_kwarg(old_arg_name="order", new_arg_name=None) -def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=None): +def factorize( + values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None +) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) - # 2.) factorizing labels and uniques - # 3.) Maybe boxing the output in an Index + # 2.) factorizing codes and uniques + # 3.) Maybe boxing the uniques in an Index # # Step 2 is dispatched to extension types (like Categorical). They are # responsible only for factorization. 
All data coercion, sorting and boxing @@ -624,7 +615,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint= if is_extension_array_dtype(values): values = extract_array(values) - labels, uniques = values.factorize(na_sentinel=na_sentinel) + codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype = _ensure_data(values) @@ -634,15 +625,13 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint= else: na_value = None - labels, uniques = _factorize_array( + codes, uniques = _factorize_array( values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value ) if sort and len(uniques) > 0: - from pandas.core.sorting import safe_sort - - uniques, labels = safe_sort( - uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False + uniques, codes = safe_sort( + uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) uniques = _reconstruct_data(uniques, dtype, original) @@ -655,7 +644,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint= uniques = Index(uniques) - return labels, uniques + return codes, uniques def value_counts( @@ -665,7 +654,7 @@ def value_counts( normalize: bool = False, bins=None, dropna: bool = True, -) -> ABCSeries: +) -> "Series": """ Compute a histogram of the counts of non-null values. @@ -767,7 +756,7 @@ def _value_counts_arraylike(values, dropna: bool): # ndarray like # TODO: handle uint8 - f = getattr(htable, "value_count_{dtype}".format(dtype=ndtype)) + f = getattr(htable, f"value_count_{ndtype}") keys, counts = f(values, dropna) mask = isna(values) @@ -803,11 +792,11 @@ def duplicated(values, keep="first") -> np.ndarray: values, _ = _ensure_data(values) ndtype = values.dtype.name - f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) + f = getattr(htable, f"duplicated_{ndtype}") return f(values, keep=keep) -def mode(values, dropna: bool = True) -> ABCSeries: +def mode(values, dropna: bool = True) -> "Series": """ Returns the mode(s) of an array. @@ -835,19 +824,19 @@ def mode(values, dropna: bool = True) -> ABCSeries: return Series(values.values.mode(dropna=dropna), name=values.name) return values.mode(dropna=dropna) - if dropna and is_datetimelike(values): + if dropna and needs_i8_conversion(values.dtype): mask = values.isnull() values = values[~mask] values, _ = _ensure_data(values) ndtype = values.dtype.name - f = getattr(htable, "mode_{dtype}".format(dtype=ndtype)) + f = getattr(htable, f"mode_{ndtype}") result = f(values, dropna=dropna) try: result = np.sort(result) - except TypeError as e: - warn("Unable to sort modes: {error}".format(error=e)) + except TypeError as err: + warn(f"Unable to sort modes: {err}") result = _reconstruct_data(result, original.dtype, original) return Series(result) @@ -1032,7 +1021,8 @@ def quantile(x, q, interpolation_method="fraction"): values = np.sort(x) def _interpolate(a, b, fraction): - """Returns the point at the given fraction between a and b, where + """ + Returns the point at the given fraction between a and b, where 'fraction' must be between 0 and 1. 
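With the labels-to-codes rename applied throughout factorize above, the public behaviour is unchanged; a quick sanity check with arbitrary values:

import pandas as pd

codes, uniques = pd.factorize(["b", None, "a", "c", "b"], sort=True)
# codes:   array([ 1, -1,  0,  2,  1])   -- -1 marks the missing value
# uniques: array(['a', 'b', 'c'], dtype=object)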
""" return a + (b - a) * fraction @@ -1089,7 +1079,7 @@ def nsmallest(self): return self.compute("nsmallest") @staticmethod - def is_valid_dtype_n_method(dtype): + def is_valid_dtype_n_method(dtype) -> bool: """ Helper function to determine if dtype is valid for nsmallest/nlargest methods @@ -1119,10 +1109,7 @@ def compute(self, method): n = self.n dtype = self.obj.dtype if not self.is_valid_dtype_n_method(dtype): - raise TypeError( - "Cannot use method '{method}' with " - "dtype {dtype}".format(method=method, dtype=dtype) - ) + raise TypeError(f"Cannot use method '{method}' with dtype {dtype}") if n <= 0: return self.obj[[]] @@ -1155,7 +1142,7 @@ def compute(self, method): n = min(n, narr) kth_val = algos.kth_smallest(arr.copy(), n - 1) - ns, = np.nonzero(arr <= kth_val) + (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")] if self.keep != "all": @@ -1203,14 +1190,13 @@ def compute(self, method): dtype = frame[column].dtype if not self.is_valid_dtype_n_method(dtype): raise TypeError( - ( - "Column {column!r} has dtype {dtype}, cannot use method " - "{method!r} with this dtype" - ).format(column=column, dtype=dtype, method=method) + f"Column {repr(column)} has dtype {dtype}, " + f"cannot use method {repr(method)} with this dtype" ) def get_indexer(current_indexer, other_indexer): - """Helper function to concat `current_indexer` and `other_indexer` + """ + Helper function to concat `current_indexer` and `other_indexer` depending on `method` """ if method == "nsmallest": @@ -1440,7 +1426,9 @@ def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info): } -def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis: int = 0, mask_info=None): +def _get_take_nd_function( + ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None +): if ndim <= 2: tup = (arr_dtype.name, out_dtype.name) if ndim == 1: @@ -1474,7 +1462,7 @@ def func2(arr, indexer, out, fill_value=np.nan): return func2 -def take(arr, indices, axis=0, allow_fill: bool = False, fill_value=None): +def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None): """ Take elements from an array. @@ -1568,13 +1556,7 @@ def take(arr, indices, axis=0, allow_fill: bool = False, fill_value=None): def take_nd( - arr, - indexer, - axis=0, - out=None, - fill_value=np.nan, - mask_info=None, - allow_fill: bool = True, + arr, indexer, axis: int = 0, out=None, fill_value=np.nan, allow_fill: bool = True ): """ Specialized Cython take which sets NaN values in one pass @@ -1597,10 +1579,6 @@ def take_nd( maybe_promote to determine this type for any fill_value fill_value : any, default np.nan Fill value to replace -1 values with - mask_info : tuple of (ndarray, boolean) - If provided, value should correspond to: - (indexer != -1, (indexer != -1).any()) - If not provided, it will be computed internally if necessary allow_fill : boolean, default True If False, indexer is assumed to contain no -1 values so no filling will be done. This short-circuits computation of a mask. Result is @@ -1611,6 +1589,7 @@ def take_nd( subarray : array-like May be the same type as the input, or cast to an ndarray. 
""" + mask_info = None if is_extension_array_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) @@ -1632,12 +1611,9 @@ def take_nd( dtype, fill_value = maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer - if mask_info is not None: - mask, needs_masking = mask_info - else: - mask = indexer == -1 - needs_masking = mask.any() - mask_info = mask, needs_masking + mask = indexer == -1 + needs_masking = mask.any() + mask_info = mask, needs_masking if needs_masking: if out is not None and out.dtype != dtype: raise TypeError("Incompatible type for fill_value") @@ -1688,7 +1664,7 @@ def take_nd( def take_2d_multi(arr, indexer, fill_value=np.nan): """ - Specialized Cython take which sets NaN values in one pass + Specialized Cython take which sets NaN values in one pass. """ # This is only called from one place in DataFrame._reindex_multi, # so we know indexer is well-behaved. @@ -1818,12 +1794,12 @@ def searchsorted(arr, value, side="left", sorter=None): elif not ( is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) ): - from pandas.core.series import Series - # E.g. if `arr` is an array with dtype='datetime64[ns]' # and `value` is a pd.Timestamp, we may need to convert value - value_ser = Series(value)._values + value_ser = array([value]) if is_scalar(value) else array(value) value = value_ser[0] if is_scalar(value) else value_ser + if isinstance(value, Timestamp) and value.tzinfo is None: + value = value.to_datetime64() result = arr.searchsorted(value, side=side, sorter=sorter) return result @@ -1881,8 +1857,9 @@ def diff(arr, n: int, axis: int = 0): out_arr[tuple(na_indexer)] = na if arr.ndim == 2 and arr.dtype.name in _diff_special: - f = algos.diff_2d - f(arr, out_arr, n, axis) + # TODO: can diff_2d dtype specialization troubles be fixed by defining + # out_arr inside diff_2d? + algos.diff_2d(arr, out_arr, n, axis) else: # To keep mypy happy, _res_indexer is a list while res_indexer is # a tuple, ditto for lag_indexer. @@ -1919,3 +1896,139 @@ def diff(arr, n: int, axis: int = 0): out_arr = out_arr.astype("int64").view("timedelta64[ns]") return out_arr + + +# -------------------------------------------------------------------- +# Helper functions + +# Note: safe_sort is in algorithms.py instead of sorting.py because it is +# low-dependency, is used in this module, and used private methods from +# this module. +def safe_sort( + values, + codes=None, + na_sentinel: int = -1, + assume_unique: bool = False, + verify: bool = True, +) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: + """ + Sort ``values`` and reorder corresponding ``codes``. + + ``values`` should be unique if ``codes`` is not None. + Safe for use with mixed types (int, str), orders ints before strs. + + Parameters + ---------- + values : list-like + Sequence; must be unique if ``codes`` is not None. + codes : list_like, optional + Indices to ``values``. All out of bound indices are treated as + "not found" and will be masked with ``na_sentinel``. + na_sentinel : int, default -1 + Value in ``codes`` to mark "not found". + Ignored when ``codes`` is None. + assume_unique : bool, default False + When True, ``values`` are assumed to be unique, which can speed up + the calculation. Ignored when ``codes`` is None. + verify : bool, default True + Check if codes are out of bound for the values and put out of bound + codes equal to na_sentinel. 
If ``verify=False``, it is assumed there + are no out of bound codes. Ignored when ``codes`` is None. + + .. versionadded:: 0.25.0 + + Returns + ------- + ordered : ndarray + Sorted ``values`` + new_codes : ndarray + Reordered ``codes``; returned when ``codes`` is not None. + + Raises + ------ + TypeError + * If ``values`` is not list-like or if ``codes`` is neither None + nor list-like + * If ``values`` cannot be sorted + ValueError + * If ``codes`` is not None and ``values`` contain duplicates. + """ + if not is_list_like(values): + raise TypeError( + "Only list-like objects are allowed to be passed to safe_sort as values" + ) + + if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): + # don't convert to string types + dtype, _ = infer_dtype_from_array(values) + values = np.asarray(values, dtype=dtype) + + def sort_mixed(values): + # order ints before strings, safe in py3 + str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return np.concatenate([nums, np.asarray(strs, dtype=object)]) + + sorter = None + if ( + not is_extension_array_dtype(values) + and lib.infer_dtype(values, skipna=False) == "mixed-integer" + ): + # unorderable in py3 if mixed str/int + ordered = sort_mixed(values) + else: + try: + sorter = values.argsort() + ordered = values.take(sorter) + except TypeError: + # try this anyway + ordered = sort_mixed(values) + + # codes: + + if codes is None: + return ordered + + if not is_list_like(codes): + raise TypeError( + "Only list-like objects or None are allowed to " + "be passed to safe_sort as codes" + ) + codes = ensure_platform_int(np.asarray(codes)) + + from pandas import Index + + if not assume_unique and not Index(values).is_unique: + raise ValueError("values should be unique if codes is not None") + + if sorter is None: + # mixed types + hash_klass, values = _get_data_algo(values) + t = hash_klass(len(values)) + t.map_locations(values) + sorter = ensure_platform_int(t.lookup(ordered)) + + if na_sentinel == -1: + # take_1d is faster, but only works for na_sentinels of -1 + order2 = sorter.argsort() + new_codes = take_1d(order2, codes, fill_value=-1) + if verify: + mask = (codes < -len(values)) | (codes >= len(values)) + else: + mask = None + else: + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + # Out of bound indices will be masked with `na_sentinel` next, so we + # may deal with them here without performance loss using `mode='wrap'` + new_codes = reverse_indexer.take(codes, mode="wrap") + + mask = codes == na_sentinel + if verify: + mask = mask | (codes < -len(values)) | (codes >= len(values)) + + if mask is not None: + np.putmask(new_codes, mask, na_sentinel) + + return ordered, ensure_platform_int(new_codes) diff --git a/pandas/core/api.py b/pandas/core/api.py index 04f2f84c92a15..b0b65f9d0be34 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -1,6 +1,7 @@ # flake8: noqa -import numpy as np +from pandas._libs import NaT, Period, Timedelta, Timestamp +from pandas._libs.missing import NA from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -10,9 +11,9 @@ ) from pandas.core.dtypes.missing import isna, isnull, notna, notnull -# TODO: Remove get_dummies import when statsmodels updates #18264 from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.arrays import Categorical +from pandas.core.arrays.boolean import BooleanDtype from 
pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -26,7 +27,7 @@ from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array from pandas.core.groupby import Grouper, NamedAgg -from pandas.core.index import ( +from pandas.core.indexes.api import ( CategoricalIndex, DatetimeIndex, Float64Index, @@ -34,18 +35,16 @@ Int64Index, IntervalIndex, MultiIndex, - NaT, PeriodIndex, RangeIndex, TimedeltaIndex, UInt64Index, ) -from pandas.core.indexes.datetimes import Timestamp, bdate_range, date_range +from pandas.core.indexes.datetimes import bdate_range, date_range from pandas.core.indexes.interval import Interval, interval_range -from pandas.core.indexes.period import Period, period_range -from pandas.core.indexes.timedeltas import Timedelta, timedelta_range +from pandas.core.indexes.period import period_range +from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.indexing import IndexSlice -from pandas.core.reshape.reshape import get_dummies from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime from pandas.core.tools.numeric import to_numeric diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f402154dc91ca..14a3c3c008e92 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,4 +1,6 @@ +import abc import inspect +from typing import TYPE_CHECKING, Any, Dict, Iterator, Tuple, Type, Union import numpy as np @@ -7,28 +9,34 @@ from pandas.core.dtypes.common import ( is_dict_like, - is_extension_type, + is_extension_array_dtype, is_list_like, is_sequence, ) -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries -from pandas.io.formats.printing import pprint_thing +from pandas.core.construction import create_series_with_explicit_dtype + +if TYPE_CHECKING: + from pandas import DataFrame, Series, Index + +ResType = Dict[int, Any] def frame_apply( - obj, + obj: "DataFrame", func, axis=0, - raw=False, + raw: bool = False, result_type=None, - ignore_failures=False, + ignore_failures: bool = False, args=None, kwds=None, ): """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) + klass: Type[FrameApply] if axis == 0: klass = FrameRowApply elif axis == 1: @@ -45,8 +53,45 @@ def frame_apply( ) -class FrameApply: - def __init__(self, obj, func, raw, result_type, ignore_failures, args, kwds): +class FrameApply(metaclass=abc.ABCMeta): + + # --------------------------------------------------------------- + # Abstract Methods + axis: int + + @property + @abc.abstractmethod + def result_index(self) -> "Index": + pass + + @property + @abc.abstractmethod + def result_columns(self) -> "Index": + pass + + @property + @abc.abstractmethod + def series_generator(self) -> Iterator["Series"]: + pass + + @abc.abstractmethod + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: + pass + + # --------------------------------------------------------------- + + def __init__( + self, + obj: "DataFrame", + func, + raw: bool, + result_type, + ignore_failures: bool, + args, + kwds, + ): self.obj = obj self.raw = raw self.ignore_failures = ignore_failures @@ -72,17 +117,16 @@ def f(x): self.f = f - # results - self.result = None - self.res_index = None - self.res_columns = None + @property + def res_columns(self) -> "Index": + return self.result_columns @property - def columns(self): + def columns(self) -> "Index": return self.obj.columns @property - 
def index(self): + def index(self) -> "Index": return self.obj.index @cache_readonly @@ -90,11 +134,11 @@ def values(self): return self.obj.values @cache_readonly - def dtypes(self): + def dtypes(self) -> "Series": return self.obj.dtypes @property - def agg_axis(self): + def agg_axis(self) -> "Index": return self.obj._get_agg_axis(self.axis) def get_result(self): @@ -129,7 +173,7 @@ def get_result(self): # broadcasting if self.result_type == "broadcast": - return self.apply_broadcast() + return self.apply_broadcast(self.obj) # one axis empty elif not all(self.obj.shape): @@ -161,7 +205,7 @@ def apply_empty_result(self): if not should_reduce: try: - r = self.f(Series([])) + r = self.f(Series([], dtype=np.float64)) except Exception: pass else: @@ -169,7 +213,7 @@ def apply_empty_result(self): if should_reduce: if len(self.agg_axis): - r = self.f(Series([])) + r = self.f(Series([], dtype=np.float64)) else: r = np.nan @@ -185,6 +229,8 @@ def apply_raw(self): if "Function does not reduce" not in str(err): # catch only ValueError raised intentionally in libreduction raise + # We expect np.apply_along_axis to give a two-dimensional result, or + # also raise. result = np.apply_along_axis(self.f, self.axis, self.values) # TODO: mixed type case @@ -193,7 +239,7 @@ def apply_raw(self): else: return self.obj._constructor_sliced(result, index=self.agg_axis) - def apply_broadcast(self, target): + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": result_values = np.empty_like(target.values) # axis which we want to compare compliance @@ -230,10 +276,10 @@ def apply_standard(self): # as demonstrated in gh-12244 if ( self.result_type in ["reduce", None] - and not self.dtypes.apply(is_extension_type).any() + and not self.dtypes.apply(is_extension_array_dtype).any() # Disallow complex_internals since libreduction shortcut # cannot handle MultiIndex - and not self.agg_axis._has_complex_internals + and not isinstance(self.agg_axis, ABCMultiIndex) ): values = self.values @@ -265,16 +311,15 @@ def apply_standard(self): return self.obj._constructor_sliced(result, index=labels) # compute the result using the series generator - self.apply_series_generator() + results, res_index = self.apply_series_generator() # wrap results - return self.wrap_results() + return self.wrap_results(results, res_index) - def apply_series_generator(self): + def apply_series_generator(self) -> Tuple[ResType, "Index"]: series_gen = self.series_generator res_index = self.result_index - i = None keys = [] results = {} if self.ignore_failures: @@ -293,36 +338,35 @@ def apply_series_generator(self): res_index = res_index.take(successes) else: - try: - for i, v in enumerate(series_gen): - results[i] = self.f(v) - keys.append(v.name) - except Exception as err: - if hasattr(err, "args"): - - # make sure i is defined - if i is not None: - k = res_index[i] - err.args = err.args + ( - "occurred at index %s" % pprint_thing(k), - ) - raise + for i, v in enumerate(series_gen): + results[i] = self.f(v) + keys.append(v.name) - self.results = results - self.res_index = res_index - self.res_columns = self.result_columns + return results, res_index - def wrap_results(self): - results = self.results + def wrap_results( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: + from pandas import Series # see if we can infer the results if len(results) > 0 and 0 in results and is_sequence(results[0]): - return self.wrap_results_for_axis() + return self.wrap_results_for_axis(results, res_index) # dict of scalars - result = 
self.obj._constructor_sliced(results) - result.index = self.res_index + + # the default dtype of an empty Series will be `object`, but this + # code can be hit by df.mean() where the result should have dtype + # float64 even if it's an empty Series. + constructor_sliced = self.obj._constructor_sliced + if constructor_sliced is Series: + result = create_series_with_explicit_dtype( + results, dtype_if_empty=np.float64 + ) + else: + result = constructor_sliced(results) + result.index = res_index return result @@ -330,33 +374,34 @@ def wrap_results(self): class FrameRowApply(FrameApply): axis = 0 - def apply_broadcast(self): - return super().apply_broadcast(self.obj) + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": + return super().apply_broadcast(target) @property def series_generator(self): return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) @property - def result_index(self): + def result_index(self) -> "Index": return self.columns @property - def result_columns(self): + def result_columns(self) -> "Index": return self.index - def wrap_results_for_axis(self): + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> "DataFrame": """ return the results for the rows """ - results = self.results result = self.obj._constructor(data=results) if not isinstance(results[0], ABCSeries): if len(result.index) == len(self.res_columns): result.index = self.res_columns - if len(result.columns) == len(self.res_index): - result.columns = self.res_index + if len(result.columns) == len(res_index): + result.columns = res_index return result @@ -364,8 +409,8 @@ def wrap_results_for_axis(self): class FrameColumnApply(FrameApply): axis = 1 - def apply_broadcast(self): - result = super().apply_broadcast(self.obj.T) + def apply_broadcast(self, target: "DataFrame") -> "DataFrame": + result = super().apply_broadcast(target.T) return result.T @property @@ -377,43 +422,44 @@ def series_generator(self): ) @property - def result_index(self): + def result_index(self) -> "Index": return self.index @property - def result_columns(self): + def result_columns(self) -> "Index": return self.columns - def wrap_results_for_axis(self): + def wrap_results_for_axis( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: """ return the results for the columns """ - results = self.results + result: Union["Series", "DataFrame"] # we have requested to expand if self.result_type == "expand": - result = self.infer_to_same_shape() + result = self.infer_to_same_shape(results, res_index) # we have a non-series and don't want inference elif not isinstance(results[0], ABCSeries): from pandas import Series result = Series(results) - result.index = self.res_index + result.index = res_index # we may want to infer results else: - result = self.infer_to_same_shape() + result = self.infer_to_same_shape(results, res_index) return result - def infer_to_same_shape(self): + def infer_to_same_shape(self, results: ResType, res_index: "Index") -> "DataFrame": """ infer the results to the same shape as the input object """ - results = self.results result = self.obj._constructor(data=results) result = result.T # set the index - result.index = self.res_index + result.index = res_index # infer dtypes result = result.infer_objects() diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 868118bac6a7b..bf3469924a700 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,14 +1,36 @@ -from .base import ( # noqa: F401 +from 
pandas.core.arrays.base import ( ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin, + try_cast_to_ea, ) -from .categorical import Categorical # noqa: F401 -from .datetimes import DatetimeArray # noqa: F401 -from .integer import IntegerArray, integer_array # noqa: F401 -from .interval import IntervalArray # noqa: F401 -from .numpy_ import PandasArray, PandasDtype # noqa: F401 -from .period import PeriodArray, period_array # noqa: F401 -from .sparse import SparseArray # noqa: F401 -from .string_ import StringArray # noqa: F401 -from .timedeltas import TimedeltaArray # noqa: F401 +from pandas.core.arrays.boolean import BooleanArray +from pandas.core.arrays.categorical import Categorical +from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.integer import IntegerArray, integer_array +from pandas.core.arrays.interval import IntervalArray +from pandas.core.arrays.numpy_ import PandasArray, PandasDtype +from pandas.core.arrays.period import PeriodArray, period_array +from pandas.core.arrays.sparse import SparseArray +from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.timedeltas import TimedeltaArray + +__all__ = [ + "ExtensionArray", + "ExtensionOpsMixin", + "ExtensionScalarOpsMixin", + "try_cast_to_ea", + "BooleanArray", + "Categorical", + "DatetimeArray", + "IntegerArray", + "integer_array", + "IntervalArray", + "PandasArray", + "PandasDtype", + "PeriodArray", + "period_array", + "SparseArray", + "StringArray", + "TimedeltaArray", +] diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py new file mode 100644 index 0000000000000..e0d33bebeb421 --- /dev/null +++ b/pandas/core/arrays/_arrow_utils.py @@ -0,0 +1,124 @@ +from distutils.version import LooseVersion +import json + +import numpy as np +import pyarrow + +from pandas.core.arrays.interval import _VALID_CLOSED + +_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") + + +def pyarrow_array_to_numpy_and_mask(arr, dtype): + """ + Convert a primitive pyarrow.Array to a numpy array and boolean mask based + on the buffers of the Array. 
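The new ``pyarrow_array_to_numpy_and_mask`` helper introduced above splits a primitive Arrow array into its raw values and a validity mask; a hedged sketch assuming pyarrow is installed:

import numpy as np
import pyarrow as pa

from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

arr = pa.array([1, None, 3], type=pa.int64())
data, mask = pyarrow_array_to_numpy_and_mask(arr, np.dtype("int64"))
# mask is a boolean validity mask: array([ True, False,  True])
# data holds the raw int64 buffer; data[mask] -> array([1, 3])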
+ + Parameters + ---------- + arr : pyarrow.Array + dtype : numpy.dtype + + Returns + ------- + (data, mask) + Tuple of two numpy arrays with the raw data (with specified dtype) and + a boolean mask (validity mask, so False means missing) + """ + buflist = arr.buffers() + data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)] + bitmask = buflist[0] + if bitmask is not None: + mask = pyarrow.BooleanArray.from_buffers( + pyarrow.bool_(), len(arr), [None, bitmask] + ) + mask = np.asarray(mask) + else: + mask = np.ones(len(arr), dtype=bool) + return data, mask + + +if _pyarrow_version_ge_015: + # the pyarrow extension types are only available for pyarrow 0.15+ + + class ArrowPeriodType(pyarrow.ExtensionType): + def __init__(self, freq): + # attributes need to be set first before calling + # super init (as that calls serialize) + self._freq = freq + pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self): + metadata = {"freq": self.freq} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + return ArrowPeriodType(metadata["freq"]) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return type(self) == type(other) and self.freq == other.freq + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), self.freq)) + + # register the type with a dummy instance + _period_type = ArrowPeriodType("D") + pyarrow.register_extension_type(_period_type) + + class ArrowIntervalType(pyarrow.ExtensionType): + def __init__(self, subtype, closed): + # attributes need to be set first before calling + # super init (as that calls serialize) + assert closed in _VALID_CLOSED + self._closed = closed + if not isinstance(subtype, pyarrow.DataType): + subtype = pyarrow.type_for_alias(str(subtype)) + self._subtype = subtype + + storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) + pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") + + @property + def subtype(self): + return self._subtype + + @property + def closed(self): + return self._closed + + def __arrow_ext_serialize__(self): + metadata = {"subtype": str(self.subtype), "closed": self.closed} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + subtype = pyarrow.type_for_alias(metadata["subtype"]) + closed = metadata["closed"] + return ArrowIntervalType(subtype, closed) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return ( + type(self) == type(other) + and self.subtype == other.subtype + and self.closed == other.closed + ) + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), str(self.subtype), self.closed)) + + # register the type with a dummy instance + _interval_type = ArrowIntervalType(pyarrow.int64(), "left") + pyarrow.register_extension_type(_interval_type) diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 15ff1432f16e2..20e4cf70eddcf 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -114,10 +114,7 @@ def _generate_range_overflow_safe( assert side in ["start", "end"] i64max = np.uint64(np.iinfo(np.int64).max) - msg = ( - "Cannot generate range with {side}={endpoint} and " - 
"periods={periods}".format(side=side, endpoint=endpoint, periods=periods) - ) + msg = f"Cannot generate range with {side}={endpoint} and periods={periods}" with np.errstate(over="raise"): # if periods * strides cannot be multiplied within the *uint64* bounds, @@ -182,7 +179,6 @@ def _generate_range_overflow_safe_signed( # watch out for very special case in which we just slightly # exceed implementation bounds, but when passing the result to # np.arange will get a result slightly within the bounds - assert endpoint >= 0 result = np.uint64(endpoint) + np.uint64(addend) i64max = np.uint64(np.iinfo(np.int64).max) assert result > i64max @@ -190,7 +186,5 @@ def _generate_range_overflow_safe_signed( return result raise OutOfBoundsDatetime( - "Cannot generate range with " - "{side}={endpoint} and " - "periods={periods}".format(side=side, endpoint=endpoint, periods=periods) + f"Cannot generate range with {side}={endpoint} and periods={periods}" ) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 08901df963f20..9723343ea7af5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -10,6 +10,8 @@ import numpy as np +from pandas._libs import lib +from pandas._typing import ArrayLike from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -21,15 +23,35 @@ from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna -from pandas._typing import ArrayLike from pandas.core import ops from pandas.core.algorithms import _factorize_array, unique from pandas.core.missing import backfill_1d, pad_1d from pandas.core.sorting import nargsort -_not_implemented_message = "{} does not implement {}." +_extension_array_shared_docs: Dict[str, str] = dict() + + +def try_cast_to_ea(cls_or_instance, obj, dtype=None): + """ + Call to `_from_sequence` that returns the object unchanged on Exception. -_extension_array_shared_docs = dict() # type: Dict[str, str] + Parameters + ---------- + cls_or_instance : ExtensionArray subclass or instance + obj : arraylike + Values to pass to cls._from_sequence + dtype : ExtensionDtype, optional + + Returns + ------- + ExtensionArray or obj + """ + try: + result = cls_or_instance._from_sequence(obj, dtype=dtype) + except Exception: + # We can't predict what downstream EA constructors may raise + result = obj + return result class ExtensionArray: @@ -307,9 +329,7 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: # __init__ method coerces that value, then so should __setitem__ # Note, also, that Series/DataFrame.where internally use __setitem__ # on a copy of the data. - raise NotImplementedError( - _not_implemented_message.format(type(self), "__setitem__") - ) + raise NotImplementedError(f"{type(self)} does not implement __setitem__.") def __len__(self) -> int: """ @@ -331,6 +351,39 @@ def __iter__(self): for i in range(len(self)): yield self[i] + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + """ + Convert to a NumPy ndarray. + + .. versionadded:: 1.0.0 + + This is similar to :meth:`numpy.asarray`, but may provide additional control + over how the conversion is done. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + another array. 
Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + Returns + ------- + numpy.ndarray + """ + result = np.asarray(self, dtype=dtype) + if copy or na_value is not lib.no_default: + result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result + # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ @@ -428,7 +481,9 @@ def _values_for_argsort(self) -> np.ndarray: # Note: this is used in `ExtensionArray.argsort`. return np.array(self) - def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): + def argsort( + self, ascending: bool = True, kind: str = "quicksort", *args, **kwargs + ) -> np.ndarray: """ Return the indices that would sort this array. @@ -444,7 +499,7 @@ def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): Returns ------- - index_array : ndarray + ndarray Array of indices that sort ``self``. If NaN values are contained, NaN values are placed at the end. @@ -495,8 +550,8 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): raise ValueError( - "Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self)) + f"Length of 'value' does not match. Got ({len(value)}) " + f"expected {len(self)}" ) value = value[mask] @@ -667,11 +722,11 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra Parameters ---------- na_sentinel : int, default -1 - Value to use in the `labels` array to indicate missing values. + Value to use in the `codes` array to indicate missing values. Returns ------- - labels : ndarray + codes : ndarray An integer NumPy array that's an indexer into the original ExtensionArray. uniques : ExtensionArray @@ -701,12 +756,12 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra # Complete control over factorization. arr, na_value = self._values_for_factorize() - labels, uniques = _factorize_array( + codes, uniques = _factorize_array( arr, na_sentinel=na_sentinel, na_value=na_value ) uniques = self._from_factorized(uniques, self) - return labels, uniques + return codes, uniques _extension_array_shared_docs[ "repeat" @@ -890,20 +945,17 @@ def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: # Printing # ------------------------------------------------------------------------ - def __repr__(self): + def __repr__(self) -> str: from pandas.io.formats.printing import format_object_summary - template = "{class_name}{data}\nLength: {length}, dtype: {dtype}" # the short repr has no trailing newline, while the truncated # repr does. 
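# Editor's aside, not part of the patch: the base-class to_numpy added above can be
# exercised through Categorical, which is not expected to override it; this assumes
# a pandas build that includes the patch.
import pandas as pd

cat = pd.Categorical(["a", "b", None])
cat.to_numpy()                    # -> array(['a', 'b', nan], dtype=object)
cat.to_numpy(na_value="missing")  # -> array(['a', 'b', 'missing'], dtype=object)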
So we include a newline in our template, and strip # any trailing newlines from format_object_summary data = format_object_summary( self, self._formatter(), indent_for_name=False ).rstrip(", \n") - class_name = "<{}>\n".format(self.__class__.__name__) - return template.format( - class_name=class_name, data=data, length=len(self), dtype=self.dtype - ) + class_name = f"<{type(self).__name__}>\n" + return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}" def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: """Formatting function for scalar values. @@ -1019,11 +1071,7 @@ def _reduce(self, name, skipna=True, **kwargs): ------ TypeError : subclass does not define reductions """ - raise TypeError( - "cannot perform {name} with type {dtype}".format( - name=name, dtype=self.dtype - ) - ) + raise TypeError(f"cannot perform {name} with type {self.dtype}") class ExtensionOpsMixin: @@ -1065,6 +1113,15 @@ def _add_comparison_ops(cls): cls.__le__ = cls._create_comparison_method(operator.le) cls.__ge__ = cls._create_comparison_method(operator.ge) + @classmethod + def _add_logical_ops(cls): + cls.__and__ = cls._create_logical_method(operator.and_) + cls.__rand__ = cls._create_logical_method(ops.rand_) + cls.__or__ = cls._create_logical_method(operator.or_) + cls.__ror__ = cls._create_logical_method(ops.ror_) + cls.__xor__ = cls._create_logical_method(operator.xor) + cls.__rxor__ = cls._create_logical_method(ops.rxor) + class ExtensionScalarOpsMixin(ExtensionOpsMixin): """ @@ -1156,9 +1213,9 @@ def _maybe_convert(arr): # https://github.com/pandas-dev/pandas/issues/22850 # We catch all regular exceptions here, and fall back # to an ndarray. - try: - res = self._from_sequence(arr) - except Exception: + res = try_cast_to_ea(self, arr) + if not isinstance(res, type(self)): + # exception raised in _from_sequence; ensure we have ndarray res = np.asarray(arr) else: res = np.asarray(arr) @@ -1166,10 +1223,9 @@ def _maybe_convert(arr): if op.__name__ in {"divmod", "rdivmod"}: a, b = zip(*res) - res = _maybe_convert(a), _maybe_convert(b) - else: - res = _maybe_convert(res) - return res + return _maybe_convert(a), _maybe_convert(b) + + return _maybe_convert(res) op_name = ops._get_op_name(op, True) return set_function_name(_binop, op_name, cls) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py new file mode 100644 index 0000000000000..fa1cbc87cc5c1 --- /dev/null +++ b/pandas/core/arrays/boolean.py @@ -0,0 +1,770 @@ +import numbers +from typing import TYPE_CHECKING, Any, Tuple, Type +import warnings + +import numpy as np + +from pandas._libs import lib, missing as libmissing +from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_scalar, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna, notna + +from pandas.core import nanops, ops + +from .masked import BaseMaskedArray + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +@register_extension_dtype +class BooleanDtype(ExtensionDtype): + """ + Extension dtype for boolean data. + + .. versionadded:: 1.0.0 + + .. 
warning:: + + BooleanDtype is considered experimental. The implementation and + parts of the API may change without warning. + + Attributes + ---------- + None + + Methods + ------- + None + + Examples + -------- + >>> pd.BooleanDtype() + BooleanDtype + """ + + name = "boolean" + + @property + def na_value(self) -> "Scalar": + """ + BooleanDtype uses :attr:`pandas.NA` as the missing NA value. + + .. warning:: + + `na_value` may change in a future release. + """ + return libmissing.NA + + @property + def type(self) -> Type: + return np.bool_ + + @property + def kind(self) -> str: + return "b" + + @classmethod + def construct_array_type(cls) -> "Type[BooleanArray]": + return BooleanArray + + def __repr__(self) -> str: + return "BooleanDtype" + + @property + def _is_boolean(self) -> bool: + return True + + def __from_arrow__(self, array): + """Construct BooleanArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # TODO should optimize this without going through object array + bool_arr = BooleanArray._from_sequence(np.array(arr)) + results.append(bool_arr) + + return BooleanArray._concat_same_type(results) + + +def coerce_to_array(values, mask=None, copy: bool = False): + """ + Coerce the input values array to numpy arrays with a mask. + + Parameters + ---------- + values : 1D list-like + mask : bool 1D array, optional + copy : bool, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + if isinstance(values, BooleanArray): + if mask is not None: + raise ValueError("cannot pass mask for BooleanArray input") + values, mask = values._data, values._mask + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + mask_values = None + if isinstance(values, np.ndarray) and values.dtype == np.bool_: + if copy: + values = values.copy() + elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype): + mask_values = isna(values) + + values_bool = np.zeros(len(values), dtype=bool) + values_bool[~mask_values] = values[~mask_values].astype(bool) + + if not np.all( + values_bool[~mask_values].astype(values.dtype) == values[~mask_values] + ): + raise TypeError("Need to pass bool-like values") + + values = values_bool + else: + values_object = np.asarray(values, dtype=object) + + inferred_dtype = lib.infer_dtype(values_object, skipna=True) + integer_like = ("floating", "integer", "mixed-integer-float") + if inferred_dtype not in ("boolean", "empty") + integer_like: + raise TypeError("Need to pass bool-like values") + + mask_values = isna(values_object) + values = np.zeros(len(values), dtype=bool) + values[~mask_values] = values_object[~mask_values].astype(bool) + + # if the values were integer-like, validate it were actually 0/1's + if inferred_dtype in integer_like: + if not np.all( + values[~mask_values].astype(float) + == values_object[~mask_values].astype(float) + ): + raise TypeError("Need to pass bool-like values") + + if mask is None and mask_values is None: + mask = np.zeros(len(values), dtype=bool) + elif mask is None: + mask = mask_values + else: + if isinstance(mask, np.ndarray) and mask.dtype == np.bool_: + if mask_values is not None: + mask = mask | mask_values + else: + if copy: + mask = mask.copy() + else: + mask = np.array(mask, dtype=bool) + if mask_values is not None: + mask = mask | mask_values + + if not values.ndim == 1: + raise ValueError("values 
must be a 1D list-like") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D list-like") + + return values, mask + + +class BooleanArray(BaseMaskedArray): + """ + Array of boolean (True/False) data with missing values. + + This is a pandas Extension array for boolean data, under the hood + represented by 2 numpy arrays: a boolean array with the data and + a boolean array with the mask (True indicating missing). + + BooleanArray implements Kleene logic (sometimes called three-value + logic) for logical operations. See :ref:`boolean.kleene` for more. + + To construct an BooleanArray from generic array-like input, use + :func:`pandas.array` specifying ``dtype="boolean"`` (see examples + below). + + .. versionadded:: 1.0.0 + + .. warning:: + + BooleanArray is considered experimental. The implementation and + parts of the API may change without warning. + + Parameters + ---------- + values : numpy.ndarray + A 1-d boolean-dtype array with the data. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values (True + indicates missing). + copy : bool, default False + Whether to copy the `values` and `mask` arrays. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + BooleanArray + + Examples + -------- + Create an BooleanArray with :func:`pandas.array`: + + >>> pd.array([True, False, None], dtype="boolean") + + [True, False, ] + Length: 3, dtype: boolean + """ + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = False + + def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): + raise TypeError( + "values should be boolean numpy array. Use " + "the 'array' function instead" + ) + if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + raise TypeError( + "mask should be boolean numpy array. Use " + "the 'array' function instead" + ) + if not values.ndim == 1: + raise ValueError("values must be a 1D array") + if not mask.ndim == 1: + raise ValueError("mask must be a 1D array") + + if copy: + values = values.copy() + mask = mask.copy() + + self._data = values + self._mask = mask + self._dtype = BooleanDtype() + + @property + def dtype(self): + return self._dtype + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy: bool = False): + if dtype: + assert dtype == "boolean" + values, mask = coerce_to_array(scalars, copy=copy) + return BooleanArray(values, mask) + + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + data = self._data.astype("int8") + data[self._mask] = -1 + return data, -1 + + @classmethod + def _from_factorized(cls, values, original: "BooleanArray"): + return cls._from_sequence(values, dtype=original.dtype) + + _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # For BooleanArray inputs, we apply the ufunc to ._data + # and mask the result. + if method == "reduce": + # Not clear how to handle missing values in reductions. Raise. 
+ raise NotImplementedError("The 'reduce' method is not supported.") + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (BooleanArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + mask = np.zeros(len(self), dtype=bool) + inputs2 = [] + for x in inputs: + if isinstance(x, BooleanArray): + mask |= x._mask + inputs2.append(x._data) + else: + inputs2.append(x) + + def reconstruct(x): + # we don't worry about scalar `x` here, since we + # raise for reduce up above. + + if is_bool_dtype(x.dtype): + m = mask.copy() + return BooleanArray(x, m) + else: + x[mask] = np.nan + return x + + result = getattr(ufunc, method)(*inputs2, **kwargs) + if isinstance(result, tuple): + tuple(reconstruct(x) for x in result) + else: + return reconstruct(result) + + def __setitem__(self, key, value): + _is_scalar = is_scalar(value) + if _is_scalar: + value = [value] + value, mask = coerce_to_array(value) + + if _is_scalar: + value = value[0] + mask = mask[0] + + self._data[key] = value + self._mask[key] = mask + + def astype(self, dtype, copy=True): + """ + Cast to a NumPy array or ExtensionArray with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray or ExtensionArray + NumPy ndarray, BooleanArray or IntergerArray with 'dtype' for its dtype. + + Raises + ------ + TypeError + if incompatible type with an BooleanDtype, equivalent of same_kind + casting + """ + dtype = pandas_dtype(dtype) + + if isinstance(dtype, BooleanDtype): + values, mask = coerce_to_array(self, copy=copy) + return BooleanArray(values, mask, copy=False) + + if is_bool_dtype(dtype): + # astype_nansafe converts np.nan to True + if self._hasna: + raise ValueError("cannot convert float NaN to bool") + else: + return self._data.astype(dtype, copy=copy) + if is_extension_array_dtype(dtype) and is_integer_dtype(dtype): + from pandas.core.arrays import IntegerArray + + return IntegerArray( + self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False + ) + # for integer, error if there are missing values + if is_integer_dtype(dtype): + if self._hasna: + raise ValueError("cannot convert NA to integer") + # for float dtype, ensure we use np.nan before casting (numpy cannot + # deal with pd.NA) + na_value = self._na_value + if is_float_dtype(dtype): + na_value = np.nan + # coerce + data = self.to_numpy(na_value=na_value) + return astype_nansafe(data, dtype, copy=False) + + def _values_for_argsort(self) -> np.ndarray: + """ + Return values for sorting. + + Returns + ------- + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort + """ + data = self._data.copy() + data[self._mask] = -1 + return data + + def any(self, skipna: bool = True, **kwargs): + """ + Return whether any element is True. + + Returns False unless there is at least one element that is True. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. 
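# Editor's aside, not part of the patch: a sketch of the astype paths implemented
# above, assuming a pandas build with this change.
import pandas as pd

arr = pd.array([True, False, None], dtype="boolean")
arr.astype("Int64")    # nullable integers, NA preserved: [1, 0, <NA>]
arr.astype("float64")  # plain ndarray, NA becomes NaN: array([ 1.,  0., nan])
# arr.astype("bool") raises ValueError because of the missing value.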
+ + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be False, as for an empty array. + If `skipna` is False, the result will still be True if there is + at least one element that is True, otherwise NA will be returned + if there are NA's present. + **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.any : Numpy version of this method. + BooleanArray.all : Return whether all elements are True. + + Examples + -------- + + The result indicates whether any element is True (and by default + skips NAs): + + >>> pd.array([True, False, True]).any() + True + >>> pd.array([True, False, pd.NA]).any() + True + >>> pd.array([False, False, pd.NA]).any() + False + >>> pd.array([], dtype="boolean").any() + False + >>> pd.array([pd.NA], dtype="boolean").any() + False + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, False, pd.NA]).any(skipna=False) + True + >>> pd.array([False, False, pd.NA]).any(skipna=False) + + """ + kwargs.pop("axis", None) + nv.validate_any((), kwargs) + + values = self._data.copy() + np.putmask(values, self._mask, False) + result = values.any() + if skipna: + return result + else: + if result or len(self) == 0: + return result + else: + return self.dtype.na_value + + def all(self, skipna: bool = True, **kwargs): + """ + Return whether all elements are True. + + Returns True unless there is at least one element that is False. + By default, NAs are skipped. If ``skipna=False`` is specified and + missing values are present, similar :ref:`Kleene logic ` + is used as for logical operations. + + Parameters + ---------- + skipna : bool, default True + Exclude NA values. If the entire array is NA and `skipna` is + True, then the result will be True, as for an empty array. + If `skipna` is False, the result will still be False if there is + at least one element that is False, otherwise NA will be returned + if there are NA's present. + **kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with NumPy. + + Returns + ------- + bool or :attr:`pandas.NA` + + See Also + -------- + numpy.all : Numpy version of this method. + BooleanArray.any : Return whether any element is True. 
+ + Examples + -------- + + The result indicates whether any element is True (and by default + skips NAs): + + >>> pd.array([True, True, pd.NA]).all() + True + >>> pd.array([True, False, pd.NA]).all() + False + >>> pd.array([], dtype="boolean").all() + True + >>> pd.array([pd.NA], dtype="boolean").all() + True + + With ``skipna=False``, the result can be NA if this is logically + required (whether ``pd.NA`` is True or False influences the result): + + >>> pd.array([True, True, pd.NA]).all(skipna=False) + + >>> pd.array([True, False, pd.NA]).all(skipna=False) + False + """ + kwargs.pop("axis", None) + nv.validate_all((), kwargs) + + values = self._data.copy() + np.putmask(values, self._mask, True) + result = values.all() + + if skipna: + return result + else: + if not result or len(self) == 0: + return result + else: + return self.dtype.na_value + + @classmethod + def _create_logical_method(cls, op): + def logical_method(self, other): + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. + return NotImplemented + + assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} + other = lib.item_from_zerodim(other) + other_is_booleanarray = isinstance(other, BooleanArray) + other_is_scalar = lib.is_scalar(other) + mask = None + + if other_is_booleanarray: + other, mask = other._data, other._mask + elif is_list_like(other): + other = np.asarray(other, dtype="bool") + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + other, mask = coerce_to_array(other, copy=False) + elif isinstance(other, np.bool_): + other = other.item() + + if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): + raise TypeError( + "'other' should be pandas.NA or a bool. " + f"Got {type(other).__name__} instead." + ) + + if not other_is_scalar and len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if op.__name__ in {"or_", "ror_"}: + result, mask = ops.kleene_or(self._data, other, self._mask, mask) + elif op.__name__ in {"and_", "rand_"}: + result, mask = ops.kleene_and(self._data, other, self._mask, mask) + elif op.__name__ in {"xor", "rxor"}: + result, mask = ops.kleene_xor(self._data, other, self._mask, mask) + + return BooleanArray(result, mask) + + name = f"__{op.__name__}__" + return set_function_name(logical_method, name, cls) + + @classmethod + def _create_comparison_method(cls, op): + def cmp_method(self, other): + from pandas.arrays import IntegerArray + + if isinstance( + other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) + ): + # Rely on pandas to unbox and dispatch to us. 
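# Editor's aside, not part of the patch: the logical methods wired up above follow
# Kleene (three-valued) logic, so NA only propagates when the result is genuinely
# unknown. Sketch assuming a pandas build with this change.
import pandas as pd

arr = pd.array([True, False, None], dtype="boolean")
arr | pd.NA  # -> [True, <NA>, <NA>]   (True | unknown is True)
arr & pd.NA  # -> [<NA>, False, <NA>]  (False & unknown is False)
arr ^ pd.NA  # -> [<NA>, <NA>, <NA>]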
+ return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + result = np.zeros_like(self._data) + mask = np.ones_like(self._data) + else: + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) + + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask + + return BooleanArray(result, mask, copy=False) + + name = f"__{op.__name__}" + return set_function_name(cmp_method, name, cls) + + def _reduce(self, name, skipna=True, **kwargs): + + if name in {"any", "all"}: + return getattr(self, name)(skipna=skipna, **kwargs) + + data = self._data + mask = self._mask + + # coerce to a nan-aware float if needed + if mask.any(): + data = self._data.astype("float64") + data[mask] = np.nan + + op = getattr(nanops, "nan" + name) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + + # if we have numeric op that would result in an int, coerce to int if possible + if name in ["sum", "prod"] and notna(result): + int_result = np.int64(result) + if int_result == result: + result = int_result + + elif name in ["min", "max"] and notna(result): + result = np.bool_(result) + + return result + + def _maybe_mask_result(self, result, mask, other, op_name): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + if (is_float_dtype(other) or is_float(other)) or ( + op_name in ["rtruediv", "truediv"] + ): + result[mask] = np.nan + return result + + if is_bool_dtype(result): + return BooleanArray(result, mask, copy=False) + + elif is_integer_dtype(result): + from pandas.core.arrays import IntegerArray + + return IntegerArray(result, mask, copy=False) + else: + result[mask] = np.nan + return result + + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + def boolean_arithmetic_method(self, other): + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. 
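# Editor's aside, not part of the patch: comparisons built by cmp_method above
# return a BooleanArray and propagate missing values through the mask. Sketch
# assuming a pandas build with this change.
import pandas as pd

arr = pd.array([True, False, None], dtype="boolean")
arr == [True, True, False]  # -> [True, False, <NA>]
arr == pd.NA                # -> [<NA>, <NA>, <NA>]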
+ return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + + # nans propagate + if mask is None: + mask = self._mask + else: + mask = self._mask | mask + + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = f"__{op_name}__" + return set_function_name(boolean_arithmetic_method, name, cls) + + +BooleanArray._add_logical_ops() +BooleanArray._add_comparison_ops() +BooleanArray._add_arithmetic_ops() diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c50870563df28..2806635211459 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,14 +1,14 @@ import operator from shutil import get_terminal_size -import textwrap -from typing import Type, Union, cast +from typing import Dict, Hashable, List, Type, Union, cast from warnings import warn import numpy as np from pandas._config import get_option -from pandas._libs import algos as libalgos, hashtable as htable, lib +from pandas._libs import algos as libalgos, hashtable as htable +from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -25,11 +25,9 @@ ensure_platform_int, is_categorical_dtype, is_datetime64_dtype, - is_datetimelike, is_dict_like, is_dtype_equal, is_extension_array_dtype, - is_float_dtype, is_integer_dtype, is_iterator, is_list_like, @@ -37,53 +35,38 @@ is_scalar, is_sequence, is_timedelta64_dtype, + needs_i8_conversion, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_array_like, is_hashable from pandas.core.dtypes.missing import isna, notna -from pandas._typing import ArrayLike, Dtype, Ordered from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d +from pandas.core.arrays.base import ( + ExtensionArray, + _extension_array_shared_docs, + try_cast_to_ea, +) from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array +from pandas.core.indexers import check_bool_array_indexer from pandas.core.missing import interpolate_2d +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort from pandas.io.formats import console -from .base import ExtensionArray, _extension_array_shared_docs - -_take_msg = textwrap.dedent( - """\ - Interpreting negative values in 'indexer' as missing values. - In the future, this will change to meaning positional indices - from the right. 
- - Use 'allow_fill=True' to retain the previous behavior and silence this - warning. - - Use 'allow_fill=False' to accept the new behavior.""" -) - def _cat_compare_op(op): - opname = "__{op}__".format(op=op.__name__) - - def f(self, other): - # On python2, you can usually compare any type to any type, and - # Categoricals can be seen as a custom type, but having different - # results depending whether categories are the same or not is kind of - # insane, so be a bit stricter here and use the python3 idea of - # comparing only things of equal type. - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - - other = lib.item_from_zerodim(other) + opname = f"__{op.__name__}__" + + @unpack_zerodim_and_defer(opname) + def func(self, other): if is_list_like(other) and len(other) != len(self): # TODO: Could this fail if the categories are listlike objects? raise ValueError("Lengths must match.") @@ -115,9 +98,9 @@ def f(self, other): else: other_codes = other._codes - mask = (self._codes == -1) | (other_codes == -1) f = getattr(self._codes, opname) ret = f(other_codes) + mask = (self._codes == -1) | (other_codes == -1) if mask.any(): # In other series, the leads to False, so do that here too ret[mask] = False @@ -128,21 +111,21 @@ def f(self, other): i = self.categories.get_loc(other) ret = getattr(self._codes, opname)(i) - # check for NaN in self - mask = self._codes == -1 - ret[mask] = False + if opname not in {"__eq__", "__ge__", "__gt__"}: + # check for NaN needed if we are not equal or larger + mask = self._codes == -1 + ret[mask] = False return ret else: if opname == "__eq__": - return np.repeat(False, len(self)) + return np.zeros(len(self), dtype=bool) elif opname == "__ne__": - return np.repeat(True, len(self)) + return np.ones(len(self), dtype=bool) else: - msg = ( - "Cannot compare a Categorical for op {op} with a " + raise TypeError( + f"Cannot compare a Categorical for op {opname} with a " "scalar, which is not a category." ) - raise TypeError(msg.format(op=opname)) else: # allow categorical vs object dtype array comparisons for equality @@ -150,16 +133,15 @@ def f(self, other): if opname in ["__eq__", "__ne__"]: return getattr(np.array(self), opname)(np.array(other)) - msg = ( - "Cannot compare a Categorical for op {op} with type {typ}." - "\nIf you want to compare values, use 'np.asarray(cat) " - " other'." + raise TypeError( + f"Cannot compare a Categorical for op {opname} with " + f"type {type(other)}.\nIf you want to compare values, " + "use 'np.asarray(cat) other'." ) - raise TypeError(msg.format(op=opname, typ=type(other))) - f.__name__ = opname + func.__name__ = opname - return f + return func def contains(cat, key, container): @@ -254,7 +236,7 @@ class Categorical(ExtensionArray, PandasObject): `categories` attribute (which in turn is the `categories` argument, if provided). dtype : CategoricalDtype - An instance of ``CategoricalDtype`` to use for this categorical + An instance of ``CategoricalDtype`` to use for this categorical. .. versionadded:: 0.21.0 @@ -294,7 +276,7 @@ class Categorical(ExtensionArray, PandasObject): Notes ----- See the `user guide - `_ + `_ for more. 
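# Editor's aside, not part of the patch: with the comparison rewrite above, equality
# against a scalar that is not a category stays valid while ordering comparisons
# raise. Sketch assuming a pandas build with this change.
import pandas as pd

cat = pd.Categorical(["a", "b", "a"], ordered=True)
cat == "z"  # -> array([False, False, False])
cat != "z"  # -> array([ True,  True,  True])
# cat < "z" raises TypeError because "z" is not a category.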
Examples @@ -324,9 +306,7 @@ class Categorical(ExtensionArray, PandasObject): __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = PandasObject._deprecations | frozenset( - ["tolist", "itemsize", "get_values"] - ) + _deprecations = PandasObject._deprecations | frozenset(["tolist"]) _typ = "categorical" def __init__( @@ -352,7 +332,7 @@ def __init__( # sanitize input if is_categorical_dtype(values): if dtype.categories is None: - dtype = CategoricalDtype(values.categories, dtype._ordered) + dtype = CategoricalDtype(values.categories, dtype.ordered) elif not isinstance(values, (ABCIndexClass, ABCSeries)): # sanitize_array coerces np.nan to a string under certain versions # of numpy @@ -375,7 +355,7 @@ def __init__( codes, categories = factorize(values, sort=True) except TypeError: codes, categories = factorize(values, sort=False) - if dtype._ordered: + if dtype.ordered: # raise, as we don't have a sortable data structure and so # the user should give us one by specifying categories raise TypeError( @@ -391,7 +371,7 @@ def __init__( ) # we're inferring from values - dtype = CategoricalDtype(categories, dtype._ordered) + dtype = CategoricalDtype(categories, dtype.ordered) elif is_categorical_dtype(values): old_codes = ( @@ -461,7 +441,7 @@ def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. """ - return self.dtype._ordered + return self.dtype.ordered @property def dtype(self) -> CategoricalDtype: @@ -518,14 +498,13 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: if is_extension_array_dtype(dtype): return array(self, dtype=dtype, copy=copy) # type: ignore # GH 28770 if is_integer_dtype(dtype) and self.isna().any(): - msg = "Cannot convert float NaN to integer" - raise ValueError(msg) + raise ValueError("Cannot convert float NaN to integer") return np.array(self, dtype=dtype, copy=copy) @cache_readonly def size(self) -> int: """ - return the len of myself + Return the len of myself. """ return self._codes.size @@ -536,7 +515,7 @@ def itemsize(self) -> int: """ return self.categories.itemsize - def tolist(self) -> list: + def tolist(self) -> List[Scalar]: """ Return a list of the values. 
@@ -548,13 +527,6 @@ def tolist(self) -> list: to_list = tolist - @property - def base(self) -> None: - """ - compat, we are always our own object - """ - return None - @classmethod def _from_inferred_categories( cls, inferred_categories, inferred_codes, dtype, true_values=None @@ -673,23 +645,8 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): raise ValueError(msg) codes = np.asarray(codes) # #21767 - if not is_integer_dtype(codes): - msg = "codes need to be array-like integers" - if is_float_dtype(codes): - icodes = codes.astype("i8") - if (icodes == codes).all(): - msg = None - codes = icodes - warn( - ( - "float codes will be disallowed in the future and " - "raise a ValueError" - ), - FutureWarning, - stacklevel=2, - ) - if msg: - raise ValueError(msg) + if len(codes) and not is_integer_dtype(codes): + raise ValueError("codes need to be array-like integers") if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and len(categories)-1") @@ -843,8 +800,8 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal On the other hand this methods does not do checks (e.g., whether the old categories are included in the new categories on a reorder), which can result in surprising changes, for example when using special string - dtypes on python3, which does not considers a S1 string equal to a - single char python string. + dtypes, which does not considers a S1 string equal to a single char + python string. Parameters ---------- @@ -879,7 +836,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal """ inplace = validate_bool_kwarg(inplace, "inplace") if ordered is None: - ordered = self.dtype._ordered + ordered = self.dtype.ordered new_dtype = CategoricalDtype(new_categories, ordered=ordered) cat = self if inplace else self.copy() @@ -1061,11 +1018,9 @@ def add_categories(self, new_categories, inplace=False): new_categories = [new_categories] already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: - msg = ( - "new categories must not include old categories: " - "{already_included!s}" + raise ValueError( + f"new categories must not include old categories: {already_included}" ) - raise ValueError(msg.format(already_included=already_included)) new_categories = list(self.dtype.categories) + list(new_categories) new_dtype = CategoricalDtype(new_categories, self.ordered) @@ -1111,7 +1066,7 @@ def remove_categories(self, removals, inplace=False): if not is_list_like(removals): removals = [removals] - removal_set = set(list(removals)) + removal_set = set(removals) not_included = removal_set - set(self.dtype.categories) new_categories = [c for c in self.dtype.categories if c not in removal_set] @@ -1121,8 +1076,7 @@ def remove_categories(self, removals, inplace=False): new_categories = [x for x in new_categories if notna(x)] if len(not_included) != 0: - msg = "removals must all be in old categories: {not_included!s}" - raise ValueError(msg.format(not_included=not_included)) + raise ValueError(f"removals must all be in old categories: {not_included}") return self.set_categories( new_categories, ordered=self.ordered, rename=False, inplace=inplace @@ -1300,9 +1254,8 @@ def shift(self, periods, fill_value=None): fill_value = self.categories.get_loc(fill_value) else: raise ValueError( - "'fill_value={}' is not present " - "in this Categorical's " - "categories".format(fill_value) + f"'fill_value={fill_value}' is not 
present " + "in this Categorical's categories" ) if periods > 0: codes[:periods] = fill_value @@ -1311,7 +1264,7 @@ def shift(self, periods, fill_value=None): return self.from_codes(codes, dtype=self.dtype) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ The numpy array interface. @@ -1343,8 +1296,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for all other cases, raise for now (similarly as what happens in # Series.__array_prepare__) raise TypeError( - "Object with dtype {dtype} cannot perform " - "the numpy op {op}".format(dtype=self.dtype, op=ufunc.__name__) + f"Object with dtype {self.dtype} cannot perform " + f"the numpy op {ufunc.__name__}" ) def __setstate__(self, state): @@ -1509,31 +1462,20 @@ def value_counts(self, dropna=True): return Series(count, index=CategoricalIndex(ix), dtype="int64") - def get_values(self): + def _internal_get_values(self): """ Return the values. - .. deprecated:: 0.25.0 - For internal compatibility with pandas formatting. Returns ------- - numpy.array + np.ndarray or Index A numpy array of the same dtype as categorical.categories.dtype or Index if datetime / periods. """ - warn( - "The 'get_values' method is deprecated and will be removed in a " - "future version", - FutureWarning, - stacklevel=2, - ) - return self._internal_get_values() - - def _internal_get_values(self): # if we are a datetime and period index, return Index to keep metadata - if is_datetimelike(self.categories): + if needs_i8_conversion(self.categories): return self.categories.take(self._codes, fill_value=np.nan) elif is_integer_dtype(self.categories) and -1 in self._codes: return self.categories.astype("object").take(self._codes, fill_value=np.nan) @@ -1543,9 +1485,9 @@ def check_for_ordered(self, op): """ assert that we are ordered """ if not self.ordered: raise TypeError( - "Categorical is not ordered for operation {op}\n" + f"Categorical is not ordered for operation {op}\n" "you can use .as_ordered() to change the " - "Categorical to an ordered one\n".format(op=op) + "Categorical to an ordered one\n" ) def _values_for_argsort(self): @@ -1680,8 +1622,7 @@ def sort_values(self, inplace=False, ascending=True, na_position="last"): """ inplace = validate_bool_kwarg(inplace, "inplace") if na_position not in ["last", "first"]: - msg = "invalid na_position: {na_position!r}" - raise ValueError(msg.format(na_position=na_position)) + raise ValueError(f"invalid na_position: {repr(na_position)}") sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) @@ -1722,24 +1663,6 @@ def _values_for_rank(self): ) return values - def ravel(self, order="C"): - """ - Return a flattened (numpy) array. - - For internal compatibility with numpy arrays. - - Returns - ------- - numpy.array - """ - warn( - "Categorical.ravel will return a Categorical object instead " - "of an ndarray in a future version.", - FutureWarning, - stacklevel=2, - ) - return np.array(self) - def view(self, dtype=None): if dtype is not None: raise NotImplementedError(dtype) @@ -1757,7 +1680,6 @@ def to_dense(self): """ return np.asarray(self) - @deprecate_kwarg(old_arg_name="fill_value", new_arg_name="value") def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. 
@@ -1836,14 +1758,13 @@ def fillna(self, value=None, method=None, limit=None): else: raise TypeError( - '"value" parameter must be a scalar, dict ' - "or Series, but you passed a " - '"{0}"'.format(type(value).__name__) + f"'value' parameter must be a scalar, dict " + f"or Series, but you passed a {type(value).__name__}" ) return self._constructor(codes, dtype=self.dtype, fastpath=True) - def take_nd(self, indexer, allow_fill=None, fill_value=None): + def take(self, indexer, allow_fill: bool = False, fill_value=None): """ Take elements from the Categorical. @@ -1852,7 +1773,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): indexer : sequence of int The indices in `self` to take. The meaning of negative values in `indexer` depends on the value of `allow_fill`. - allow_fill : bool, default None + allow_fill : bool, default False How to handle negative values in `indexer`. * False: negative values in `indices` indicate positional indices @@ -1863,11 +1784,9 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): (the default). These values are set to `fill_value`. Any other other negative values raise a ``ValueError``. - .. versionchanged:: 0.23.0 + .. versionchanged:: 1.0.0 - Deprecated the default value of `allow_fill`. The deprecated - default is ``True``. In the future, this will change to - ``False``. + Default value changed from ``True`` to ``False``. fill_value : object The value to use for `indices` that are missing (-1), when @@ -1917,10 +1836,6 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): will raise a ``TypeError``. """ indexer = np.asarray(indexer, dtype=np.intp) - if allow_fill is None: - if (indexer < 0).any(): - warn(_take_msg, FutureWarning, stacklevel=2) - allow_fill = True dtype = self.dtype @@ -1931,16 +1846,26 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): if fill_value in self.categories: fill_value = self.categories.get_loc(fill_value) else: - msg = "'fill_value' ('{}') is not in this Categorical's categories." - raise TypeError(msg.format(fill_value)) + msg = ( + f"'fill_value' ('{fill_value}') is not in this " + "Categorical's categories." + ) + raise TypeError(msg) codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) result = type(self).from_codes(codes, dtype=dtype) return result - take = take_nd + def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): + # GH#27745 deprecate alias that other EAs dont have + warn( + "Categorical.take_nd is deprecated, use Categorical.take instead", + FutureWarning, + stacklevel=2, + ) + return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value) - def __len__(self): + def __len__(self) -> int: """ The length of this Categorical. """ @@ -1952,7 +1877,7 @@ def __iter__(self): """ return iter(self._internal_get_values().tolist()) - def __contains__(self, key): + def __contains__(self, key) -> bool: """ Returns True if `key` is in this Categorical. 
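# Editor's aside, not part of the patch: after the change above, negative indices in
# Categorical.take are plain positional indices; allow_fill=True restores the
# "-1 means missing" behaviour. Sketch assuming a pandas build with this change.
import pandas as pd

cat = pd.Categorical(["a", "b", "c"])
cat.take([0, -1])                                   # -> [a, c]
cat.take([0, -1], allow_fill=True)                  # -> [a, NaN]
cat.take([0, -1], allow_fill=True, fill_value="b")  # -> [a, b]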
""" @@ -1962,7 +1887,7 @@ def __contains__(self, key): return contains(self, key, container=self._codes) - def _tidy_repr(self, max_vals=10, footer=True): + def _tidy_repr(self, max_vals=10, footer=True) -> str: """ a short repr displaying only max_vals and an optional (but default footer) """ @@ -1970,11 +1895,9 @@ def _tidy_repr(self, max_vals=10, footer=True): head = self[:num]._get_repr(length=False, footer=False) tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) - result = "{head}, ..., {tail}".format(head=head[:-1], tail=tail[1:]) + result = f"{head[:-1]}, ..., {tail[1:]}" if footer: - result = "{result}\n{footer}".format( - result=result, footer=self._repr_footer() - ) + result = f"{result}\n{self._repr_footer()}" return str(result) @@ -2001,16 +1924,14 @@ def _repr_categories(self): category_strs = [x.strip() for x in category_strs] return category_strs - def _repr_categories_info(self): + def _repr_categories_info(self) -> str: """ Returns a string representation of the footer. """ category_strs = self._repr_categories() dtype = str(self.categories.dtype) - levheader = "Categories ({length}, {dtype}): ".format( - length=len(self.categories), dtype=dtype - ) + levheader = f"Categories ({len(self.categories)}, {dtype}): " width, height = get_terminal_size() max_width = get_option("display.width") or width if console.in_ipython_frontend(): @@ -2033,13 +1954,11 @@ def _repr_categories_info(self): # replace to simple save space by return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]" - def _repr_footer(self): - - return "Length: {length}\n{info}".format( - length=len(self), info=self._repr_categories_info() - ) + def _repr_footer(self) -> str: + info = self._repr_categories_info() + return f"Length: {len(self)}\n{info}" - def _get_repr(self, length=True, na_rep="NaN", footer=True): + def _get_repr(self, length=True, na_rep="NaN", footer=True) -> str: from pandas.io.formats import format as fmt formatter = fmt.CategoricalFormatter( @@ -2048,7 +1967,7 @@ def _get_repr(self, length=True, na_rep="NaN", footer=True): result = formatter.to_string() return str(result) - def __repr__(self): + def __repr__(self) -> str: """ String representation. """ @@ -2059,7 +1978,7 @@ def __repr__(self): result = self._get_repr(length=len(self) > _maxlen) else: msg = self._get_repr(length=False, footer=True).replace("\n", ", ") - result = "[], {repr_msg}".format(repr_msg=msg) + result = f"[], {msg}" return result @@ -2081,10 +2000,17 @@ def __getitem__(self, key): return np.nan else: return self.categories[i] - else: - return self._constructor( - values=self._codes[key], dtype=self.dtype, fastpath=True - ) + + if is_list_like(key) and not is_array_like(key): + key = np.asarray(key) + + if com.is_bool_indexer(key): + key = check_bool_array_indexer(self, key) + + result = self._codes[key] + if result.ndim > 1: + return result + return self._constructor(result, dtype=self.dtype, fastpath=True) def __setitem__(self, key, value): """ @@ -2152,7 +2078,7 @@ def __setitem__(self, key, value): lindexer = self._maybe_coerce_indexer(lindexer) self._codes[key] = lindexer - def _reverse_indexer(self): + def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ Compute the inverse of a categorical, returning a dict of categories -> indexers. 
@@ -2182,24 +2108,28 @@ def _reverse_indexer(self): self.codes.astype("int64"), categories.size ) counts = counts.cumsum() - result = (r[start:end] for start, end in zip(counts, counts[1:])) - result = dict(zip(categories, result)) + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + result = dict(zip(categories, _result)) return result # reduction ops # def _reduce(self, name, axis=0, **kwargs): func = getattr(self, name, None) if func is None: - msg = "Categorical cannot perform the operation {op}" - raise TypeError(msg.format(op=name)) + raise TypeError(f"Categorical cannot perform the operation {name}") return func(**kwargs) - def min(self, numeric_only=None, **kwargs): + @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") + def min(self, skipna=True): """ The minimum value of the object. Only ordered `Categoricals` have a minimum! + .. versionchanged:: 1.0.0 + + Returns an NA value on empty arrays + Raises ------ TypeError @@ -2210,22 +2140,31 @@ def min(self, numeric_only=None, **kwargs): min : the minimum of this `Categorical` """ self.check_for_ordered("min") - if numeric_only: - good = self._codes != -1 - pointer = self._codes[good].min(**kwargs) - else: - pointer = self._codes.min(**kwargs) - if pointer == -1: - return np.nan + + if not len(self._codes): + return self.dtype.na_value + + good = self._codes != -1 + if not good.all(): + if skipna: + pointer = self._codes[good].min() + else: + return np.nan else: - return self.categories[pointer] + pointer = self._codes.min() + return self.categories[pointer] - def max(self, numeric_only=None, **kwargs): + @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") + def max(self, skipna=True): """ The maximum value of the object. Only ordered `Categoricals` have a maximum! + .. 
versionchanged:: 1.0.0 + + Returns an NA value on empty arrays + Raises ------ TypeError @@ -2236,15 +2175,19 @@ def max(self, numeric_only=None, **kwargs): max : the maximum of this `Categorical` """ self.check_for_ordered("max") - if numeric_only: - good = self._codes != -1 - pointer = self._codes[good].max(**kwargs) - else: - pointer = self._codes.max(**kwargs) - if pointer == -1: - return np.nan + + if not len(self._codes): + return self.dtype.na_value + + good = self._codes != -1 + if not good.all(): + if skipna: + pointer = self._codes[good].max() + else: + return np.nan else: - return self.categories[pointer] + pointer = self._codes.max() + return self.categories[pointer] def mode(self, dropna=True): """ @@ -2459,11 +2402,10 @@ def isin(self, values): array([ True, False, True, False, True, False]) """ if not is_list_like(values): + values_type = type(values).__name__ raise TypeError( "only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]".format( - values_type=type(values).__name__ - ) + f" to isin(), you passed a [{values_type}]" ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) @@ -2471,6 +2413,51 @@ def isin(self, values): code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) + def replace(self, to_replace, value, inplace: bool = False): + """ + Replaces all instances of one value with another + + Parameters + ---------- + to_replace: object + The value to be replaced + + value: object + The value to replace it with + + inplace: bool + Whether the operation is done in-place + + Returns + ------- + None if inplace is True, otherwise the new Categorical after replacement + + + Examples + -------- + >>> s = pd.Categorical([1, 2, 1, 3]) + >>> s.replace(1, 3) + [3, 3, 2, 3] + Categories (2, int64): [2, 3] + """ + inplace = validate_bool_kwarg(inplace, "inplace") + cat = self if inplace else self.copy() + if to_replace in cat.categories: + if isna(value): + cat.remove_categories(to_replace, inplace=True) + else: + categories = cat.categories.tolist() + index = categories.index(to_replace) + if value in cat.categories: + value_index = categories.index(value) + cat._codes[cat._codes == index] = value_index + cat.remove_categories(to_replace, inplace=True) + else: + categories[index] = value + cat.rename_categories(categories, inplace=True) + if not inplace: + return cat + # The Series.cat accessor @@ -2557,43 +2544,6 @@ def _delegate_method(self, name, *args, **kwargs): if res is not None: return Series(res, index=self._index, name=self._name) - @property - def categorical(self): - # Note: Upon deprecation, `test_tab_completion_with_categorical` will - # need to be updated. `categorical` will need to be removed from - # `ok_for_cat`. - warn( - "`Series.cat.categorical` has been deprecated. Use the " - "attributes on 'Series.cat' directly instead.", - FutureWarning, - stacklevel=2, - ) - return self._parent - - @property - def name(self): - # Note: Upon deprecation, `test_tab_completion_with_categorical` will - # need to be updated. `name` will need to be removed from - # `ok_for_cat`. - warn( - "`Series.cat.name` has been deprecated. Use `Series.name` instead.", - FutureWarning, - stacklevel=2, - ) - return self._name - - @property - def index(self): - # Note: Upon deprecation, `test_tab_completion_with_categorical` will - # need to be updated. `index` will need to be removed from - # ok_for_cat`. - warn( - "`Series.cat.index` has been deprecated. 
Use `Series.index` instead.", - FutureWarning, - stacklevel=2, - ) - return self._index - # utility routines @@ -2613,10 +2563,10 @@ def _get_codes_for_values(values, categories): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. # Categorical(array[Period, Period], categories=PeriodIndex(...)) - try: - values = categories.dtype.construct_array_type()._from_sequence(values) - except Exception: - # but that may fail for any reason, so fall back to object + cls = categories.dtype.construct_array_type() + values = try_cast_to_ea(cls, values) + if not isinstance(values, cls): + # exception raised in _from_sequence values = ensure_object(values) categories = ensure_object(categories) else: @@ -2678,7 +2628,7 @@ def _convert_to_list_like(list_like): return [list_like] -def _factorize_from_iterable(values): +def factorize_from_iterable(values): """ Factorize an input `values` into `categories` and `codes`. Preserves categorical dtype in `categories`. @@ -2716,9 +2666,9 @@ def _factorize_from_iterable(values): return codes, categories -def _factorize_from_iterables(iterables): +def factorize_from_iterables(iterables): """ - A higher-level wrapper over `_factorize_from_iterable`. + A higher-level wrapper over `factorize_from_iterable`. *This is an internal function* @@ -2733,9 +2683,9 @@ def _factorize_from_iterables(iterables): Notes ----- - See `_factorize_from_iterable` for more info. + See `factorize_from_iterable` for more info. """ if len(iterables) == 0: # For consistency, it should return a list of 2 lists. return [[], []] - return map(list, zip(*(_factorize_from_iterable(it) for it in iterables))) + return map(list, zip(*(factorize_from_iterable(it) for it in iterables))) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4b83dd0cfff09..d7cabbabddf95 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -6,10 +6,12 @@ import numpy as np from pandas._libs import NaT, NaTType, Timestamp, algos, iNaT, lib -from pandas._libs.tslibs.c_timestamp import maybe_integer_op_deprecated +from pandas._libs.tslibs.c_timestamp import integer_op_not_supported from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 +from pandas._typing import DatetimeLikeScalar +from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning from pandas.util._decorators import Appender, Substitution @@ -26,36 +28,105 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_offsetlike, is_period_dtype, is_string_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCPeriodArray, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna -from pandas._typing import DatetimeLikeScalar -from pandas.core import missing, nanops +from pandas.core import missing, nanops, ops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com -from pandas.core.ops.invalid import make_invalid_op 
+from pandas.core.indexers import check_bool_array_indexer +from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.ops.invalid import invalid_comparison, make_invalid_op from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick -from .base import ExtensionArray, ExtensionOpsMixin + +def _datetimelike_array_cmp(cls, op): + """ + Wrap comparison operations to convert Timestamp/Timedelta/Period-like to + boxed scalars/arrays. + """ + opname = f"__{op.__name__}__" + nat_result = opname == "__ne__" + + @unpack_zerodim_and_defer(opname) + def wrapper(self, other): + + if isinstance(other, str): + try: + # GH#18435 strings get a pass from tzawareness compat + other = self._scalar_from_string(other) + except ValueError: + # failed to parse as Timestamp/Timedelta/Period + return invalid_comparison(self, other, op) + + if isinstance(other, self._recognized_scalars) or other is NaT: + other = self._scalar_type(other) + self._check_compatible_with(other) + + other_i8 = self._unbox_scalar(other) + + result = op(self.view("i8"), other_i8) + if isna(other): + result.fill(nat_result) + + elif not is_list_like(other): + return invalid_comparison(self, other, op) + + elif len(other) != len(self): + raise ValueError("Lengths must match") + + else: + if isinstance(other, list): + # TODO: could use pd.Index to do inference? + other = np.array(other) + + if not isinstance(other, (np.ndarray, type(self))): + return invalid_comparison(self, other, op) + + if is_object_dtype(other): + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would fail to raise when + # comparing tz-aware and tz-naive + with np.errstate(all="ignore"): + result = ops.comp_method_OBJECT_ARRAY( + op, self.astype(object), other + ) + o_mask = isna(other) + + elif not type(self)._is_recognized_dtype(other.dtype): + return invalid_comparison(self, other, op) + + else: + # For PeriodDType this casting is unnecessary + other = type(self)._from_sequence(other) + self._check_compatible_with(other) + + result = op(self.view("i8"), other.view("i8")) + o_mask = other._isnan + + if o_mask.any(): + result[o_mask] = nat_result + + if self._hasnans: + result[self._isnan] = nat_result + + return result + + return set_function_name(wrapper, opname, cls) class AttributesMixin: - _data = None # type: np.ndarray + _data: np.ndarray @classmethod def _simple_new(cls, values, **kwargs): @@ -113,7 +184,7 @@ def _unbox_scalar(self, value: Union[Period, Timestamp, Timedelta, NaTType]) -> raise AbstractMethodError(self) def _check_compatible_with( - self, other: Union[Period, Timestamp, Timedelta, NaTType] + self, other: Union[Period, Timestamp, Timedelta, NaTType], setitem: bool = False ) -> None: """ Verify that `self` and `other` are compatible. @@ -127,6 +198,9 @@ def _check_compatible_with( Parameters ---------- other + setitem : bool, default False + For __setitem__ we may have stricter compatibility restrictions than + for comparisons.
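The `_datetimelike_array_cmp` wrapper defined above replaces the per-subclass comparison helpers (the DatetimeArray version is deleted later in this diff). A hedged sketch of the semantics it encodes (NaT slots filled with the `nat_result`, unparseable strings routed through `invalid_comparison`); printed values are indicative only:

    import pandas as pd

    arr = pd.array(["2020-01-01", "NaT", "2020-01-03"], dtype="datetime64[ns]")
    ts = pd.Timestamp("2020-01-02")

    print(arr < ts)   # [ True False False] -- the NaT slot is filled with False
    print(arr != ts)  # [ True  True  True] -- __ne__ fills True instead

    # A string that cannot be parsed falls back to invalid_comparison:
    # __eq__ returns all-False rather than raising.
    print(arr == "not a timestamp")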
Raises ------ @@ -179,7 +253,8 @@ def strftime(self, date_format): 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - return self._format_native_types(date_format=date_format).astype(object) + result = self._format_native_types(date_format=date_format, na_rep=np.nan) + return result.astype(object) class TimelikeOps: @@ -292,16 +367,19 @@ class TimelikeOps: def _round(self, freq, mode, ambiguous, nonexistent): # round the local times - values = _ensure_datetimelike_to_i8(self) + if is_datetime64tz_dtype(self): + # operate on naive timestamps, then convert back to aware + naive = self.tz_localize(None) + result = naive._round(freq, mode, ambiguous, nonexistent) + aware = result.tz_localize( + self.tz, ambiguous=ambiguous, nonexistent=nonexistent + ) + return aware + + values = self.view("i8") result = round_nsint64(values, mode, freq) result = self._maybe_mask_results(result, fill_value=NaT) - - dtype = self.dtype - if is_datetime64tz_dtype(self): - dtype = None - return self._ensure_localized( - self._simple_new(result, dtype=dtype), ambiguous, nonexistent - ) + return self._simple_new(result, dtype=self.dtype) @Appender((_round_doc + _round_example).format(op="round")) def round(self, freq, ambiguous="raise", nonexistent="raise"): @@ -328,6 +406,24 @@ class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray) _generate_range """ + @property + def ndim(self) -> int: + return self._data.ndim + + @property + def shape(self): + return self._data.shape + + def reshape(self, *args, **kwargs): + # Note: we drop any freq + data = self._data.reshape(*args, **kwargs) + return type(self)(data, dtype=self.dtype) + + def ravel(self, *args, **kwargs): + # Note: we drop any freq + data = self._data.ravel(*args, **kwargs) + return type(self)(data, dtype=self.dtype) + @property def _box_func(self): """ @@ -385,7 +481,7 @@ def _formatter(self, boxed=False): def nbytes(self): return self._data.nbytes - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): return np.array(list(self), dtype=object) @@ -396,7 +492,7 @@ def size(self) -> int: """The number of elements in this array.""" return np.prod(self.shape) - def __len__(self): + def __len__(self) -> int: return len(self._data) def __getitem__(self, key): @@ -416,10 +512,13 @@ def __getitem__(self, key): getitem = self._data.__getitem__ if is_int: val = getitem(key) - return self._box_func(val) + if lib.is_scalar(val): + # i.e. self.ndim == 1 + return self._box_func(val) + return type(self)(val, dtype=self.dtype) if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + key = check_bool_array_indexer(self, key) if key.all(): key = slice(0, None, None) else: @@ -444,8 +543,6 @@ def __getitem__(self, key): if result.ndim > 1: # To support MPL which performs slicing with 2 dim # even though it only has 1 dim by definition - if is_period: - return self._simple_new(result, dtype=self.dtype, freq=freq) return result return self._simple_new(result, dtype=self.dtype, freq=freq) @@ -473,29 +570,28 @@ def __setitem__( key = cast(Sequence, key) if len(key) != len(value) and not com.is_bool_indexer(key): msg = ( - "shape mismatch: value array of length '{}' does " - "not match indexing result of length '{}'." + f"shape mismatch: value array of length '{len(key)}' " + "does not match indexing result of length " + f"'{len(value)}'." 
) - raise ValueError(msg.format(len(key), len(value))) + raise ValueError(msg) elif not len(key): return value = type(self)._from_sequence(value, dtype=self.dtype) - self._check_compatible_with(value) + self._check_compatible_with(value, setitem=True) value = value.asi8 elif isinstance(value, self._scalar_type): - self._check_compatible_with(value) + self._check_compatible_with(value, setitem=True) value = self._unbox_scalar(value) elif is_valid_nat_for_dtype(value, self.dtype): value = iNaT else: msg = ( - "'value' should be a '{scalar}', 'NaT', or array of those. " - "Got '{typ}' instead." - ) - raise TypeError( - msg.format(scalar=self._scalar_type.__name__, typ=type(value).__name__) + f"'value' should be a '{self._scalar_type.__name__}', 'NaT', " + f"or array of those. Got '{type(value).__name__}' instead." ) + raise TypeError(msg) self._data[key] = value self._maybe_clear_freq() @@ -535,8 +631,8 @@ def astype(self, dtype, copy=True): ) or is_float_dtype(dtype): # disallow conversion between datetime/timedelta, # and conversions for any datetimelike to float - msg = "Cannot cast {name} to dtype {dtype}" - raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + msg = f"Cannot cast {type(self).__name__} to dtype {dtype}" + raise TypeError(msg) elif is_categorical_dtype(dtype): return Categorical(self, dtype=dtype) else: @@ -571,7 +667,17 @@ def _validate_fill_value(self, fill_value): ------ ValueError """ - raise AbstractMethodError(self) + if isna(fill_value): + fill_value = iNaT + elif isinstance(fill_value, self._recognized_scalars): + self._check_compatible_with(fill_value) + fill_value = self._scalar_type(fill_value) + fill_value = self._unbox_scalar(fill_value) + else: + raise ValueError( + f"'fill_value' should be a {self._scalar_type}. Got '{fill_value}'." + ) + return fill_value def take(self, indices, allow_fill=False, fill_value=None): if allow_fill: @@ -640,9 +746,7 @@ def searchsorted(self, value, side="left", sorter=None): value = self._scalar_from_string(value) if not (isinstance(value, (self._scalar_type, type(self))) or isna(value)): - raise ValueError( - "Unexpected type for 'value': {valtype}".format(valtype=type(value)) - ) + raise ValueError(f"Unexpected type for 'value': {type(value)}") self._check_compatible_with(value) if isinstance(value, type(self)): @@ -762,8 +866,8 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): raise ValueError( - "Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self)) + f"Length of 'value' does not match. Got ({len(value)}) " + f" expected {len(self)}" ) value = value[mask] @@ -829,6 +933,8 @@ def inferred_freq(self): generated by infer_freq. Returns None if it can't autodetect the frequency. """ + if self.ndim != 1: + return None try: return frequencies.infer_freq(self) except ValueError: @@ -883,10 +989,8 @@ def _validate_frequency(cls, index, freq, **kwargs): # raise a ValueError, which we re-raise with a more targeted # message. 
raise ValueError( - "Inferred frequency {infer} from passed values " - "does not conform to passed frequency {passed}".format( - infer=inferred, passed=freq.freqstr - ) + f"Inferred frequency {inferred} from passed values " + f"does not conform to passed frequency {freq.freqstr}" ) # monotonicity/uniqueness properties are called via frequencies.infer_freq, @@ -906,6 +1010,7 @@ def _is_unique(self): # ------------------------------------------------------------------ # Arithmetic Methods + _create_comparison_method = classmethod(_datetimelike_array_cmp) # pow is invalid for all three subclasses; TimedeltaArray will override # the multiplication and division ops @@ -923,29 +1028,21 @@ def _is_unique(self): __rdivmod__ = make_invalid_op("__rdivmod__") def _add_datetimelike_scalar(self, other): - # Overriden by TimedeltaArray - raise TypeError( - "cannot add {cls} and {typ}".format( - cls=type(self).__name__, typ=type(other).__name__ - ) - ) + # Overridden by TimedeltaArray + raise TypeError(f"cannot add {type(self).__name__} and {type(other).__name__}") _add_datetime_arraylike = _add_datetimelike_scalar def _sub_datetimelike_scalar(self, other): # Overridden by DatetimeArray assert other is not NaT - raise TypeError( - "cannot subtract a datelike from a {cls}".format(cls=type(self).__name__) - ) + raise TypeError(f"cannot subtract a datelike from a {type(self).__name__}") _sub_datetime_arraylike = _sub_datetimelike_scalar def _sub_period(self, other): - # Overriden by PeriodArray - raise TypeError( - "cannot subtract Period from a {cls}".format(cls=type(self).__name__) - ) + # Overridden by PeriodArray + raise TypeError(f"cannot subtract Period from a {type(self).__name__}") def _add_offset(self, offset): raise AbstractMethodError(self) @@ -984,7 +1081,7 @@ def _add_timedeltalike_scalar(self, other): """ if isna(other): # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds - new_values = np.empty(len(self), dtype="i8") + new_values = np.empty(self.shape, dtype="i8") new_values[:] = iNaT return new_values @@ -1025,14 +1122,12 @@ def _add_nat(self): """ if is_period_dtype(self): raise TypeError( - "Cannot add {cls} and {typ}".format( - cls=type(self).__name__, typ=type(NaT).__name__ - ) + f"Cannot add {type(self).__name__} and {type(NaT).__name__}" ) # GH#19124 pd.NaT is treated like a timedelta for both timedelta # and datetime dtypes - result = np.zeros(len(self), dtype=np.int64) + result = np.zeros(self.shape, dtype=np.int64) result.fill(iNaT) return type(self)(result, dtype=self.dtype, freq=None) @@ -1046,7 +1141,7 @@ def _sub_nat(self): # For datetime64 dtypes by convention we treat NaT as a datetime, so # this subtraction returns a timedelta64 dtype. # For period dtype, timedelta64 is a close-enough return dtype. 
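The shared `_validate_fill_value` fallback introduced a little above (the DatetimeArray-specific copy is removed further down in this diff) accepts NA-like values and recognized scalars and rejects everything else. A rough sketch through the public `take` path, assuming a build with this patch; the exact error wording comes from the new method and outputs are indicative:

    import pandas as pd

    arr = pd.date_range("2020-01-01", periods=3).array  # DatetimeArray

    # A recognized scalar (Timestamp/datetime/np.datetime64) is unboxed and used.
    print(arr.take([0, -1], allow_fill=True, fill_value=pd.Timestamp("1999-12-31")))

    # NA-like fill values are mapped to iNaT, i.e. NaT in the result.
    print(arr.take([0, -1], allow_fill=True, fill_value=pd.NaT))

    # Anything else is rejected by the shared validator.
    try:
        arr.take([0, -1], allow_fill=True, fill_value=1)
    except ValueError as err:
        print(err)  # 'fill_value' should be a <class ...Timestamp>. Got '1'.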
- result = np.zeros(len(self), dtype=np.int64) + result = np.zeros(self.shape, dtype=np.int64) result.fill(iNaT) return result.view("timedelta64[ns]") @@ -1067,13 +1162,9 @@ def _sub_period_array(self, other): """ if not is_period_dtype(self): raise TypeError( - "cannot subtract {dtype}-dtype from {cls}".format( - dtype=other.dtype, cls=type(self).__name__ - ) + f"cannot subtract {other.dtype}-dtype from {type(self).__name__}" ) - if len(self) != len(other): - raise ValueError("cannot subtract arrays/indices of unequal length") if self.freq != other.freq: msg = DIFFERENT_FREQ.format( cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr @@ -1090,47 +1181,13 @@ def _sub_period_array(self, other): new_values[mask] = NaT return new_values - def _addsub_int_array(self, other, op): - """ - Add or subtract array-like of integers equivalent to applying - `_time_shift` pointwise. - - Parameters - ---------- - other : Index, ExtensionArray, np.ndarray - integer-dtype - op : {operator.add, operator.sub} - - Returns - ------- - result : same class as self - """ - # _addsub_int_array is overriden by PeriodArray - assert not is_period_dtype(self) - assert op in [operator.add, operator.sub] - - if self.freq is None: - # GH#19123 - raise NullFrequencyError("Cannot shift with no freq") - - elif isinstance(self.freq, Tick): - # easy case where we can convert to timedelta64 operation - td = Timedelta(self.freq) - return op(self, td * other) - - # We should only get here with DatetimeIndex; dispatch - # to _addsub_offset_array - assert not is_timedelta64_dtype(self) - return op(self, np.array(other) * self.freq) - - def _addsub_offset_array(self, other, op): + def _addsub_object_array(self, other: np.ndarray, op): """ Add or subtract array-like of DateOffset objects Parameters ---------- - other : Index, np.ndarray - object-dtype containing pd.DateOffset objects + other : np.ndarray[object] op : {operator.add, operator.sub} Returns @@ -1143,7 +1200,7 @@ def _addsub_offset_array(self, other, op): warnings.warn( "Adding/subtracting array of DateOffsets to " - "{cls} not vectorized".format(cls=type(self).__name__), + f"{type(self).__name__} not vectorized", PerformanceWarning, ) @@ -1154,7 +1211,12 @@ def _addsub_offset_array(self, other, op): kwargs = {} if not is_period_dtype(self): kwargs["freq"] = "infer" - return self._from_sequence(res_values, **kwargs) + try: + res = type(self)._from_sequence(res_values, **kwargs) + except ValueError: + # e.g. we've passed a Timestamp to TimedeltaArray + res = res_values + return res def _time_shift(self, periods, freq=None): """ @@ -1193,13 +1255,11 @@ def _time_shift(self, periods, freq=None): # to be passed explicitly. 
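`_addsub_offset_array` is renamed to `_addsub_object_array` above and still applies DateOffset elements one at a time under a PerformanceWarning. A small hedged sketch of that path (the dates and offsets are arbitrary; output shown approximately):

    import warnings
    import numpy as np
    import pandas as pd

    dti = pd.date_range("2020-01-31", periods=3, freq="D")
    offsets = np.array([pd.offsets.Day(1), pd.offsets.MonthEnd(), pd.offsets.YearBegin()])

    # Object-dtype operands dispatch to _addsub_object_array, which warns that
    # the operation is applied element-wise rather than vectorized.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", pd.errors.PerformanceWarning)
        shifted = dti + offsets

    print(shifted)
    # DatetimeIndex(['2020-02-01', '2020-02-29', '2021-01-01'], dtype='datetime64[ns]', freq=None)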
return self._generate_range(start=start, end=end, periods=None, freq=self.freq) + @unpack_zerodim_and_defer("__add__") def __add__(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): - return NotImplemented # scalar others - elif other is NaT: + if other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(other) @@ -1212,22 +1272,22 @@ def __add__(self, other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these if not is_period_dtype(self): - maybe_integer_op_deprecated(self) + raise integer_op_not_supported(self) result = self._time_shift(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.add) + elif is_object_dtype(other): + # e.g. Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.add) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] return self._add_datetime_arraylike(other) elif is_integer_dtype(other): if not is_period_dtype(self): - maybe_integer_op_deprecated(self) + raise integer_op_not_supported(self) result = self._addsub_int_array(other, operator.add) else: # Includes Categorical, other ExtensionArrays @@ -1247,13 +1307,11 @@ def __radd__(self, other): # alias for __add__ return self.__add__(other) + @unpack_zerodim_and_defer("__sub__") def __sub__(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): - return NotImplemented # scalar others - elif other is NaT: + if other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) @@ -1266,7 +1324,7 @@ def __sub__(self, other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these if not is_period_dtype(self): - maybe_integer_op_deprecated(self) + raise integer_op_not_supported(self) result = self._time_shift(-other) elif isinstance(other, Period): @@ -1276,9 +1334,9 @@ def __sub__(self, other): elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.sub) + elif is_object_dtype(other): + # e.g. Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datetime_arraylike(other) @@ -1287,7 +1345,7 @@ def __sub__(self, other): result = self._sub_period_array(other) elif is_integer_dtype(other): if not is_period_dtype(self): - maybe_integer_op_deprecated(self) + raise integer_op_not_supported(self) result = self._addsub_int_array(other, operator.sub) else: # Includes ExtensionArrays, float_dtype @@ -1303,6 +1361,9 @@ def __rsub__(self, other): if is_datetime64_any_dtype(other) and is_timedelta64_dtype(self.dtype): # ndarray[datetime64] cannot be subtracted from self, so # we need to wrap in DatetimeArray/Index and flip the operation + if lib.is_scalar(other): + # i.e. 
np.datetime64 object + return Timestamp(other) - self if not isinstance(other, DatetimeLikeArrayMixin): # Avoid down-casting DatetimeIndex from pandas.core.arrays import DatetimeArray @@ -1317,17 +1378,11 @@ def __rsub__(self, other): # GH#19959 datetime - datetime is well-defined as timedelta, # but any other type - datetime is not well-defined. raise TypeError( - "cannot subtract {cls} from {typ}".format( - cls=type(self).__name__, typ=type(other).__name__ - ) + f"cannot subtract {type(self).__name__} from {type(other).__name__}" ) elif is_period_dtype(self.dtype) and is_timedelta64_dtype(other): # TODO: Can we simplify/generalize these cases at all? - raise TypeError( - "cannot subtract {cls} from {dtype}".format( - cls=type(self).__name__, dtype=other.dtype - ) - ) + raise TypeError(f"cannot subtract {type(self).__name__} from {other.dtype}") elif is_timedelta64_dtype(self.dtype): if lib.is_integer(other) or is_integer_dtype(other): # need to subtract before negating, since that flips freq @@ -1338,53 +1393,23 @@ def __rsub__(self, other): return -(self - other) - # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115 - def __iadd__(self, other): - # alias for __add__ - return self.__add__(other) - - def __isub__(self, other): - # alias for __sub__ - return self.__sub__(other) + def __iadd__(self, other): # type: ignore + result = self + other + self[:] = result[:] - # -------------------------------------------------------------- - # Comparison Methods - - def _ensure_localized( - self, arg, ambiguous="raise", nonexistent="raise", from_utc=False - ): - """ - Ensure that we are re-localized. - - This is for compat as we can then call this on all datetimelike - arrays generally (ignored for Period/Timedelta) - - Parameters - ---------- - arg : Union[DatetimeLikeArray, DatetimeIndexOpsMixin, ndarray] - ambiguous : str, bool, or bool-ndarray, default 'raise' - nonexistent : str, default 'raise' - from_utc : bool, default False - If True, localize the i8 ndarray to UTC first before converting to - the appropriate tz. If False, localize directly to the tz. + if not is_period_dtype(self): + # restore freq, which is invalidated by setitem + self._freq = result._freq + return self - Returns - ------- - localized array - """ + def __isub__(self, other): # type: ignore + result = self - other + self[:] = result[:] - # reconvert to local tz - tz = getattr(self, "tz", None) - if tz is not None: - if not isinstance(arg, type(self)): - arg = self._simple_new(arg) - if from_utc: - arg = arg.tz_localize("UTC").tz_convert(self.tz) - else: - arg = arg.tz_localize( - self.tz, ambiguous=ambiguous, nonexistent=nonexistent - ) - return arg + if not is_period_dtype(self): + # restore freq, which is invalidated by setitem + self._freq = result._freq + return self # -------------------------------------------------------------- # Reductions @@ -1476,9 +1501,9 @@ def mean(self, skipna=True): if is_period_dtype(self): # See discussion in GH#24757 raise TypeError( - "mean is not implemented for {cls} since the meaning is " - "ambiguous. An alternative is " - "obj.to_timestamp(how='start').mean()".format(cls=type(self).__name__) + f"mean is not implemented for {type(self).__name__} since the " + "meaning is ambiguous. 
An alternative is " + "obj.to_timestamp(how='start').mean()" ) mask = self.isna() @@ -1490,7 +1515,7 @@ def mean(self, skipna=True): values = self if not len(values): - # short-circut for empty max / min + # short-circuit for empty max / min return NaT result = nanops.nanmean(values.view("i8"), skipna=skipna) @@ -1498,6 +1523,8 @@ def mean(self, skipna=True): return self._box_func(result) +DatetimeLikeArrayMixin._add_comparison_ops() + # ------------------------------------------------------------------- # Shared Constructor Helpers @@ -1524,9 +1551,7 @@ def validate_periods(periods): if lib.is_float(periods): periods = int(periods) elif not lib.is_integer(periods): - raise TypeError( - "periods must be a number, got {periods}".format(periods=periods) - ) + raise TypeError(f"periods must be a number, got {periods}") return periods @@ -1587,9 +1612,9 @@ def validate_inferred_freq(freq, inferred_freq, freq_infer): if inferred_freq is not None: if freq is not None and freq != inferred_freq: raise ValueError( - "Inferred frequency {inferred} from passed " + f"Inferred frequency {inferred_freq} from passed " "values does not conform to passed frequency " - "{passed}".format(inferred=inferred_freq, passed=freq.freqstr) + f"{freq.freqstr}" ) elif freq is None: freq = inferred_freq @@ -1623,38 +1648,3 @@ def maybe_infer_freq(freq): freq_infer = True freq = None return freq, freq_infer - - -def _ensure_datetimelike_to_i8(other, to_utc=False): - """ - Helper for coercing an input scalar or array to i8. - - Parameters - ---------- - other : 1d array - to_utc : bool, default False - If True, convert the values to UTC before extracting the i8 values - If False, extract the i8 values directly. - - Returns - ------- - i8 1d array - """ - from pandas import Index - - if lib.is_scalar(other) and isna(other): - return iNaT - elif isinstance(other, (ABCPeriodArray, ABCIndexClass, DatetimeLikeArrayMixin)): - # convert tz if needed - if getattr(other, "tz", None) is not None: - if to_utc: - other = other.tz_convert("UTC") - else: - other = other.tz_localize(None) - else: - try: - return np.array(other, copy=False).view("i8") - except TypeError: - # period array cannot be coerced to int - other = Index(other) - return other.asi8 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 788cd2a3ce5b7..e42402b307f28 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1,5 +1,4 @@ from datetime import datetime, time, timedelta -import textwrap from typing import Union import warnings @@ -19,19 +18,18 @@ timezones, tzconversion, ) -import pandas.compat as compat from pandas.errors import PerformanceWarning -from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( _INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, + is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_extension_type, + is_extension_array_dtype, is_float_dtype, is_object_dtype, is_period_dtype, @@ -40,40 +38,18 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCPandasArray, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCPandasArray, ABCSeries from pandas.core.dtypes.missing import isna -from pandas.core import ops from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range 
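The `__iadd__`/`__isub__` rewrite above (replacing the old aliases flagged in GH#24115) makes in-place arithmetic actually mutate the underlying buffer and then restore `freq`, which the element-wise setitem would otherwise clear. A hedged sketch, assuming a build with this patch:

    import pandas as pd

    arr = pd.date_range("2020-01-01", periods=3, freq="D").array  # DatetimeArray
    before = arr

    arr += pd.Timedelta(days=1)

    print(arr)            # 2020-01-02 .. 2020-01-04
    print(arr is before)  # True -- the same object was mutated in place
    print(arr.freq)       # <Day> -- restored by __iadd__ after the setitem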
import pandas.core.common as com -from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import get_period_alias, to_offset from pandas.tseries.offsets import Day, Tick _midnight = time(0, 0) -# TODO(GH-24559): Remove warning, int_as_wall_time parameter. -_i8_message = """ - Passing integer-dtype data and a timezone to DatetimeIndex. Integer values - will be interpreted differently in a future version of pandas. Previously, - these were viewed as datetime64[ns] values representing the wall time - *in the specified timezone*. In the future, these will be viewed as - datetime64[ns] values representing the wall time *in UTC*. This is similar - to a nanosecond-precision UNIX epoch. To accept the future behavior, use - - pd.to_datetime(integer_data, utc=True).tz_convert(tz) - - To keep the previous behavior, use - - pd.to_datetime(integer_data).tz_localize(tz) -""" def tz_to_dtype(tz): @@ -94,22 +70,6 @@ def tz_to_dtype(tz): return DatetimeTZDtype(tz=tz) -def _to_M8(key, tz=None): - """ - Timestamp-like => dt64 - """ - if not isinstance(key, Timestamp): - # this also converts strings - key = Timestamp(key) - if key.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - key = key.tz_convert(tz) - else: - key = key.tz_localize(tz) - - return np.int64(conversion.pydt_to_i8(key)).view(_NS_DTYPE) - - def _field_accessor(name, field, docstring=None): def f(self): values = self.asi8 @@ -150,90 +110,6 @@ def f(self): return property(f) -def _dt_array_cmp(cls, op): - """ - Wrap comparison operations to convert datetime-like to datetime64 - """ - opname = "__{name}__".format(name=op.__name__) - nat_result = opname == "__ne__" - - def wrapper(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - - other = lib.item_from_zerodim(other) - - if isinstance(other, (datetime, np.datetime64, str)): - if isinstance(other, (datetime, np.datetime64)): - # GH#18435 strings get a pass from tzawareness compat - self._assert_tzawareness_compat(other) - - try: - other = _to_M8(other, tz=self.tz) - except ValueError: - # string that cannot be parsed to Timestamp - return invalid_comparison(self, other, op) - - result = op(self.asi8, other.view("i8")) - if isna(other): - result.fill(nat_result) - elif lib.is_scalar(other) or np.ndim(other) == 0: - return invalid_comparison(self, other, op) - elif len(other) != len(self): - raise ValueError("Lengths must match") - else: - if isinstance(other, list): - try: - other = type(self)._from_sequence(other) - except ValueError: - other = np.array(other, dtype=np.object_) - elif not isinstance( - other, (np.ndarray, ABCIndexClass, ABCSeries, DatetimeArray) - ): - # Following Timestamp convention, __eq__ is all-False - # and __ne__ is all True, others raise TypeError. - return invalid_comparison(self, other, op) - - if is_object_dtype(other): - # We have to use comp_method_OBJECT_ARRAY instead of numpy - # comparison otherwise it would fail to raise when - # comparing tz-aware and tz-naive - with np.errstate(all="ignore"): - result = ops.comp_method_OBJECT_ARRAY( - op, self.astype(object), other - ) - o_mask = isna(other) - elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): - # e.g. 
is_timedelta64_dtype(other) - return invalid_comparison(self, other, op) - else: - self._assert_tzawareness_compat(other) - if isinstance(other, (ABCIndexClass, ABCSeries)): - other = other.array - - if ( - is_datetime64_dtype(other) - and not is_datetime64_ns_dtype(other) - or not hasattr(other, "asi8") - ): - # e.g. other.dtype == 'datetime64[s]' - # or an object-dtype ndarray - other = type(self)._from_sequence(other) - - result = op(self.view("i8"), other.view("i8")) - o_mask = other._isnan - - if o_mask.any(): - result[o_mask] = nat_result - - if self._hasnans: - result[self._isnan] = nat_result - - return result - - return compat.set_function_name(wrapper, opname, cls) - - class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. @@ -253,12 +129,12 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps The datetime data. For DatetimeArray `values` (or a Series or Index boxing one), - `dtype` and `freq` will be extracted from `values`, with - precedence given to + `dtype` and `freq` will be extracted from `values`. dtype : numpy.dtype or DatetimeTZDtype Note that the only NumPy dtype allowed is 'datetime64[ns]'. freq : str or Offset, optional + The frequency. copy : bool, default False Whether to copy the underlying array of values. @@ -273,6 +149,8 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps _typ = "datetimearray" _scalar_type = Timestamp + _recognized_scalars = (datetime, np.datetime64) + _is_recognized_dtype = is_datetime64_any_dtype # define my properties & methods for delegation _bool_ops = [ @@ -284,7 +162,7 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps "is_year_end", "is_leap_year", ] - _object_ops = ["weekday_name", "freq", "tz"] + _object_ops = ["freq", "tz"] _field_ops = [ "year", "month", @@ -327,7 +205,7 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps # ----------------------------------------------------------------- # Constructors - _dtype = None # type: Union[np.dtype, DatetimeTZDtype] + _dtype: Union[np.dtype, DatetimeTZDtype] _freq = None def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): @@ -345,23 +223,23 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): if not timezones.tz_compare(dtz, values.tz): msg = ( "Timezone of the array and 'dtype' do not match. " - "'{}' != '{}'" + f"'{dtz}' != '{values.tz}'" ) - raise TypeError(msg.format(dtz, values.tz)) + raise TypeError(msg) elif values.tz: dtype = values.dtype - # freq = validate_values_freq(values, freq) + if freq is None: freq = values.freq values = values._data if not isinstance(values, np.ndarray): msg = ( - "Unexpected type '{}'. 'values' must be a DatetimeArray " - "ndarray, or Series or Index containing one of those." + f"Unexpected type '{type(values).__name__}'. 'values' must be " + "a DatetimeArray ndarray, or Series or Index containing one of those." ) - raise ValueError(msg.format(type(values).__name__)) - if values.ndim != 1: + raise ValueError(msg) + if values.ndim not in [1, 2]: raise ValueError("Only 1-dimensional input arrays are supported.") if values.dtype == "i8": @@ -373,9 +251,9 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): if values.dtype != _NS_DTYPE: msg = ( "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'." - " Got {} instead." + f" Got {values.dtype} instead." 
) - raise ValueError(msg.format(values.dtype)) + raise ValueError(msg) dtype = _validate_dt64_dtype(dtype) @@ -428,7 +306,6 @@ def _from_sequence( dayfirst=False, yearfirst=False, ambiguous="raise", - int_as_wall_time=False, ): freq, freq_infer = dtl.maybe_infer_freq(freq) @@ -441,7 +318,6 @@ def _from_sequence( dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous, - int_as_wall_time=int_as_wall_time, ) freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) @@ -580,15 +456,14 @@ def _unbox_scalar(self, value): def _scalar_from_string(self, value): return Timestamp(value, tz=self.tz) - def _check_compatible_with(self, other): + def _check_compatible_with(self, other, setitem: bool = False): if other is NaT: return - if not timezones.tz_compare(self.tz, other.tz): - raise ValueError( - "Timezones don't match. '{own} != {other}'".format( - own=self.tz, other=other.tz - ) - ) + self._assert_tzawareness_compat(other) + if setitem: + # Stricter check for setitem vs comparison methods + if not timezones.tz_compare(self.tz, other.tz): + raise ValueError(f"Timezones don't match. '{self.tz} != {other.tz}'") def _maybe_clear_freq(self): self._freq = None @@ -671,7 +546,7 @@ def _resolution(self): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: if dtype is None and self.tz: # The default for tz-aware is object, to preserve tz info dtype = object @@ -728,23 +603,6 @@ def astype(self, dtype, copy=True): return self.to_period(freq=dtype.freq) return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) - # ---------------------------------------------------------------- - # ExtensionArray Interface - - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) - def _validate_fill_value(self, fill_value): - if isna(fill_value): - fill_value = iNaT - elif isinstance(fill_value, (datetime, np.datetime64)): - self._assert_tzawareness_compat(fill_value) - fill_value = Timestamp(fill_value).value - else: - raise ValueError( - "'fill_value' should be a Timestamp. 
" - "Got '{got}'.".format(got=fill_value) - ) - return fill_value - # ----------------------------------------------------------------- # Rendering Methods @@ -760,8 +618,6 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): # ----------------------------------------------------------------- # Comparison Methods - _create_comparison_method = classmethod(_dt_array_cmp) - def _has_same_tz(self, other): zzone = self._timezone @@ -806,8 +662,8 @@ def _sub_datetime_arraylike(self, other): if not self._has_same_tz(other): # require tz compat raise TypeError( - "{cls} subtraction must have the same " - "timezones or no timezones".format(cls=type(self).__name__) + f"{type(self).__name__} subtraction must have the same " + "timezones or no timezones" ) self_i8 = self.asi8 @@ -819,15 +675,16 @@ def _sub_datetime_arraylike(self, other): return new_values.view("timedelta64[ns]") def _add_offset(self, offset): + if self.ndim == 2: + return self.ravel()._add_offset(offset).reshape(self.shape) + assert not isinstance(offset, Tick) try: if self.tz is not None: values = self.tz_localize(None) else: values = self - result = offset.apply_index(values) - if self.tz is not None: - result = result.tz_localize(self.tz) + result = offset.apply_index(values).tz_localize(self.tz) except NotImplementedError: warnings.warn( @@ -835,6 +692,9 @@ def _add_offset(self, offset): PerformanceWarning, ) result = self.astype("O") + offset + if not len(self): + # GH#30336 _from_sequence won't be able to infer self.tz + return type(self)._from_sequence(result).tz_localize(self.tz) return type(self)._from_sequence(result, freq="infer") @@ -962,7 +822,7 @@ def tz_convert(self, tz): dtype = tz_to_dtype(tz) return self._simple_new(self.asi8, dtype=dtype, freq=self.freq) - def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): + def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): """ Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. @@ -1011,17 +871,6 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): .. versionadded:: 0.24.0 - errors : {'raise', 'coerce'}, default None - The method to handle errors: - - - 'raise' will raise a NonExistentTimeError if a timestamp is not - valid in the specified time zone (e.g. due to a transition from - or to DST time). Use ``nonexistent='raise'`` instead. - - 'coerce' will return NaT if the timestamp can not be converted - to the specified time zone. Use ``nonexistent='NaT'`` instead. - - .. deprecated:: 0.24.0 - Returns ------- Same type as self @@ -1112,31 +961,14 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): 1 2015-03-29 03:30:00+02:00 dtype: datetime64[ns, 'Europe/Warsaw'] """ - if errors is not None: - warnings.warn( - "The errors argument is deprecated and will be " - "removed in a future release. Use " - "nonexistent='NaT' or nonexistent='raise' " - "instead.", - FutureWarning, - ) - if errors == "coerce": - nonexistent = "NaT" - elif errors == "raise": - nonexistent = "raise" - else: - raise ValueError( - "The errors argument must be either 'coerce' or 'raise'." 
- ) - nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( nonexistent, timedelta ): raise ValueError( - "The nonexistent argument must be one of 'raise'," - " 'NaT', 'shift_forward', 'shift_backward' or" - " a timedelta object" + "The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or " + "a timedelta object" ) if self.tz is not None: @@ -1209,7 +1041,7 @@ def normalize(self): """ if self.tz is None or timezones.is_utc(self.tz): not_null = ~self.isna() - DAY_NS = ccalendar.DAY_SECONDS * 1000000000 + DAY_NS = ccalendar.DAY_SECONDS * 1_000_000_000 new_values = self.asi8.copy() adjustment = new_values[not_null] % DAY_NS new_values[not_null] = new_values[not_null] - adjustment @@ -1516,14 +1348,6 @@ def date(self): dayofweek = _field_accessor("dayofweek", "dow", _dayofweek_doc) weekday = dayofweek - weekday_name = _field_accessor( - "weekday_name", - "weekday_name", - """ - The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0 - """, - ) - dayofyear = _field_accessor( "dayofyear", "doy", @@ -1833,7 +1657,7 @@ def to_julian_date(self): + np.floor(year / 4) - np.floor(year / 100) + np.floor(year / 400) - + 1721118.5 + + 1_721_118.5 + ( self.hour + self.minute / 60.0 @@ -1845,9 +1669,6 @@ def to_julian_date(self): ) -DatetimeArray._add_comparison_ops() - - # ------------------------------------------------------------------- # Constructor Helpers @@ -1860,7 +1681,6 @@ def sequence_to_dt64ns( dayfirst=False, yearfirst=False, ambiguous="raise", - int_as_wall_time=False, ): """ Parameters @@ -1873,13 +1693,6 @@ def sequence_to_dt64ns( yearfirst : bool, default False ambiguous : str, bool, or arraylike, default 'raise' See pandas._libs.tslibs.conversion.tz_localize_to_utc. - int_as_wall_time : bool, default False - Whether to treat ints as wall time in specified timezone, or as - nanosecond-precision UNIX epoch (wall time in UTC). - This is used in DatetimeIndex.__init__ to deprecate the wall-time - behaviour. - - ..versionadded:: 0.24.0 Returns ------- @@ -1940,10 +1753,6 @@ def sequence_to_dt64ns( data, dayfirst=dayfirst, yearfirst=yearfirst ) tz = maybe_infer_tz(tz, inferred_tz) - # When a sequence of timestamp objects is passed, we always - # want to treat the (now i8-valued) data as UTC timestamps, - # not wall times. - int_as_wall_time = False # `data` may have originally been a Categorical[datetime64[ns, tz]], # so we need to handle these types. @@ -1977,12 +1786,6 @@ def sequence_to_dt64ns( if data.dtype != _INT64_DTYPE: data = data.astype(np.int64, copy=False) - if int_as_wall_time and tz is not None and not timezones.is_utc(tz): - warnings.warn(_i8_message, FutureWarning, stacklevel=4) - data = conversion.tz_localize_to_utc( - data.view("i8"), tz, ambiguous=ambiguous - ) - data = data.view(_NS_DTYPE) result = data.view(_NS_DTYPE) if copy: @@ -2109,19 +1912,13 @@ def maybe_convert_dtype(data, copy): # with integer dtypes. 
See discussion in GH#23675 elif is_timedelta64_dtype(data): - warnings.warn( - "Passing timedelta64-dtype data is deprecated, will " - "raise a TypeError in a future version", - FutureWarning, - stacklevel=5, - ) - data = data.view(_NS_DTYPE) - + # GH#29794 enforcing deprecation introduced in GH#23539 + raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]") elif is_period_dtype(data): # Note: without explicitly raising here, PeriodIndex # test_setops.test_join_does_not_recur fails raise TypeError( - "Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead" + "Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead" ) elif is_categorical_dtype(data): @@ -2131,7 +1928,7 @@ def maybe_convert_dtype(data, copy): data = data.categories.take(data.codes, fill_value=NaT)._values copy = False - elif is_extension_type(data) and not is_datetime64tz_dtype(data): + elif is_extension_array_dtype(data) and not is_datetime64tz_dtype(data): # Includes categorical # TODO: We have no tests for these data = np.array(data, dtype=np.object_) @@ -2168,8 +1965,8 @@ def maybe_infer_tz(tz, inferred_tz): pass elif not timezones.tz_compare(tz, inferred_tz): raise TypeError( - "data is already tz-aware {inferred_tz}, unable to " - "set specified tz: {tz}".format(inferred_tz=inferred_tz, tz=tz) + f"data is already tz-aware {inferred_tz}, unable to " + f"set specified tz: {tz}" ) return tz @@ -2199,22 +1996,19 @@ def _validate_dt64_dtype(dtype): if dtype is not None: dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, np.dtype("M8")): - # no precision, warn - dtype = _NS_DTYPE - msg = textwrap.dedent( - """\ - Passing in 'datetime64' dtype with no precision is deprecated - and will raise in a future version. Please pass in - 'datetime64[ns]' instead.""" + # no precision, disallowed GH#24806 + msg = ( + "Passing in 'datetime64' dtype with no precision is not allowed. " + "Please pass in 'datetime64[ns]' instead." ) - warnings.warn(msg, FutureWarning, stacklevel=5) + raise ValueError(msg) if (isinstance(dtype, np.dtype) and dtype != _NS_DTYPE) or not isinstance( dtype, (np.dtype, DatetimeTZDtype) ): raise ValueError( - "Unexpected value for 'dtype': '{dtype}'. " - "Must be 'datetime64[ns]' or DatetimeTZDtype'.".format(dtype=dtype) + f"Unexpected value for 'dtype': '{dtype}'. " + "Must be 'datetime64[ns]' or DatetimeTZDtype'." 
) return dtype diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 630c3e50f2c09..cb1e7115cd3c2 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,10 +1,10 @@ import numbers -from typing import Type +from typing import Any, Tuple, Type import warnings import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.compat import set_function_name from pandas.util._decorators import cache_readonly @@ -21,14 +21,15 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops -from pandas.core.algorithms import take -from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.ops import invalid_comparison +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric +from .masked import BaseMaskedArray + class _IntegerDtype(ExtensionDtype): """ @@ -40,14 +41,14 @@ class _IntegerDtype(ExtensionDtype): The attributes name & type are set when these subclasses are created. """ - name = None # type: str + name: str base = None - type = None # type: Type - na_value = np.nan + type: Type + na_value = libmissing.NA - def __repr__(self): + def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" - return "{sign}Int{size}Dtype()".format(sign=sign, size=8 * self.itemsize) + return f"{sign}Int{8 * self.itemsize}Dtype()" @cache_readonly def is_signed_integer(self): @@ -77,7 +78,8 @@ def itemsize(self): @classmethod def construct_array_type(cls): - """Return the array type associated with this dtype + """ + Return the array type associated with this dtype. 
Returns ------- @@ -85,6 +87,25 @@ """ return IntegerArray + def __from_arrow__(self, array): + """Construct IntegerArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) + int_arr = IntegerArray(data.copy(), ~mask, copy=False) + results.append(int_arr) + + return IntegerArray._concat_same_type(results) + def integer_array(values, dtype=None, copy=False): """ @@ -126,9 +147,7 @@ def safe_cast(values, dtype, copy): return casted raise TypeError( - "cannot safely cast non-equivalent {} to {}".format( - values.dtype, np.dtype(dtype) - ) + f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}" ) @@ -165,7 +184,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): try: dtype = _dtypes[str(np.dtype(dtype))] except KeyError: - raise ValueError("invalid dtype specified {}".format(dtype)) + raise ValueError(f"invalid dtype specified {dtype}") if isinstance(values, IntegerArray): values, mask = values._data, values._mask @@ -190,17 +209,13 @@ def coerce_to_array(values, dtype, mask=None, copy=False): "integer-na", "mixed-integer-float", ]: - raise TypeError( - "{} cannot be converted to an IntegerDtype".format(values.dtype) - ) + raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") elif is_bool_dtype(values) and is_integer_dtype(dtype): values = np.array(values, dtype=int, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): - raise TypeError( - "{} cannot be converted to an IntegerDtype".format(values.dtype) - ) + raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") if mask is None: mask = isna(values) @@ -232,12 +247,17 @@ def coerce_to_array(values, dtype, mask=None, copy=False): return values, mask -class IntegerArray(ExtensionArray, ExtensionOpsMixin): +class IntegerArray(BaseMaskedArray): """ Array of integer (optional missing) values. .. versionadded:: 0.24.0 + .. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. + .. warning:: IntegerArray is currently experimental, and its API or internal @@ -281,22 +301,25 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin): >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype()) >>> int_array - [1, NaN, 3] + [1, <NA>, 3] Length: 3, dtype: Int32 String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([1, None, 3], dtype='Int32') - [1, NaN, 3] + [1, <NA>, 3] Length: 3, dtype: Int32 >>> pd.array([1, None, 3], dtype='UInt16') - [1, NaN, 3] + [1, <NA>, 3] Length: 3, dtype: UInt16 """ + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 1 + @cache_readonly def dtype(self): return _dtypes[str(self._data.dtype)] @@ -333,48 +356,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) - def _formatter(self, boxed=False): - def fmt(x): - if isna(x): - return "NaN" - return str(x) - - return fmt - - def __getitem__(self, item): - if is_integer(item): - if self._mask[item]: - return self.dtype.na_value - return self._data[item] - return type(self)(self._data[item], self._mask[item]) - - def _coerce_to_ndarray(self): - """ - coerce to an ndarary of object dtype - """ - - # TODO(jreback) make this better - data = self._data.astype(object) - data[self._mask] = self._na_value - return data - - __array_priority__ = 1000 # higher than ndarray so ops dispatch to us - - def __array__(self, dtype=None): - """ - the array interface, return my values - We return an object array here to preserve our scalar values - """ - return self._coerce_to_ndarray() - - def __arrow_array__(self, type=None): - """ - Convert myself into a pyarrow Array. - """ - import pyarrow as pa - - return pa.array(self._data, mask=self._mask, type=type) - _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @@ -422,40 +403,6 @@ def reconstruct(x): else: return reconstruct(result) - def __iter__(self): - for i in range(len(self)): - if self._mask[i]: - yield self.dtype.na_value - else: - yield self._data[i] - - def take(self, indexer, allow_fill=False, fill_value=None): - # we always fill with 1 internally - # to avoid upcasting - data_fill_value = 1 if isna(fill_value) else fill_value - result = take( - self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill - ) - - mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) - - # if we are filling - # we only fill where the indexer is null - # not existing missing values - # TODO(jreback) what if we have a non-na float as a fill value? - if allow_fill and notna(fill_value): - fill_mask = np.asarray(indexer) == -1 - result[fill_mask] = fill_value - mask = mask ^ fill_mask - - return type(self)(result, mask, copy=False) - - def copy(self): - data, mask = self._data, self._mask - data = data.copy() - mask = mask.copy() - return type(self)(data, mask, copy=False) - def __setitem__(self, key, value): _is_scalar = is_scalar(value) if _is_scalar: @@ -469,26 +416,6 @@ def __setitem__(self, key, value): self._data[key] = value self._mask[key] = mask - def __len__(self): - return len(self._data) - - @property - def nbytes(self): - return self._data.nbytes + self._mask.nbytes - - def isna(self): - return self._mask - - @property - def _na_value(self): - return np.nan - - @classmethod - def _concat_same_type(cls, to_concat): - data = np.concatenate([x._data for x in to_concat]) - mask = np.concatenate([x._mask for x in to_concat]) - return cls(data, mask) - def astype(self, dtype, copy=True): """ Cast to a NumPy array or IntegerArray with 'dtype'.
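As the updated docstring examples above show, `IntegerArray` (now a `BaseMaskedArray` subclass) reports missing entries as `pd.NA`, while the `astype` hunk just below keeps `np.nan` for float targets. A short, hedged sketch of the user-visible behaviour, assuming a build with this patch:

    import pandas as pd

    arr = pd.array([1, None, 3], dtype="Int32")

    print(arr)              # [1, <NA>, 3]  with dtype Int32
    print(arr[1] is pd.NA)  # True

    # Casting to a float dtype still yields np.nan in the missing slots,
    # via the na_value=np.nan special case in the new astype.
    print(arr.astype("float64"))  # array([ 1., nan,  3.])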
@@ -520,8 +447,14 @@ def astype(self, dtype, copy=True): return type(self)(result, mask=self._mask, copy=False) # coerce - data = self._coerce_to_ndarray() - return astype_nansafe(data, dtype, copy=None) + if is_float_dtype(dtype): + # In astype, we consider dtype=float to also mean na_value=np.nan + kwargs = dict(na_value=np.nan) + else: + kwargs = {} + + data = self.to_numpy(dtype=dtype, **kwargs) + return astype_nansafe(data, dtype, copy=False) @property def _ndarray_values(self) -> np.ndarray: @@ -534,52 +467,10 @@ def _ndarray_values(self) -> np.ndarray: """ return self._data - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - - """ - - from pandas import Index, Series - - # compute counts on the data with no nans - data = self._data[~self._mask] - value_counts = Index(data).value_counts() - array = value_counts.values - - # TODO(extension) - # if we have allow Index to hold an ExtensionArray - # this is easier - index = value_counts.index.astype(object) - - # if we want nans, count the mask - if not dropna: - - # TODO(extension) - # appending to an Index *always* infers - # w/o passing the dtype - array = np.append(array, [self._mask.sum()]) - index = Index( - np.concatenate([index.values, np.array([np.nan], dtype=object)]), - dtype=object, - ) - - return Series(array, index=index) + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + # TODO: https://github.com/pandas-dev/pandas/issues/30037 + # use masked algorithms, rather than object-dtype / np.nan. + return self.to_numpy(na_value=np.nan), np.nan def _values_for_argsort(self) -> np.ndarray: """Return values for sorting. @@ -602,16 +493,13 @@ def _values_for_argsort(self) -> np.ndarray: def _create_comparison_method(cls, op): op_name = op.__name__ + @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): + from pandas.arrays import BooleanArray - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. - return NotImplemented - - other = lib.item_from_zerodim(other) mask = None - if isinstance(other, IntegerArray): + if isinstance(other, (BooleanArray, IntegerArray)): other, mask = other._data, other._mask elif is_list_like(other): @@ -623,23 +511,37 @@ def cmp_method(self, other): if len(self) != len(other): raise ValueError("Lengths must match to compare") - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - result = op(self._data, other) + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. 
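The comparison method being rewritten in this hunk returns a masked `BooleanArray` and treats a `pd.NA` operand as unknown everywhere. A hedged sketch (outputs indicative):

    import pandas as pd

    arr = pd.array([1, None, 3], dtype="Int64")

    print(arr > 1)
    # [False, <NA>, True]  -- dtype "boolean", NA propagated through the mask

    # Comparing against pd.NA hits the explicit NA branch: all entries unknown.
    print(arr == pd.NA)
    # [<NA>, <NA>, <NA>]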
+ warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op_name}__") + result = method(other) + + if result is NotImplemented: + result = invalid_comparison(self._data, other, op) # nans propagate if mask is None: - mask = self._mask + mask = self._mask.copy() else: mask = self._mask | mask - result[mask] = op_name == "ne" - return result + return BooleanArray(result, mask) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) def _reduce(self, name, skipna=True, **kwargs): @@ -649,10 +551,11 @@ def _reduce(self, name, skipna=True, **kwargs): # coerce to a nan-aware float if needed if mask.any(): data = self._data.astype("float64") - data[mask] = self._na_value + # We explicitly use NaN within reductions. + data[mask] = np.nan op = getattr(nanops, "nan" + name) - result = op(data, axis=0, skipna=skipna, mask=mask) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) # if we have a boolean op, don't coerce if name in ["any", "all"]: @@ -677,11 +580,6 @@ def _maybe_mask_result(self, result, mask, other, op_name): op_name : str """ - # may need to fill infs - # and mask wraparound - if is_float_dtype(result): - mask |= (result == np.inf) | (result == -np.inf) - # if we have a float operand we are by-definition # a float result # or our op is a divide @@ -697,17 +595,16 @@ def _maybe_mask_result(self, result, mask, other, op_name): def _create_arithmetic_method(cls, op): op_name = op.__name__ + @unpack_zerodim_and_defer(op.__name__) def integer_arithmetic_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. - return NotImplemented + omask = None - other = lib.item_from_zerodim(other) - mask = None + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") if isinstance(other, IntegerArray): - other, mask = other._data, other._mask + other, omask = other._data, other._mask elif is_list_like(other): other = np.asarray(other) @@ -721,24 +618,39 @@ def integer_arithmetic_method(self, other): raise TypeError("can only perform ops with numeric values") else: - if not (is_float(other) or is_integer(other)): + if not (is_float(other) or is_integer(other) or other is libmissing.NA): raise TypeError("can only perform ops with numeric values") - # nans propagate - if mask is None: - mask = self._mask + if omask is None: + mask = self._mask.copy() + if other is libmissing.NA: + mask |= True else: - mask = self._mask | mask + mask = self._mask | omask - # 1 ** np.nan is 1. So we have to unmask those. if op_name == "pow": - mask = np.where(self == 1, False, mask) + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) elif op_name == "rpow": - mask = np.where(other == 1, False, mask) - - with np.errstate(all="ignore"): - result = op(self._data, other) + # 1 ** x is 1. + if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. 
+ mask = np.where((self._data == 0) & ~self._mask, False, mask) + + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) # divmod returns a tuple if op_name == "divmod": @@ -750,7 +662,7 @@ def integer_arithmetic_method(self, other): return self._maybe_mask_result(result, mask, other, op_name) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return set_function_name(integer_arithmetic_method, name, cls) @@ -761,6 +673,11 @@ def integer_arithmetic_method(self, other): _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as its missing value, + rather than :attr:`numpy.nan`. + Attributes ---------- None diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 4039cc91fb554..37d2baed2c09e 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -17,6 +17,8 @@ is_integer_dtype, is_interval, is_interval_dtype, + is_list_like, + is_object_dtype, is_scalar, is_string_dtype, is_timedelta64_dtype, @@ -37,6 +39,7 @@ from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com +from pandas.core.construction import array from pandas.core.indexes.base import ensure_index _VALID_CLOSED = {"left", "right", "both", "neither"} @@ -105,7 +108,7 @@ Notes ----- See the `user guide -`_ +`_ for more. %(examples)s\ @@ -159,10 +162,10 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, verify_integrity=Tru # don't allow scalars if is_scalar(data): msg = ( - "{}(...) must be called with a collection of some kind," - " {} was passed" + f"{cls.__name__}(...) must be called with a collection " + f"of some kind, {data} was passed" ) - raise TypeError(msg.format(cls.__name__, data)) + raise TypeError(msg) # might need to convert empty or purely na data data = maybe_convert_platform_interval(data) @@ -194,8 +197,8 @@ def _simple_new( # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) if not is_interval_dtype(dtype): - msg = "dtype must be an IntervalDtype, got {dtype}" - raise TypeError(msg.format(dtype=dtype)) + msg = f"dtype must be an IntervalDtype, got {dtype}" + raise TypeError(msg) elif dtype.subtype is not None: left = left.astype(dtype.subtype) right = right.astype(dtype.subtype) @@ -207,10 +210,11 @@ def _simple_new( left = left.astype(right.dtype) if type(left) != type(right): - msg = "must not have differing left [{ltype}] and right [{rtype}] types" - raise ValueError( - msg.format(ltype=type(left).__name__, rtype=type(right).__name__) + msg = ( + f"must not have differing left [{type(left).__name__}] and " + f"right [{type(right).__name__}] types" ) + raise ValueError(msg) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 msg = ( @@ -224,9 +228,9 @@ def _simple_new( elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): msg = ( "left and right must have the same time zone, got " - "'{left_tz}' and '{right_tz}'" + f"'{left.tz}' and '{right.tz}'" ) - raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) + raise ValueError(msg) result._left = left result._right = right @@ -260,9 +264,9 @@ def _from_factorized(cls, values, original): Whether the intervals are closed on the left-side, right-side, both or neither. 
copy : bool, default False - copy the data + Copy the data. dtype : dtype or None, default None - If None, dtype will be inferred + If None, dtype will be inferred. .. versionadded:: 0.23.0 @@ -383,16 +387,16 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): Parameters ---------- data : array-like (1-dimensional) - Array of tuples + Array of tuples. closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or neither. copy : bool, default False - by-default copy the data, this is compat only and ignored + By-default copy the data, this is compat only and ignored. dtype : dtype or None, default None - If None, dtype will be inferred + If None, dtype will be inferred. - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Returns ------- @@ -443,14 +447,10 @@ def from_tuples(cls, data, closed="right", copy=False, dtype=None): # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] lhs, rhs = d except ValueError: - msg = ( - "{name}.from_tuples requires tuples of length 2, got {tpl}" - ).format(name=name, tpl=d) + msg = f"{name}.from_tuples requires tuples of length 2, got {d}" raise ValueError(msg) except TypeError: - msg = ("{name}.from_tuples received an invalid item, {tpl}").format( - name=name, tpl=d - ) + msg = f"{name}.from_tuples received an invalid item, {d}" raise TypeError(msg) left.append(lhs) right.append(rhs) @@ -468,20 +468,22 @@ def _validate(self): * left is always below right """ if self.closed not in _VALID_CLOSED: - raise ValueError( - "invalid option for 'closed': {closed}".format(closed=self.closed) - ) + msg = f"invalid option for 'closed': {self.closed}" + raise ValueError(msg) if len(self.left) != len(self.right): - raise ValueError("left and right must have the same length") + msg = "left and right must have the same length" + raise ValueError(msg) left_mask = notna(self.left) right_mask = notna(self.right) if not (left_mask == right_mask).all(): - raise ValueError( + msg = ( "missing values must be missing in the same " "location both left and right sides" ) + raise ValueError(msg) if not (self.left[left_mask] <= self.right[left_mask]).all(): - raise ValueError("left side of interval must be <= right side") + msg = "left side of interval must be <= right side" + raise ValueError(msg) # --------- # Interface @@ -489,7 +491,7 @@ def _validate(self): def __iter__(self): return iter(np.asarray(self)) - def __len__(self): + def __len__(self) -> int: return len(self.left) def __getitem__(self, value): @@ -498,8 +500,11 @@ def __getitem__(self, value): # scalar if not isinstance(left, ABCIndexClass): - if isna(left): + if is_scalar(left) and isna(left): return self._fill_value + if np.ndim(left) > 1: + # GH#30588 multi-dimensional indexer disallowed + raise ValueError("multi-dimensional indexing not allowed") return Interval(left, right, self.closed) return self._shallow_copy(left, right) @@ -531,8 +536,8 @@ def __setitem__(self, key, value): value_left, value_right = array.left, array.right except TypeError: # wrong type: not interval or NA - msg = "'value' should be an interval type, got {} instead." - raise TypeError(msg.format(type(value))) + msg = f"'value' should be an interval type, got {type(value)} instead." + raise TypeError(msg) # Need to ensure that left and right are updated atomically, so we're # forced to copy, update the copy, and swap in the new values. 
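(Editor's aside, not part of the patch: a minimal sketch of the copy-and-swap update described in the comment above, assuming illustrative interval values.)

# Editor's sketch only; values are assumptions, not taken from the diff.
import pandas as pd

arr = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (2, 3)])
arr[0] = pd.Interval(5, 6)   # left and right endpoints are swapped in together
arr[0]                       # Interval(5, 6, closed='right')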
@@ -548,6 +553,58 @@ def __setitem__(self, key, value): right.values[key] = value_right self._right = right + def __eq__(self, other): + # ensure pandas array for list-like and eliminate non-interval scalars + if is_list_like(other): + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + other = array(other) + elif not isinstance(other, Interval): + # non-interval scalar -> no matches + return np.zeros(len(self), dtype=bool) + + # determine the dtype of the elements we want to compare + if isinstance(other, Interval): + other_dtype = "interval" + elif not is_categorical_dtype(other): + other_dtype = other.dtype + else: + # for categorical defer to categories for dtype + other_dtype = other.categories.dtype + + # extract intervals if we have interval categories with matching closed + if is_interval_dtype(other_dtype): + if self.closed != other.categories.closed: + return np.zeros(len(self), dtype=bool) + other = other.categories.take(other.codes) + + # interval-like -> need same closed and matching endpoints + if is_interval_dtype(other_dtype): + if self.closed != other.closed: + return np.zeros(len(self), dtype=bool) + return (self.left == other.left) & (self.right == other.right) + + # non-interval/non-object dtype -> no matches + if not is_object_dtype(other_dtype): + return np.zeros(len(self), dtype=bool) + + # object dtype -> iteratively check for intervals + result = np.zeros(len(self), dtype=bool) + for i, obj in enumerate(other): + # need object to be an Interval with same closed and endpoints + if ( + isinstance(obj, Interval) + and self.closed == obj.closed + and self.left[i] == obj.left + and self.right[i] == obj.right + ): + result[i] = True + + return result + + def __ne__(self, other): + return ~self.__eq__(other) + def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. @@ -583,9 +640,7 @@ def fillna(self, value=None, method=None, limit=None): if not isinstance(value, ABCInterval): msg = ( "'IntervalArray.fillna' only supports filling with a " - "scalar 'pandas.Interval'. Got a '{}' instead.".format( - type(value).__name__ - ) + f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead." ) raise TypeError(msg) @@ -630,10 +685,9 @@ def astype(self, dtype, copy=True): new_right = self.right.astype(dtype.subtype) except TypeError: msg = ( - "Cannot convert {dtype} to {new_dtype}; subtypes are " - "incompatible" + f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" ) - raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) + raise TypeError(msg) return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self)) @@ -641,8 +695,8 @@ def astype(self, dtype, copy=True): try: return np.asarray(self).astype(dtype, copy=copy) except (TypeError, ValueError): - msg = "Cannot cast {name} to dtype {dtype}" - raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + msg = f"Cannot cast {type(self).__name__} to dtype {dtype}" + raise TypeError(msg) @classmethod def _concat_same_type(cls, to_concat): @@ -790,9 +844,8 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): elif not is_scalar(fill_value) and notna(fill_value): msg = ( "'IntervalArray.fillna' only supports filling with a " - "'scalar pandas.Interval or NA'. Got a '{}' instead.".format( - type(fill_value).__name__ - ) + "'scalar pandas.Interval or NA'. " + f"Got a '{type(fill_value).__name__}' instead." 
) raise ValueError(msg) @@ -840,48 +893,44 @@ def _format_data(self): summary = "[]" elif n == 1: first = formatter(self[0]) - summary = "[{first}]".format(first=first) + summary = f"[{first}]" elif n == 2: first = formatter(self[0]) last = formatter(self[-1]) - summary = "[{first}, {last}]".format(first=first, last=last) + summary = f"[{first}, {last}]" else: if n > max_seq_items: n = min(max_seq_items // 2, 10) head = [formatter(x) for x in self[:n]] tail = [formatter(x) for x in self[-n:]] - summary = "[{head} ... {tail}]".format( - head=", ".join(head), tail=", ".join(tail) - ) + head_str = ", ".join(head) + tail_str = ", ".join(tail) + summary = f"[{head_str} ... {tail_str}]" else: tail = [formatter(x) for x in self] - summary = "[{tail}]".format(tail=", ".join(tail)) + tail_str = ", ".join(tail) + summary = f"[{tail_str}]" return summary - def __repr__(self): - template = ( - "{class_name}" - "{data}\n" - "Length: {length}, closed: {closed}, dtype: {dtype}" - ) + def __repr__(self) -> str: # the short repr has no trailing newline, while the truncated # repr does. So we include a newline in our template, and strip # any trailing newlines from format_object_summary data = self._format_data() - class_name = "<{}>\n".format(self.__class__.__name__) - return template.format( - class_name=class_name, - data=data, - length=len(self), - closed=self.closed, - dtype=self.dtype, + class_name = f"<{type(self).__name__}>\n" + + template = ( + f"{class_name}" + f"{data}\n" + f"Length: {len(self)}, closed: {self.closed}, dtype: {self.dtype}" ) + return template def _format_space(self): - space = " " * (len(self.__class__.__name__) + 1) - return "\n{space}".format(space=space) + space = " " * (len(type(self).__name__) + 1) + return f"\n{space}" @property def left(self): @@ -951,8 +1000,8 @@ def closed(self): ) def set_closed(self, closed): if closed not in _VALID_CLOSED: - msg = "invalid option for 'closed': {closed}" - raise ValueError(msg.format(closed=closed)) + msg = f"invalid option for 'closed': {closed}" + raise ValueError(msg) return self._shallow_copy(closed=closed) @@ -1017,7 +1066,7 @@ def is_non_overlapping_monotonic(self): ) # Conversion - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') @@ -1035,6 +1084,59 @@ def __array__(self, dtype=None): result[i] = Interval(left[i], right[i], closed) return result + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
+ """ + import pyarrow + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + try: + subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) + except TypeError: + raise TypeError( + "Conversion to arrow with subtype '{}' " + "is not supported".format(self.dtype.subtype) + ) + interval_type = ArrowIntervalType(subtype, self.closed) + storage_array = pyarrow.StructArray.from_arrays( + [ + pyarrow.array(self.left, type=subtype, from_pandas=True), + pyarrow.array(self.right, type=subtype, from_pandas=True), + ], + names=["left", "right"], + ) + mask = self.isna() + if mask.any(): + # if there are missing values, set validity bitmap also on the array level + null_bitmap = pyarrow.array(~mask).buffers()[1] + storage_array = pyarrow.StructArray.from_buffers( + storage_array.type, + len(storage_array), + [null_bitmap], + children=[storage_array.field(0), storage_array.field(1)], + ) + + if type is not None: + if type.equals(interval_type.storage_type): + return storage_array + elif isinstance(type, ArrowIntervalType): + # ensure we have the same subtype and closed attributes + if not type.equals(interval_type): + raise TypeError( + "Not supported to convert IntervalArray to type with " + "different 'subtype' ({0} vs {1}) and 'closed' ({2} vs {3}) " + "attributes".format( + self.dtype.subtype, type.subtype, self.closed, type.closed + ) + ) + else: + raise TypeError( + "Not supported to convert IntervalArray to '{0}' type".format(type) + ) + + return pyarrow.ExtensionArray.from_storage(interval_type, storage_array) + _interval_shared_docs[ "to_tuples" ] = """ @@ -1188,8 +1290,8 @@ def overlaps(self, other): if isinstance(other, (IntervalArray, ABCIntervalIndex)): raise NotImplementedError elif not isinstance(other, Interval): - msg = "`other` must be Interval-like, got {other}" - raise TypeError(msg.format(other=type(other).__name__)) + msg = f"`other` must be Interval-like, got {type(other).__name__}" + raise TypeError(msg) # equality is okay if both endpoints are closed (overlap at a point) op1 = le if (self.closed_left and other.closed_right) else lt diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py new file mode 100644 index 0000000000000..47605413ff1a6 --- /dev/null +++ b/pandas/core/arrays/masked.py @@ -0,0 +1,250 @@ +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs import lib, missing as libmissing + +from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype +from pandas.core.dtypes.missing import isna, notna + +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): + """ + Base class for masked arrays (which use _data and _mask to store the data). 
+ + numpy based + """ + + _data: np.ndarray + _mask: np.ndarray + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value: "Scalar" + + def __getitem__(self, item): + if is_integer(item): + if self._mask[item]: + return self.dtype.na_value + return self._data[item] + + elif com.is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + + return type(self)(self._data[item], self._mask[item]) + + def __iter__(self): + for i in range(len(self)): + if self._mask[i]: + yield self.dtype.na_value + else: + yield self._data[i] + + def __len__(self) -> int: + return len(self._data) + + def to_numpy( + self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default, + ): + """ + Convert to a NumPy Array. + + By default converts to an object-dtype NumPy array. Specify the `dtype` and + `na_value` keywords to customize the conversion. + + Parameters + ---------- + dtype : dtype, default object + The numpy dtype to convert to. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + the array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. This is typically + only possible when no missing values are present and `dtype` + is the equivalent numpy dtype. + na_value : scalar, optional + Scalar missing value indicator to use in numpy array. Defaults + to the native missing value indicator of this array (pd.NA). + + Returns + ------- + numpy.ndarray + + Examples + -------- + An object-dtype is the default result + + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a.to_numpy() + array([True, False, NA], dtype=object) + + When no missing values are present, an equivalent dtype can be used. + + >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool") + array([ True, False]) + >>> pd.array([1, 2], dtype="Int64").to_numpy("int64") + array([1, 2]) + + However, requesting such dtype will raise a ValueError if + missing values are present and the default missing value :attr:`NA` + is used. + + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a + + [True, False, NA] + Length: 3, dtype: boolean + + >>> a.to_numpy(dtype="bool") + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values + + Specify a valid `na_value` instead + + >>> a.to_numpy(dtype="bool", na_value=False) + array([ True, False, False]) + """ + if na_value is lib.no_default: + na_value = libmissing.NA + if dtype is None: + dtype = object + if self._hasna: + if ( + not (is_object_dtype(dtype) or is_string_dtype(dtype)) + and na_value is libmissing.NA + ): + raise ValueError( + f"cannot convert to '{dtype}'-dtype NumPy array " + "with missing values. Specify an appropriate 'na_value' " + "for this dtype." + ) + # don't pass copy to astype -> always need a copy since we are mutating + data = self._data.astype(dtype) + data[self._mask] = na_value + else: + data = self._data.astype(dtype, copy=copy) + return data + + __array_priority__ = 1000 # higher than ndarray so ops dispatch to us + + def __array__(self, dtype=None) -> np.ndarray: + """ + the array interface, return my values + We return an object array here to preserve our scalar values + """ + return self.to_numpy(dtype=dtype) + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
+ """ + import pyarrow as pa + + return pa.array(self._data, mask=self._mask, type=type) + + @property + def _hasna(self) -> bool: + # Note: this is expensive right now! The hope is that we can + # make this faster by having an optional mask, but not have to change + # source code using it.. + return self._mask.any() + + def isna(self): + return self._mask + + @property + def _na_value(self): + return self.dtype.na_value + + @property + def nbytes(self): + return self._data.nbytes + self._mask.nbytes + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x._data for x in to_concat]) + mask = np.concatenate([x._mask for x in to_concat]) + return cls(data, mask) + + def take(self, indexer, allow_fill=False, fill_value=None): + # we always fill with 1 internally + # to avoid upcasting + data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value + result = take( + self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill + ) + + mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) + + # if we are filling + # we only fill where the indexer is null + # not existing missing values + # TODO(jreback) what if we have a non-na float as a fill value? + if allow_fill and notna(fill_value): + fill_mask = np.asarray(indexer) == -1 + result[fill_mask] = fill_value + mask = mask ^ fill_mask + + return type(self)(result, mask, copy=False) + + def copy(self): + data, mask = self._data, self._mask + data = data.copy() + mask = mask.copy() + return type(self)(data, mask, copy=False) + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import Index, Series + from pandas.arrays import IntegerArray + + # compute counts on the data with no nans + data = self._data[~self._mask] + value_counts = Index(data).value_counts() + + # TODO(extension) + # if we have allow Index to hold an ExtensionArray + # this is easier + index = value_counts.index.values.astype(object) + + # if we want nans, count the mask + if dropna: + counts = value_counts.values + else: + counts = np.empty(len(value_counts) + 1, dtype="int64") + counts[:-1] = value_counts + counts[-1] = self._mask.sum() + + index = Index( + np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), + dtype=object, + ) + + mask = np.zeros(len(counts), dtype="bool") + counts = IntegerArray(counts, mask) + + return Series(counts, index=index) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index bf7404e8997c6..4db3d3010adaf 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -1,4 +1,5 @@ import numbers +from typing import Union import numpy as np from numpy.lib.mixins import NDArrayOperatorsMixin @@ -16,11 +17,12 @@ from pandas import compat from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +import pandas.core.common as com from pandas.core.construction import extract_array +from pandas.core.indexers import check_bool_array_indexer from pandas.core.missing import backfill_1d, pad_1d -from .base import ExtensionArray, ExtensionOpsMixin - class PandasDtype(ExtensionDtype): """ @@ -44,8 +46,8 @@ def __init__(self, dtype): self._name = dtype.name self._type = 
dtype.type - def __repr__(self): - return "PandasDtype({!r})".format(self.name) + def __repr__(self) -> str: + return f"PandasDtype({repr(self.name)})" @property def numpy_dtype(self): @@ -71,9 +73,22 @@ def _is_boolean(self): @classmethod def construct_from_string(cls, string): - return cls(np.dtype(string)) + try: + return cls(np.dtype(string)) + except TypeError as err: + raise TypeError( + f"Cannot construct a 'PandasDtype' from '{string}'" + ) from err + @classmethod def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ return PandasArray @property @@ -117,18 +132,17 @@ class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin): # pandas internals, which turns off things like block consolidation. _typ = "npy_extension" __array_priority__ = 1000 + _ndarray: np.ndarray # ------------------------------------------------------------------------ # Constructors - def __init__(self, values, copy=False): + def __init__(self, values: Union[np.ndarray, "PandasArray"], copy: bool = False): if isinstance(values, type(self)): values = values._ndarray if not isinstance(values, np.ndarray): raise ValueError( - "'values' must be a NumPy array, not {typ}".format( - typ=type(values).__name__ - ) + f"'values' must be a NumPy array, not {type(values).__name__}" ) if values.ndim != 1: @@ -168,7 +182,7 @@ def dtype(self): # ------------------------------------------------------------------------ # NumPy Array Interface - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._ndarray, dtype=dtype) _HANDLED_TYPES = (np.ndarray, numbers.Number) @@ -221,6 +235,9 @@ def __getitem__(self, item): if isinstance(item, type(self)): item = item._ndarray + elif com.is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + result = self._ndarray[item] if not lib.is_scalar(item): result = type(self)(result) @@ -259,8 +276,8 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): raise ValueError( - "Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self)) + f"Length of 'value' does not match. Got ({len(value)}) " + f" expected {len(self)}" ) value = value[mask] @@ -278,6 +295,9 @@ def fillna(self, value=None, method=None, limit=None): return new_values def take(self, indices, allow_fill=False, fill_value=None): + if fill_value is None: + # Primarily for subclasses + fill_value = self.dtype.na_value result = take( self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value ) @@ -303,8 +323,8 @@ def _reduce(self, name, skipna=True, **kwargs): if meth: return meth(skipna=skipna, **kwargs) else: - msg = "'{}' does not implement reduction '{}'" - raise TypeError(msg.format(type(self).__name__, name)) + msg = f"'{type(self).__name__}' does not implement reduction '{name}'" + raise TypeError(msg) def any(self, axis=None, out=None, keepdims=False, skipna=True): nv.validate_any((), dict(out=out, keepdims=keepdims)) @@ -400,27 +420,15 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): # ------------------------------------------------------------------------ # Additional Methods - def to_numpy(self, dtype=None, copy=False): - """ - Convert the PandasArray to a :class:`numpy.ndarray`. - - By default, this requires no coercion or copying of data. - - Parameters - ---------- - dtype : numpy.dtype - The NumPy dtype to pass to :func:`numpy.asarray`. 
- copy : bool, default False - Whether to copy the underlying data. - - Returns - ------- - ndarray - """ + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): result = np.asarray(self._ndarray, dtype=dtype) - if copy and result is self._ndarray: + + if (copy or na_value is not lib.no_default) and result is self._ndarray: result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result @Appender(ExtensionArray.searchsorted.__doc__) @@ -451,9 +459,7 @@ def arithmetic_method(self, other): return cls(result) - return compat.set_function_name( - arithmetic_method, "__{}__".format(op.__name__), cls - ) + return compat.set_function_name(arithmetic_method, f"__{op.__name__}__", cls) _create_comparison_method = _create_arithmetic_method diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 78cc54db4b1b8..8b49c2186dde0 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -4,7 +4,6 @@ import numpy as np -from pandas._libs import lib from pandas._libs.tslibs import ( NaT, NaTType, @@ -21,21 +20,18 @@ period_asfreq_arr, ) from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds -import pandas.compat as compat -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( _TD_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype, - is_list_like, is_period_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, @@ -62,52 +58,6 @@ def f(self): return property(f) -def _period_array_cmp(cls, op): - """ - Wrap comparison operations to convert Period-like to PeriodDtype - """ - opname = "__{name}__".format(name=op.__name__) - nat_result = opname == "__ne__" - - def wrapper(self, other): - ordinal_op = getattr(self.asi8, opname) - - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - - if is_list_like(other) and len(other) != len(self): - raise ValueError("Lengths must match") - - if isinstance(other, Period): - self._check_compatible_with(other) - - result = ordinal_op(other.ordinal) - elif isinstance(other, cls): - self._check_compatible_with(other) - - result = ordinal_op(other.asi8) - - mask = self._isnan | other._isnan - if mask.any(): - result[mask] = nat_result - - return result - elif other is NaT: - result = np.empty(len(self.asi8), dtype=bool) - result.fill(nat_result) - else: - other = Period(other, freq=self.freq) - result = ordinal_op(other.ordinal) - - if self._hasnans: - result[self._isnan] = nat_result - - return result - - return compat.set_function_name(wrapper, opname, cls) - - class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): """ Pandas ExtensionArray for storing Period data. 
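(Editor's aside, not part of the patch: a hedged sketch of the na_value keyword added to PandasArray.to_numpy above; the array contents are assumptions chosen for illustration.)

# Editor's illustration only; sample data is assumed.
import numpy as np
import pandas as pd

arr = pd.array(np.array([1.0, np.nan, 3.0]))  # backed by a PandasArray here
arr.to_numpy()                                # array([ 1., nan,  3.])
arr.to_numpy(na_value=0.0)                    # missing entries replaced: array([1., 0., 3.])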
@@ -163,9 +113,11 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): __array_priority__ = 1000 _typ = "periodarray" # ABCPeriodArray _scalar_type = Period + _recognized_scalars = (Period,) + _is_recognized_dtype = is_period_dtype # Names others delegate to us - _other_ops = [] # type: List[str] + _other_ops: List[str] = [] _bool_ops = ["is_leap_year"] _object_ops = ["start_time", "end_time", "freq"] _field_ops = [ @@ -207,12 +159,7 @@ def __init__(self, values, freq=None, dtype=None, copy=False): if isinstance(values, type(self)): if freq is not None and freq != values.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=values.freq.freqstr, - other_freq=freq.freqstr, - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(values, freq) values, freq = values._data, values.freq values = np.array(values, dtype="int64", copy=copy) @@ -306,18 +253,16 @@ def _unbox_scalar(self, value: Union[Period, NaTType]) -> int: self._check_compatible_with(value) return value.ordinal else: - raise ValueError( - "'value' should be a Period. Got '{val}' instead.".format(val=value) - ) + raise ValueError(f"'value' should be a Period. Got '{value}' instead.") def _scalar_from_string(self, value: str) -> Period: return Period(value, freq=self.freq) - def _check_compatible_with(self, other): + def _check_compatible_with(self, other, setitem: bool = False): if other is NaT: return if self.freqstr != other.freqstr: - _raise_on_incompatible(self, other) + raise raise_on_incompatible(self, other) # -------------------------------------------------------------------- # Data / Attributes @@ -326,7 +271,7 @@ def _check_compatible_with(self, other): def dtype(self): return self._dtype - # read-only property overwriting read/write + # error: Read-only property cannot override read-write property [misc] @property # type: ignore def freq(self): """ @@ -334,10 +279,36 @@ def freq(self): """ return self.dtype.freq - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: # overriding DatetimelikeArray return np.array(list(self), dtype=object) + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + if type is not None: + if pyarrow.types.is_integer(type): + return pyarrow.array(self._data, mask=self.isna(), type=type) + elif isinstance(type, ArrowPeriodType): + # ensure we have the same freq + if self.freqstr != type.freq: + raise TypeError( + "Not supported to convert PeriodArray to array with different" + " 'freq' ({0} vs {1})".format(self.freqstr, type.freq) + ) + else: + raise TypeError( + "Not supported to convert PeriodArray to '{0}' type".format(type) + ) + + period_type = ArrowPeriodType(self.freqstr) + storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64") + return pyarrow.ExtensionArray.from_storage(period_type, storage_array) + # -------------------------------------------------------------------- # Vectorized analogues of Period properties @@ -446,8 +417,9 @@ def to_timestamp(self, freq=None, how="start"): ---------- freq : str or DateOffset, optional Target frequency. The default is 'D' for week or longer, - 'S' otherwise + 'S' otherwise. how : {'s', 'e', 'start', 'end'} + Whether to use the start or end of the time period being converted. 
Returns ------- @@ -482,24 +454,8 @@ def to_timestamp(self, freq=None, how="start"): # -------------------------------------------------------------------- # Array-like / EA-Interface Methods - def _formatter(self, boxed=False): - if boxed: - return str - return "'{}'".format - - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) - def _validate_fill_value(self, fill_value): - if isna(fill_value): - fill_value = iNaT - elif isinstance(fill_value, Period): - self._check_compatible_with(fill_value) - fill_value = fill_value.ordinal - else: - raise ValueError( - "'fill_value' should be a Period. " - "Got '{got}'.".format(got=fill_value) - ) - return fill_value + def _values_for_argsort(self): + return self._data # -------------------------------------------------------------------- @@ -521,7 +477,7 @@ def _time_shift(self, periods, freq=None): if freq is not None: raise TypeError( "`freq` argument is not supported for " - "{cls}._time_shift".format(cls=type(self).__name__) + f"{type(self).__name__}._time_shift" ) values = self.asi8 + periods * self.freq.n if self._hasnans: @@ -539,17 +495,20 @@ def asfreq(self, freq=None, how="E"): Parameters ---------- freq : str - a frequency + A frequency. how : str {'E', 'S'} - 'E', 'END', or 'FINISH' for end, - 'S', 'START', or 'BEGIN' for start. Whether the elements should be aligned to the end - or start within pa period. January 31st ('END') vs. - January 1st ('START') for example. + or start within pa period. + + * 'E', 'END', or 'FINISH' for end, + * 'S', 'START', or 'BEGIN' for start. + + January 31st ('END') vs. January 1st ('START') for example. Returns ------- - new : Period Array/Index with the new frequency + Period Array/Index + Constructed with the new frequency. Examples -------- @@ -591,6 +550,11 @@ def asfreq(self, freq=None, how="E"): # ------------------------------------------------------------------ # Rendering Methods + def _formatter(self, boxed=False): + if boxed: + return str + return "'{}'".format + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): """ actually format my specific types @@ -600,7 +564,7 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): if date_format: formatter = lambda dt: dt.strftime(date_format) else: - formatter = lambda dt: "%s" % dt + formatter = lambda dt: str(dt) if self._hasnans: mask = self._isnan @@ -622,17 +586,8 @@ def astype(self, dtype, copy=True): return self.asfreq(dtype.freq) return super().astype(dtype, copy=copy) - @property - def flags(self): - # TODO: remove - # We need this since reduction.SeriesBinGrouper uses values.flags - # Ideally, we wouldn't be passing objects down there in the first - # place. - return self._data.flags - # ------------------------------------------------------------------ # Arithmetic Methods - _create_comparison_method = classmethod(_period_array_cmp) def _sub_datelike(self, other): assert other is not NaT @@ -651,12 +606,23 @@ def _sub_period(self, other): return new_data - @Appender(dtl.DatetimeLikeArrayMixin._addsub_int_array.__doc__) def _addsub_int_array( - self, - other: Union[ABCPeriodArray, ABCSeries, ABCPeriodIndex, np.ndarray], - op: Callable[[Any], Any], - ) -> ABCPeriodArray: + self, other: np.ndarray, op: Callable[[Any, Any], Any], + ) -> "PeriodArray": + """ + Add or subtract array of integers; equivalent to applying + `_time_shift` pointwise. 
+ + Parameters + ---------- + other : np.ndarray[integer-dtype] + op : {operator.add, operator.sub} + + Returns + ------- + result : PeriodArray + """ + assert op in [operator.add, operator.sub] if op is operator.sub: other = -other @@ -669,7 +635,7 @@ def _add_offset(self, other): assert not isinstance(other, Tick) base = libfrequencies.get_base_alias(other.rule_code) if base != self.freq.rule_code: - _raise_on_incompatible(self, other) + raise raise_on_incompatible(self, other) # Note: when calling parent class's _add_timedeltalike_scalar, # it will call delta_to_nanoseconds(delta). Because delta here @@ -737,7 +703,7 @@ def _add_delta(self, other): """ if not isinstance(self.freq, Tick): # We cannot add timedelta-like to non-tick PeriodArray - _raise_on_incompatible(self, other) + raise raise_on_incompatible(self, other) new_ordinals = super()._add_delta(other) return type(self)(new_ordinals, freq=self.freq) @@ -789,16 +755,10 @@ def _check_timedeltalike_freq_compat(self, other): # by which will be added to self. return delta - _raise_on_incompatible(self, other) - - def _values_for_argsort(self): - return self._data - - -PeriodArray._add_comparison_ops() + raise raise_on_incompatible(self, other) -def _raise_on_incompatible(left, right): +def raise_on_incompatible(left, right): """ Helper function to render a consistent error message when raising IncompatibleFrequency. @@ -806,14 +766,15 @@ def _raise_on_incompatible(left, right): Parameters ---------- left : PeriodArray - right : DateOffset, Period, ndarray, or timedelta-like + right : None, DateOffset, Period, ndarray, or timedelta-like - Raises - ------ + Returns + ------- IncompatibleFrequency + Exception to be raised by the caller. """ # GH#24283 error message format depends on whether right is scalar - if isinstance(right, np.ndarray): + if isinstance(right, np.ndarray) or right is None: other_freq = None elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, DateOffset)): other_freq = right.freqstr @@ -823,7 +784,7 @@ def _raise_on_incompatible(left, right): msg = DIFFERENT_FREQ.format( cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq ) - raise IncompatibleFrequency(msg) + return IncompatibleFrequency(msg) # ------------------------------------------------------------------- @@ -898,9 +859,9 @@ def period_array( data = np.asarray(data) + dtype: Optional[PeriodDtype] if freq: - # typed Optional here because the else block below assigns None - dtype = PeriodDtype(freq) # type: Optional[PeriodDtype] + dtype = PeriodDtype(freq) else: dtype = None @@ -966,7 +927,7 @@ def dt64arr_to_periodarr(data, freq, tz=None): """ if data.dtype != np.dtype("M8[ns]"): - raise ValueError("Wrong dtype: {dtype}".format(dtype=data.dtype)) + raise ValueError(f"Wrong dtype: {data.dtype}") if freq is None: if isinstance(data, ABCIndexClass): diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index 75f3819fb19fd..e928db499a771 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -1,5 +1,10 @@ # flake8: noqa: F401 -from .accessor import SparseAccessor, SparseFrameAccessor -from .array import BlockIndex, IntIndex, SparseArray, _make_index -from .dtype import SparseDtype +from pandas.core.arrays.sparse.accessor import SparseAccessor, SparseFrameAccessor +from pandas.core.arrays.sparse.array import ( + BlockIndex, + IntIndex, + SparseArray, + _make_index, +) +from pandas.core.arrays.sparse.dtype import SparseDtype diff --git 
a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 595af6dc08733..92c05f44d677c 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -7,9 +7,8 @@ from pandas.core.dtypes.cast import find_common_type from pandas.core.accessor import PandasDelegate, delegate_names - -from .array import SparseArray -from .dtype import SparseDtype +from pandas.core.arrays.sparse.array import SparseArray +from pandas.core.arrays.sparse.dtype import SparseDtype class BaseAccessor: @@ -163,7 +162,7 @@ def to_dense(self): Examples -------- - >>> series = pd.Series(pd.SparseArray([0, 1, 0])) + >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0])) >>> series 0 0 1 1 @@ -216,7 +215,7 @@ def from_spmatrix(cls, data, index=None, columns=None): ------- DataFrame Each column of the DataFrame is stored as a - :class:`SparseArray`. + :class:`arrays.SparseArray`. Examples -------- @@ -251,7 +250,7 @@ def to_dense(self): Examples -------- - >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])}) + >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])}) >>> df.sparse.to_dense() A 0 0 @@ -323,13 +322,7 @@ def _prep_index(data, index, columns): columns = ibase.default_index(K) if len(columns) != K: - raise ValueError( - "Column length mismatch: {columns} vs. {K}".format( - columns=len(columns), K=K - ) - ) + raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}") if len(index) != N: - raise ValueError( - "Index length mismatch: {index} vs. {N}".format(index=len(index), N=N) - ) + raise ValueError(f"Index length mismatch: {len(index)} vs. {N}") return index, columns diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index e1691de234335..e2562a375515d 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -34,26 +34,21 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCSeries, - ABCSparseArray, -) +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries, ABCSparseArray from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import sanitize_array from pandas.core.missing import interpolate_2d import pandas.core.ops as ops +from pandas.core.ops.common import unpack_zerodim_and_defer import pandas.io.formats.printing as printing -from .dtype import SparseDtype - # ---------------------------------------------------------------------------- # Array @@ -147,13 +142,13 @@ def _sparse_array_op( name = name[1:] if name in ("and", "or") and dtype == "bool": - opname = "sparse_{name}_uint8".format(name=name) + opname = f"sparse_{name}_uint8" # to make template simple, cast here left_sp_values = left.sp_values.view(np.uint8) right_sp_values = right.sp_values.view(np.uint8) result_dtype = np.bool else: - opname = "sparse_{name}_{dtype}".format(name=name, dtype=dtype) + opname = f"sparse_{name}_{dtype}" left_sp_values = left.sp_values right_sp_values = right.sp_values @@ -264,6 +259,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): _pandas_ftype = "sparse" _subtyp = "sparse_array" # register ABCSparseArray _deprecations = PandasObject._deprecations | frozenset(["get_values"]) + _sparse_index: 
SparseIndex def __init__( self, @@ -367,8 +363,8 @@ def __init__( sparse_values = np.asarray(data, dtype=dtype) if len(sparse_values) != sparse_index.npoints: raise AssertionError( - "Non array-like type {type} must " - "have the same length as the index".format(type=type(sparse_values)) + f"Non array-like type {type(sparse_values)} must " + "have the same length as the index" ) self._sparse_index = sparse_index self._sparse_values = sparse_values @@ -377,7 +373,7 @@ def __init__( @classmethod def _simple_new( cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype - ) -> ABCSparseArray: + ) -> "SparseArray": new = cls([]) new._sparse_index = sparse_index new._sparse_values = sparse_array @@ -406,7 +402,7 @@ def from_spmatrix(cls, data): -------- >>> import scipy.sparse >>> mat = scipy.sparse.coo_matrix((4, 1)) - >>> pd.SparseArray.from_spmatrix(mat) + >>> pd.arrays.SparseArray.from_spmatrix(mat) [0.0, 0.0, 0.0, 0.0] Fill: 0.0 IntIndex @@ -415,7 +411,7 @@ def from_spmatrix(cls, data): length, ncol = data.shape if ncol != 1: - raise ValueError("'data' must have a single column, not '{}'".format(ncol)) + raise ValueError(f"'data' must have a single column, not '{ncol}'") # our sparse index classes require that the positions be strictly # increasing. So we need to sort loc, and arr accordingly. @@ -431,7 +427,7 @@ def from_spmatrix(cls, data): return cls._simple_new(arr, index, dtype) - def __array__(self, dtype=None, copy=True): + def __array__(self, dtype=None, copy=True) -> np.ndarray: fill_value = self.fill_value if self.sp_index.ngaps == 0: @@ -571,23 +567,6 @@ def npoints(self) -> int: """ return self.sp_index.npoints - @property - def values(self): - """ - Dense values - - .. deprecated:: 0.25.0 - - Use ``np.asarray(...)`` or the ``.to_dense()`` method instead. - """ - msg = ( - "The SparseArray.values attribute is deprecated and will be " - "removed in a future version. You can use `np.asarray(...)` or " - "the `.to_dense()` method instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - return self.to_dense() - def isna(self): # If null fill value, we want SparseDtype[bool, true] # to preserve the same memory usage. @@ -710,11 +689,11 @@ def factorize(self, na_sentinel=-1): # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] - # Given that we have to return a dense array of labels, why bother + # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? 
- labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) uniques = SparseArray(uniques, dtype=self.dtype) - return labels, uniques + return codes, uniques def value_counts(self, dropna=True): """ @@ -758,6 +737,9 @@ def value_counts(self, dropna=True): # -------- def __getitem__(self, key): + # avoid mypy issues when importing at the top-level + from pandas.core.indexing import check_bool_indexer + if isinstance(key, tuple): if len(key) > 1: raise IndexError("too many indices for array.") @@ -786,12 +768,14 @@ def __getitem__(self, key): else: key = np.asarray(key) - if com.is_bool_indexer(key) and len(self) == len(key): + if com.is_bool_indexer(key): + key = check_bool_indexer(self, key) + return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, "__len__"): return self.take(key) else: - raise ValueError("Cannot slice with '{}'".format(key)) + raise ValueError(f"Cannot slice with '{key}'") return type(self)(data_slice, kind=self.kind) @@ -811,9 +795,7 @@ def _get_val_at(self, loc): def take(self, indices, allow_fill=False, fill_value=None): if is_scalar(indices): - raise ValueError( - "'indices' must be an array, not a scalar '{}'.".format(indices) - ) + raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.") indices = np.asarray(indices, dtype=np.int32) if indices.size == 0: @@ -952,8 +934,8 @@ def _concat_same_type(cls, to_concat): if not (len(set(fill_values)) == 1 or isna(fill_values).all()): warnings.warn( "Concatenating sparse arrays with multiple fill " - "values: '{}'. Picking the first and " - "converting the rest.".format(fill_values), + f"values: '{fill_values}'. Picking the first and " + "converting the rest.", PerformanceWarning, stacklevel=6, ) @@ -1096,7 +1078,7 @@ def map(self, mapper): Examples -------- - >>> arr = pd.SparseArray([0, 1, 2]) + >>> arr = pd.arrays.SparseArray([0, 1, 2]) >>> arr.apply(lambda x: x + 10) [10, 11, 12] Fill: 10 @@ -1140,22 +1122,6 @@ def to_dense(self): """ return np.asarray(self, dtype=self.sp_values.dtype) - def get_values(self): - """ - Convert SparseArray to a NumPy array. - - .. deprecated:: 0.25.0 - Use `to_dense` instead. - - """ - warnings.warn( - "The 'get_values' method is deprecated and will be removed in a " - "future version. Use the 'to_dense' method instead.", - FutureWarning, - stacklevel=2, - ) - return self._internal_get_values() - _internal_get_values = to_dense # ------------------------------------------------------------------------ @@ -1189,11 +1155,7 @@ def _reduce(self, name, skipna=True, **kwargs): method = getattr(self, name, None) if method is None: - raise TypeError( - "cannot perform {name} with type {dtype}".format( - name=name, dtype=self.dtype - ) - ) + raise TypeError(f"cannot perform {name} with type {self.dtype}") if skipna: arr = self @@ -1289,7 +1251,7 @@ def cumsum(self, axis=0, *args, **kwargs): nv.validate_cumsum(args, kwargs) if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour. 
- raise ValueError("axis(={axis}) out of bounds".format(axis=axis)) + raise ValueError(f"axis(={axis}) out of bounds") if not self._null_fill_value: return SparseArray(self.to_dense()).cumsum() @@ -1396,26 +1358,22 @@ def __abs__(self): # ------------------------------------------------------------------------ @classmethod - def _create_unary_method(cls, op): - def sparse_unary_method(self): + def _create_unary_method(cls, op) -> Callable[["SparseArray"], "SparseArray"]: + def sparse_unary_method(self) -> "SparseArray": fill_value = op(np.array(self.fill_value)).item() values = op(self.sp_values) dtype = SparseDtype(values.dtype, fill_value) return cls._simple_new(values, self.sp_index, dtype) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return compat.set_function_name(sparse_unary_method, name, cls) @classmethod def _create_arithmetic_method(cls, op): op_name = op.__name__ + @unpack_zerodim_and_defer(op_name) def sparse_arithmetic_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to dispatch to us. - return NotImplemented - - other = lib.item_from_zerodim(other) if isinstance(other, SparseArray): return _sparse_array_op(self, other, op, op_name) @@ -1441,11 +1399,7 @@ def sparse_arithmetic_method(self, other): # TODO: look into _wrap_result if len(self) != len(other): raise AssertionError( - ( - "length mismatch: {self} vs. {other}".format( - self=len(self), other=len(other) - ) - ) + (f"length mismatch: {len(self)} vs. {len(other)}") ) if not isinstance(other, SparseArray): dtype = getattr(other, "dtype", None) @@ -1454,7 +1408,7 @@ def sparse_arithmetic_method(self, other): ) return _sparse_array_op(self, other, op, op_name) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return compat.set_function_name(sparse_arithmetic_method, name, cls) @classmethod @@ -1463,12 +1417,9 @@ def _create_comparison_method(cls, op): if op_name in {"and_", "or_"}: op_name = op_name[:-1] + @unpack_zerodim_and_defer(op_name) def cmp_method(self, other): - if isinstance(other, (ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. - return NotImplemented - if not is_scalar(other) and not isinstance(other, type(self)): # convert list-like to ndarray other = np.asarray(other) @@ -1477,9 +1428,7 @@ def cmp_method(self, other): # TODO: make this more flexible than just ndarray... if len(self) != len(other): raise AssertionError( - "length mismatch: {self} vs. {other}".format( - self=len(self), other=len(other) - ) + f"length mismatch: {len(self)} vs. {len(other)}" ) other = SparseArray(other, fill_value=self.fill_value) @@ -1497,7 +1446,7 @@ def cmp_method(self, other): dtype=np.bool_, ) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return compat.set_function_name(cmp_method, name, cls) @classmethod @@ -1515,12 +1464,11 @@ def _add_comparison_ops(cls): # ---------- # Formatting # ----------- - def __repr__(self): - return "{self}\nFill: {fill}\n{index}".format( - self=printing.pprint_thing(self), - fill=printing.pprint_thing(self.fill_value), - index=printing.pprint_thing(self.sp_index), - ) + def __repr__(self) -> str: + pp_str = printing.pprint_thing(self) + pp_fill = printing.pprint_thing(self.fill_value) + pp_index = printing.pprint_thing(self.sp_index) + return f"{pp_str}\nFill: {pp_fill}\n{pp_index}" def _formatter(self, boxed=False): # Defer to the formatter from the GenericArrayFormatter calling us. 
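(Editor's aside, not part of the patch: the docstring updates above switch examples to the pd.arrays.SparseArray spelling; a minimal hedged round trip, with assumed values, looks like this.)

# Editor's sketch only; values are illustrative assumptions.
import pandas as pd

sparse = pd.arrays.SparseArray([0, 0, 1, 2])
sparse.to_dense()      # array([0, 0, 1, 2])
sparse.sp_values       # only non-fill values are stored: array([1, 2])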
diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index 6fd73ae14fff1..6f15681cab87e 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -1,10 +1,12 @@ """Sparse Dtype""" import re -from typing import Any +from typing import Any, Tuple import numpy as np +from pandas._typing import Dtype + from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -17,8 +19,6 @@ from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna, na_value_for_dtype -from pandas._typing import Dtype - @register_extension_dtype class SparseDtype(ExtensionDtype): @@ -64,7 +64,7 @@ class SparseDtype(ExtensionDtype): # hash(nan) is (sometimes?) 0. _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") - def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None): if isinstance(dtype, type(self)): if fill_value is None: @@ -79,9 +79,7 @@ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: fill_value = na_value_for_dtype(dtype) if not is_scalar(fill_value): - raise ValueError( - "fill_value must be a scalar. Got {} instead".format(fill_value) - ) + raise ValueError(f"fill_value must be a scalar. Got {fill_value} instead") self._dtype = dtype self._fill_value = fill_value @@ -90,7 +88,7 @@ def __hash__(self): # __eq__, so we explicitly do it here. return super().__hash__() - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: # We have to override __eq__ to handle NA values in _metadata. # The base class does simple == checks, which fail for NA. if isinstance(other, str): @@ -163,14 +161,21 @@ def subtype(self): @property def name(self): - return "Sparse[{}, {}]".format(self.subtype.name, self.fill_value) + return f"Sparse[{self.subtype.name}, {self.fill_value}]" - def __repr__(self): + def __repr__(self) -> str: return self.name @classmethod def construct_array_type(cls): - from .array import SparseArray + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + from pandas.core.arrays.sparse.array import SparseArray return SparseArray @@ -201,7 +206,7 @@ def construct_from_string(cls, string): ------- SparseDtype """ - msg = "Could not construct SparseDtype from '{}'".format(string) + msg = f"Cannot construct a 'SparseDtype' from '{string}'" if string.startswith("Sparse"): try: sub_type, has_fill_value = cls._parse_subtype(string) @@ -210,20 +215,20 @@ def construct_from_string(cls, string): else: result = SparseDtype(sub_type) msg = ( - "Could not construct SparseDtype from '{}'.\n\nIt " + f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt " "looks like the fill_value in the string is not " "the default for the dtype. Non-default fill_values " "are not supported. Use the 'SparseDtype()' " "constructor instead." 
) if has_fill_value and str(result) != string: - raise TypeError(msg.format(string)) + raise TypeError(msg) return result else: raise TypeError(msg) @staticmethod - def _parse_subtype(dtype): + def _parse_subtype(dtype: str) -> Tuple[str, bool]: """ Parse a string to get the subtype @@ -249,11 +254,11 @@ def _parse_subtype(dtype): has_fill_value = False if m: subtype = m.groupdict()["subtype"] - has_fill_value = m.groupdict()["fill_value"] or has_fill_value + has_fill_value = bool(m.groupdict()["fill_value"]) elif dtype == "Sparse": subtype = "float64" else: - raise ValueError("Cannot parse {}".format(dtype)) + raise ValueError(f"Cannot parse {dtype}") return subtype, has_fill_value @classmethod @@ -285,7 +290,7 @@ def update_dtype(self, dtype): Returns ------- SparseDtype - A new SparseDtype with the corret `dtype` and fill value + A new SparseDtype with the correct `dtype` and fill value for that `dtype`. Raises diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 11c27451a5801..88d63071c360f 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -3,9 +3,7 @@ Currently only includes to_coo helpers. """ -from collections import OrderedDict - -from pandas.core.index import Index, MultiIndex +from pandas.core.indexes.api import Index, MultiIndex from pandas.core.series import Series @@ -46,14 +44,13 @@ def get_indexers(levels): # labels_to_i[:] = np.arange(labels_to_i.shape[0]) def _get_label_to_i_dict(labels, sort_labels=False): - """ Return OrderedDict of unique labels to number. + """ Return dict of unique labels to number. Optionally sort by label. """ labels = Index(map(tuple, labels)).unique().tolist() # squish if sort_labels: - labels = sorted(list(labels)) - d = OrderedDict((k, i) for i, k in enumerate(labels)) - return d + labels = sorted(labels) + return {k: i for i, k in enumerate(labels)} def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): ilabels = list(zip(*[index._get_level_values(i) for i in subset])) @@ -137,7 +134,7 @@ def _coo_to_sparse_series(A, dense_index: bool = False): try: s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) except AttributeError: - raise TypeError("Expected coo_matrix. Got {} instead.".format(type(A).__name__)) + raise TypeError(f"Expected coo_matrix. Got {type(A).__name__} instead.") s = s.sort_index() s = s.astype(SparseDtype(s.dtype)) if dense_index: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7c487b227de20..84130132de4dc 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,9 +1,9 @@ import operator -from typing import TYPE_CHECKING, Type +from typing import Type import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import pandas_dtype @@ -17,9 +17,6 @@ from pandas.core.construction import extract_array from pandas.core.missing import isna -if TYPE_CHECKING: - from pandas._typing import Scalar - @register_extension_dtype class StringDtype(ExtensionDtype): @@ -50,34 +47,15 @@ class StringDtype(ExtensionDtype): StringDtype """ - @property - def na_value(self) -> "Scalar": - """ - StringDtype uses :attr:`numpy.nan` as the missing NA value. + name = "string" - .. warning:: - - `na_value` may change in a future release. 
- """ - return np.nan + #: StringDtype.na_value uses pandas.NA + na_value = libmissing.NA @property def type(self) -> Type: return str - @property - def name(self) -> str: - """ - The alias for StringDtype is ``'string'``. - """ - return "string" - - @classmethod - def construct_from_string(cls, string: str) -> ExtensionDtype: - if string == "string": - return cls() - return super().construct_from_string(string) - @classmethod def construct_array_type(cls) -> "Type[StringArray]": return StringArray @@ -85,6 +63,24 @@ def construct_array_type(cls) -> "Type[StringArray]": def __repr__(self) -> str: return "StringDtype" + def __from_arrow__(self, array): + """Construct StringArray from passed pyarrow Array/ChunkedArray""" + import pyarrow + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + # using _from_sequence to ensure None is converted to NA + str_arr = StringArray._from_sequence(np.array(arr)) + results.append(str_arr) + + return StringArray._concat_same_type(results) + class StringArray(PandasArray): """ @@ -127,11 +123,15 @@ class StringArray(PandasArray): The string methods are available on Series backed by a StringArray. + Notes + ----- + StringArray returns a BooleanArray for comparison methods. + Examples -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - ['This is', 'some text', nan, 'data.'] + ['This is', 'some text', , 'data.'] Length: 4, dtype: string Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string @@ -141,6 +141,13 @@ class StringArray(PandasArray): Traceback (most recent call last): ... ValueError: StringArray requires an object-dtype ndarray of strings. + + For comparison methods, this returns a :class:`pandas.BooleanArray` + + >>> pd.array(["a", None, "c"], dtype="string") == "a" + + [True, , False] + Length: 3, dtype: boolean """ # undo the PandasArray hack @@ -164,7 +171,7 @@ def _validate(self): if self._ndarray.dtype != "object": raise ValueError( "StringArray requires a sequence of strings. Got " - "'{}' dtype instead.".format(self._ndarray.dtype) + f"'{self._ndarray.dtype}' dtype instead." ) @classmethod @@ -172,10 +179,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" result = super()._from_sequence(scalars, dtype=object, copy=copy) - # convert None to np.nan + # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. 
- result[result.isna()] = np.nan + result[result.isna()] = StringDtype.na_value return result @classmethod @@ -190,7 +197,16 @@ def __arrow_array__(self, type=None): if type is None: type = pa.string() - return pa.array(self._ndarray, type=type, from_pandas=True) + + values = self._ndarray.copy() + values[self.isna()] = None + return pa.array(values, type=type, from_pandas=True) + + def _values_for_factorize(self): + arr = self._ndarray.copy() + mask = self.isna() + arr[mask] = -1 + return arr, -1 def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) @@ -205,11 +221,11 @@ def __setitem__(self, key, value): # validate new items if scalar_value: - if scalar_value is None: - value = np.nan - elif not (isinstance(value, str) or np.isnan(value)): + if isna(value): + value = StringDtype.na_value + elif not isinstance(value, str): raise ValueError( - "Cannot set non-string value '{}' into a StringArray.".format(value) + f"Cannot set non-string value '{value}' into a StringArray." ) else: if not is_array_like(value): @@ -232,17 +248,22 @@ def astype(self, dtype, copy=True): return super().astype(dtype, copy) def _reduce(self, name, skipna=True, **kwargs): - raise TypeError("Cannot perform reduction '{}' with string dtype".format(name)) + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") def value_counts(self, dropna=False): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna) + return value_counts(self._ndarray, dropna=dropna).astype("Int64") # Overrride parent because we have different return types. @classmethod def _create_arithmetic_method(cls, op): + # Note: this handles both arithmetic and comparison methods. def method(self, other): + from pandas.arrays import BooleanArray + + assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS + if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): return NotImplemented @@ -256,25 +277,24 @@ def method(self, other): if len(other) != len(self): # prevent improper broadcasting when other is 2D raise ValueError( - "Lengths of operands do not match: {} != {}".format( - len(self), len(other) - ) + f"Lengths of operands do not match: {len(self)} != {len(other)}" ) other = np.asarray(other) other = other[valid] - result = np.empty_like(self._ndarray, dtype="object") - result[mask] = np.nan - result[valid] = op(self._ndarray[valid], other) - - if op.__name__ in {"add", "radd", "mul", "rmul"}: + if op.__name__ in ops.ARITHMETIC_BINOPS: + result = np.empty_like(self._ndarray, dtype="object") + result[mask] = StringDtype.na_value + result[valid] = op(self._ndarray[valid], other) return StringArray(result) else: - dtype = "object" if mask.any() else "bool" - return np.asarray(result, dtype=dtype) + # logical + result = np.zeros(len(self._ndarray), dtype="bool") + result[valid] = op(self._ndarray[valid], other) + return BooleanArray(result, mask) - return compat.set_function_name(method, "__{}__".format(op.__name__), cls) + return compat.set_function_name(method, f"__{op.__name__}__", cls) @classmethod def _add_arithmetic_ops(cls): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 21e07b5101a64..c34d14f15075c 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1,7 +1,5 @@ from datetime import timedelta -import textwrap from typing import List -import warnings import numpy as np @@ -13,19 +11,14 @@ parse_timedelta_unit, precision_from_unit, ) -import pandas.compat as compat from 
pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( _NS_DTYPE, _TD_DTYPE, - ensure_int64, - is_datetime64_dtype, is_dtype_equal, is_float_dtype, is_integer_dtype, - is_list_like, is_object_dtype, is_scalar, is_string_dtype, @@ -44,14 +37,12 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr +from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick -from . import datetimelike as dtl - _BAD_DTYPE = "dtype {dtype} cannot be converted to timedelta64[ns]" @@ -71,60 +62,10 @@ def f(self): return result f.__name__ = name - f.__doc__ = "\n{}\n".format(docstring) + f.__doc__ = f"\n{docstring}\n" return property(f) -def _td_array_cmp(cls, op): - """ - Wrap comparison operations to convert timedelta-like to timedelta64 - """ - opname = "__{name}__".format(name=op.__name__) - nat_result = opname == "__ne__" - - def wrapper(self, other): - other = lib.item_from_zerodim(other) - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - return NotImplemented - - if _is_convertible_to_td(other) or other is NaT: - try: - other = Timedelta(other) - except ValueError: - # failed to parse as timedelta - return invalid_comparison(self, other, op) - - result = op(self.view("i8"), other.value) - if isna(other): - result.fill(nat_result) - - elif not is_list_like(other): - return invalid_comparison(self, other, op) - - elif len(other) != len(self): - raise ValueError("Lengths must match") - - else: - try: - other = type(self)._from_sequence(other)._data - except (ValueError, TypeError): - return invalid_comparison(self, other, op) - - result = op(self.view("i8"), other.view("i8")) - result = com.values_from_object(result) - - o_mask = np.array(isna(other)) - if o_mask.any(): - result[o_mask] = nat_result - - if self._hasnans: - result[self._isnan] = nat_result - - return result - - return compat.set_function_name(wrapper, opname, cls) - - class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): """ Pandas ExtensionArray for timedelta data. @@ -160,10 +101,13 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _typ = "timedeltaarray" _scalar_type = Timedelta + _recognized_scalars = (timedelta, np.timedelta64, Tick) + _is_recognized_dtype = is_timedelta64_dtype + __array_priority__ = 1000 # define my properties & methods for delegation - _other_ops = [] # type: List[str] - _bool_ops = [] # type: List[str] + _other_ops: List[str] = [] + _bool_ops: List[str] = [] _object_ops = ["freq"] _field_ops = ["days", "seconds", "microseconds", "nanoseconds"] _datetimelike_ops = _field_ops + _object_ops + _bool_ops @@ -218,11 +162,11 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): if not isinstance(values, np.ndarray): msg = ( - "Unexpected type '{}'. 'values' must be a TimedeltaArray " - "ndarray, or Series or Index containing one of those." + f"Unexpected type '{type(values).__name__}'. 'values' must be a " + "TimedeltaArray ndarray, or Series or Index containing one of those." 
) - raise ValueError(msg.format(type(values).__name__)) - if values.ndim != 1: + raise ValueError(msg) + if values.ndim not in [1, 2]: raise ValueError("Only 1-dimensional input arrays are supported.") if values.dtype == "i8": @@ -337,7 +281,7 @@ def _unbox_scalar(self, value): def _scalar_from_string(self, value): return Timedelta(value) - def _check_compatible_with(self, other): + def _check_compatible_with(self, other, setitem: bool = False): # we don't have anything to validate. pass @@ -347,19 +291,6 @@ def _maybe_clear_freq(self): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) - def _validate_fill_value(self, fill_value): - if isna(fill_value): - fill_value = iNaT - elif isinstance(fill_value, (timedelta, np.timedelta64, Tick)): - fill_value = Timedelta(fill_value).value - else: - raise ValueError( - "'fill_value' should be a Timedelta. " - "Got '{got}'.".format(got=fill_value) - ) - return fill_value - def astype(self, dtype, copy=True): # We handle # --> timedelta64[ns] @@ -386,6 +317,9 @@ def astype(self, dtype, copy=True): return self return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) + # ---------------------------------------------------------------- + # Reductions + def sum( self, axis=None, @@ -450,7 +384,7 @@ def _formatter(self, boxed=False): return _get_format_timedelta64(self, box=True) - def _format_native_types(self, na_rep="NaT", date_format=None): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import _get_format_timedelta64 formatter = _get_format_timedelta64(self._data, na_rep) @@ -459,14 +393,10 @@ def _format_native_types(self, na_rep="NaT", date_format=None): # ---------------------------------------------------------------- # Arithmetic Methods - _create_comparison_method = classmethod(_td_array_cmp) - def _add_offset(self, other): assert not isinstance(other, Tick) raise TypeError( - "cannot add the type {typ} to a {cls}".format( - typ=type(other).__name__, cls=type(self).__name__ - ) + f"cannot add the type {type(other).__name__} to a {type(self).__name__}" ) def _add_delta(self, delta): @@ -517,18 +447,16 @@ def _add_datetimelike_scalar(self, other): dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE return DatetimeArray(result, dtype=dtype, freq=self.freq) - def _addsub_offset_array(self, other, op): - # Add or subtract Array-like of DateOffset objects + def _addsub_object_array(self, other, op): + # Add or subtract Array-like of objects try: # TimedeltaIndex can only operate with a subset of DateOffset # subclasses. 
Incompatible classes will raise AttributeError, # which we re-raise as TypeError - return super()._addsub_offset_array(other, op) + return super()._addsub_object_array(other, op) except AttributeError: raise TypeError( - "Cannot add/subtract non-tick DateOffset to {cls}".format( - cls=type(self).__name__ - ) + f"Cannot add/subtract non-tick DateOffset to {type(self).__name__}" ) def __mul__(self, other): @@ -637,9 +565,7 @@ def __rtruediv__(self, other): elif lib.is_scalar(other): raise TypeError( - "Cannot divide {typ} by {cls}".format( - typ=type(other).__name__, cls=type(self).__name__ - ) + f"Cannot divide {type(other).__name__} by {type(self).__name__}" ) if not hasattr(other, "dtype"): @@ -662,9 +588,7 @@ def __rtruediv__(self, other): else: raise TypeError( - "Cannot divide {dtype} data by {cls}".format( - dtype=other.dtype, cls=type(self).__name__ - ) + f"Cannot divide {other.dtype} data by {type(self).__name__}" ) def __floordiv__(self, other): @@ -727,11 +651,7 @@ def __floordiv__(self, other): else: dtype = getattr(other, "dtype", type(other).__name__) - raise TypeError( - "Cannot divide {typ} by {cls}".format( - typ=dtype, cls=type(self).__name__ - ) - ) + raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}") def __rfloordiv__(self, other): if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): @@ -752,9 +672,7 @@ def __rfloordiv__(self, other): return result raise TypeError( - "Cannot divide {typ} by {cls}".format( - typ=type(other).__name__, cls=type(self).__name__ - ) + f"Cannot divide {type(other).__name__} by {type(self).__name__}" ) if not hasattr(other, "dtype"): @@ -782,11 +700,7 @@ def __rfloordiv__(self, other): else: dtype = getattr(other, "dtype", type(other).__name__) - raise TypeError( - "Cannot divide {typ} by {cls}".format( - typ=dtype, cls=type(self).__name__ - ) - ) + raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}") def __mod__(self, other): # Note: This is a naive implementation, can likely be optimized @@ -974,9 +888,6 @@ def f(x): return result -TimedeltaArray._add_comparison_ops() - - # --------------------------------------------------------------------- # Constructor Helpers @@ -1057,27 +968,11 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): data = data.astype(_TD_DTYPE) copy = False - elif is_datetime64_dtype(data): - # GH#23539 - warnings.warn( - "Passing datetime64-dtype data to TimedeltaIndex is " - "deprecated, will raise a TypeError in a future " - "version", - FutureWarning, - stacklevel=4, - ) - data = ensure_int64(data).view(_TD_DTYPE) - else: - raise TypeError( - "dtype {dtype} cannot be converted to timedelta64[ns]".format( - dtype=data.dtype - ) - ) + # This includes datetime64-dtype, see GH#23539, GH#29794 + raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]") data = np.array(data, copy=copy) - if data.ndim != 1: - raise ValueError("Only 1-dimensional input arrays are supported.") assert data.dtype == "m8[ns]", data return data, inferred_freq @@ -1109,7 +1004,7 @@ def ints_to_td64ns(data, unit="ns"): copy_made = True if unit != "ns": - dtype_str = "timedelta64[{unit}]".format(unit=unit) + dtype_str = f"timedelta64[{unit}]" data = data.view(dtype_str) # TODO: watch out for overflows when converting from lower-resolution @@ -1161,14 +1056,12 @@ def objects_to_td64ns(data, unit="ns", errors="raise"): def _validate_td64_dtype(dtype): dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, np.dtype("timedelta64")): - dtype = _TD_DTYPE - msg = textwrap.dedent( - 
"""\ - Passing in 'timedelta' dtype with no precision is deprecated - and will raise in a future version. Please pass in - 'timedelta64[ns]' instead.""" + # no precision disallowed GH#24806 + msg = ( + "Passing in 'timedelta' dtype with no precision is not allowed. " + "Please pass in 'timedelta64[ns]' instead." ) - warnings.warn(msg, FutureWarning, stacklevel=4) + raise ValueError(msg) if not is_dtype_equal(dtype, _TD_DTYPE): raise ValueError(_BAD_DTYPE.format(dtype=dtype)) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9586d49c555ff..66d7cd59dcfa4 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -2,14 +2,13 @@ Base and utility classes for pandas objects. """ import builtins -from collections import OrderedDict import textwrap -from typing import Dict, FrozenSet, Optional -import warnings +from typing import Dict, FrozenSet, List, Optional import numpy as np import pandas._libs.lib as lib +from pandas._typing import T from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -20,14 +19,13 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, - is_datetimelike, + is_dict_like, is_extension_array_dtype, - is_extension_type, is_list_like, is_object_dtype, is_scalar, is_timedelta64_ns_dtype, + needs_i8_conversion, ) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -36,9 +34,10 @@ from pandas.core.accessor import DirNamesMixin from pandas.core.algorithms import duplicated, unique1d, value_counts from pandas.core.arrays import ExtensionArray +from pandas.core.construction import create_series_with_explicit_dtype import pandas.core.nanops as nanops -_shared_docs = dict() # type: Dict[str, str] +_shared_docs: Dict[str, str] = dict() _indexops_doc_kwargs = dict( klass="IndexOpsMixin", inplace="", @@ -53,9 +52,9 @@ class PandasObject(DirNamesMixin): @property def _constructor(self): """class constructor (for this class it's just `__class__`""" - return self.__class__ + return type(self) - def __repr__(self): + def __repr__(self) -> str: """ Return a string representation for a particular object. """ @@ -88,13 +87,21 @@ def __sizeof__(self): # object's 'sizeof' return super().__sizeof__() + def _ensure_type(self: T, obj) -> T: + """Ensure that an object has same type as self. + + Used by type checkers. + """ + assert isinstance(obj, type(self)), type(obj) + return obj + class NoNewAttributesMixin: """Mixin which prevents adding new attributes. Prevents additional attributes via xxx.attribute = "something" after a call to `self.__freeze()`. Mainly used to prevent the user from using - wrong attributes on a accessor (`Series.cat/.str/.dt`). + wrong attributes on an accessor (`Series.cat/.str/.dt`). If you really want to add a new attribute at a later time, you need to use `object.__setattr__(self, key, value)`. 
@@ -116,9 +123,7 @@ def __setattr__(self, key, value): or key in type(self).__dict__ or getattr(self, key, None) is not None ): - raise AttributeError( - "You cannot add any new attribute '{key}'".format(key=key) - ) + raise AttributeError(f"You cannot add any new attribute '{key}'") object.__setattr__(self, key, value) @@ -144,39 +149,35 @@ class SelectionMixin: _internal_names = ["_cache", "__setstate__"] _internal_names_set = set(_internal_names) - _builtin_table = OrderedDict( - ((builtins.sum, np.sum), (builtins.max, np.max), (builtins.min, np.min)) - ) - - _cython_table = OrderedDict( - ( - (builtins.sum, "sum"), - (builtins.max, "max"), - (builtins.min, "min"), - (np.all, "all"), - (np.any, "any"), - (np.sum, "sum"), - (np.nansum, "sum"), - (np.mean, "mean"), - (np.nanmean, "mean"), - (np.prod, "prod"), - (np.nanprod, "prod"), - (np.std, "std"), - (np.nanstd, "std"), - (np.var, "var"), - (np.nanvar, "var"), - (np.median, "median"), - (np.nanmedian, "median"), - (np.max, "max"), - (np.nanmax, "max"), - (np.min, "min"), - (np.nanmin, "min"), - (np.cumprod, "cumprod"), - (np.nancumprod, "cumprod"), - (np.cumsum, "cumsum"), - (np.nancumsum, "cumsum"), - ) - ) + _builtin_table = {builtins.sum: np.sum, builtins.max: np.max, builtins.min: np.min} + + _cython_table = { + builtins.sum: "sum", + builtins.max: "max", + builtins.min: "min", + np.all: "all", + np.any: "any", + np.sum: "sum", + np.nansum: "sum", + np.mean: "mean", + np.nanmean: "mean", + np.prod: "prod", + np.nanprod: "prod", + np.std: "std", + np.nanstd: "std", + np.var: "var", + np.nanvar: "var", + np.median: "median", + np.nanmedian: "median", + np.max: "max", + np.nanmax: "max", + np.min: "min", + np.nanmin: "min", + np.cumprod: "cumprod", + np.nancumprod: "cumprod", + np.cumsum: "cumsum", + np.nancumsum: "cumsum", + } @property def _selection_name(self): @@ -207,7 +208,7 @@ def _selected_obj(self): return self.obj[self._selection] @cache_readonly - def ndim(self): + def ndim(self) -> int: return self._selected_obj.ndim @cache_readonly @@ -222,28 +223,22 @@ def _obj_with_exclusions(self): def __getitem__(self, key): if self._selection is not None: - raise IndexError( - "Column(s) {selection} already selected".format( - selection=self._selection - ) - ) + raise IndexError(f"Column(s) {self._selection} already selected") if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(key): bad_keys = list(set(key).difference(self.obj.columns)) - raise KeyError( - "Columns not found: {missing}".format(missing=str(bad_keys)[1:-1]) - ) + raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") return self._gotitem(list(key), ndim=2) elif not getattr(self, "as_index", False): if key not in self.obj.columns: - raise KeyError("Column not found: {key}".format(key=key)) + raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=2) else: if key not in self.obj: - raise KeyError("Column not found: {key}".format(key=key)) + raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=1) def _gotitem(self, key, ndim, subset=None): @@ -285,9 +280,7 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs): # people may try to aggregate on a non-callable attribute # but don't let them think they can pass args to it assert len(args) == 0 - assert ( - len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0 - ) + assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0 return f f = getattr(np, arg, None) @@ 
-297,8 +290,7 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs): return f(self, *args, **kwargs) raise AttributeError( - "'{arg}' is not a valid function for " - "'{cls}' object".format(arg=arg, cls=type(self).__name__) + f"'{arg}' is not a valid function for '{type(self).__name__}' object" ) def _aggregate(self, arg, *args, **kwargs): @@ -321,44 +313,26 @@ def _aggregate(self, arg, *args, **kwargs): None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) - is_nested_renamer = False _axis = kwargs.pop("_axis", None) if _axis is None: _axis = getattr(self, "axis", 0) - _level = kwargs.pop("_level", None) if isinstance(arg, str): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): - # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") obj = self._selected_obj - def nested_renaming_depr(level=4): - # deprecation of nested renaming - # GH 15931 - msg = textwrap.dedent( - """\ - using a dict with renaming is deprecated and will be removed - in a future version. - - For column-specific groupby renaming, use named aggregation - - >>> df.groupby(...).agg(name=('column', aggfunc)) - """ - ) - warnings.warn(msg, FutureWarning, stacklevel=level) - # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in arg.values()): - new_arg = OrderedDict() + new_arg = {} for k, v in arg.items(): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] @@ -376,20 +350,11 @@ def nested_renaming_depr(level=4): # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): - is_nested_renamer = True - - if k not in obj.columns: - msg = ( - "cannot perform renaming for {key} with a " - "nested dictionary" - ).format(key=k) - raise SpecificationError(msg) - nested_renaming_depr(4 + (_level or 0)) - + raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCSeries): - nested_renaming_depr() + raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCDataFrame) and k not in obj.columns: - raise KeyError("Column '{col}' does not exist!".format(col=k)) + raise KeyError(f"Column '{k}' does not exist!") arg = new_arg @@ -400,7 +365,7 @@ def nested_renaming_depr(level=4): if isinstance(obj, ABCDataFrame) and len( obj.columns.intersection(keys) ) != len(keys): - nested_renaming_depr() + raise SpecificationError("nested renamer is not supported") from pandas.core.reshape.concat import concat @@ -413,47 +378,30 @@ def _agg_1dim(name, how, subset=None): raise SpecificationError( "nested dictionary is ambiguous in aggregation" ) - return colg.aggregate(how, _level=(_level or 0) + 1) + return colg.aggregate(how) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) - return colg.aggregate(how, _level=None) + return colg.aggregate(how) def _agg(arg, func): """ run the aggregations over the arg with func - return an OrderedDict + return a dict """ - result = OrderedDict() + result = {} for fname, agg_how in arg.items(): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(arg.keys()) - result = OrderedDict() - - # nested renamer - if is_nested_renamer: - result = list(_agg(arg, _agg_1dim).values()) - - if all(isinstance(r, dict) for r in result): - - result, results = OrderedDict(), result - for r in results: - result.update(r) - keys = 
list(result.keys()) - - else: + result = {} - if self._selection is not None: - keys = None - - # some selection on the object - elif self._selection is not None: + if self._selection is not None: sl = set(self._selection_list) @@ -488,11 +436,11 @@ def _agg(arg, func): # combine results - def is_any_series(): + def is_any_series() -> bool: # return a boolean if we have *any* nested series return any(isinstance(r, ABCSeries) for r in result.values()) - def is_any_frame(): + def is_any_frame() -> bool: # return a boolean if we have *any* nested series return any(isinstance(r, ABCDataFrame) for r in result.values()) @@ -537,7 +485,7 @@ def is_any_frame(): return result, True elif is_list_like(arg): # we require a list, but not an 'str' - return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None + return self._aggregate_multiple_funcs(arg, _axis=_axis), None else: result = None @@ -548,7 +496,7 @@ def is_any_frame(): # caller can react return result, True - def _aggregate_multiple_funcs(self, arg, _level, _axis): + def _aggregate_multiple_funcs(self, arg, _axis): from pandas.core.reshape.concat import concat if _axis != 0: @@ -569,7 +517,7 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): try: new_res = colg.aggregate(a) - except (TypeError, DataError): + except TypeError: pass else: results.append(new_res) @@ -586,9 +534,16 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): new_res = colg.aggregate(arg) except (TypeError, DataError): pass - except ValueError: + except ValueError as err: # cannot aggregate - continue + if "Must produce aggregated value" in str(err): + # raised directly in _aggregate_named + pass + elif "no results" in str(err): + # raised direcly in _aggregate_multiple_funcs + pass + else: + raise else: results.append(new_res) keys.append(col) @@ -611,21 +566,6 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): raise ValueError("cannot combine transform and aggregation operations") return result - def _shallow_copy(self, obj=None, obj_type=None, **kwargs): - """ - return a new object with the replacement attributes - """ - if obj is None: - obj = self._selected_obj.copy() - if obj_type is None: - obj_type = self._constructor - if isinstance(obj, obj_type): - obj = obj.obj - for attr in self._attributes: - if attr not in kwargs: - kwargs[attr] = getattr(self, attr) - return obj_type(obj, **kwargs) - def _get_cython_func(self, arg: str) -> Optional[str]: """ if we define an internal function for this argument, return it @@ -640,6 +580,24 @@ def _is_builtin_func(self, arg): return self._builtin_table.get(arg, arg) +class ShallowMixin: + _attributes: List[str] = [] + + def _shallow_copy(self, obj=None, **kwargs): + """ + return a new object with the replacement attributes + """ + if obj is None: + obj = self._selected_obj.copy() + + if isinstance(obj, self._constructor): + obj = obj.obj + for attr in self._attributes: + if attr not in kwargs: + kwargs[attr] = getattr(self, attr) + return self._constructor(obj, **kwargs) + + class IndexOpsMixin: """ Common ops mixin to support a unified interface / docs for Series / Index @@ -647,17 +605,9 @@ class IndexOpsMixin: # ndarray compatibility __array_priority__ = 1000 - _deprecations = frozenset( - [ - "tolist", # tolist is not deprecated, just suppressed in the __dir__ - "base", - "data", - "item", - "itemsize", - "flags", - "strides", - ] - ) # type: FrozenSet[str] + _deprecations: FrozenSet[str] = frozenset( + ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ + ) def 
transpose(self, *args, **kwargs): """ @@ -677,24 +627,6 @@ def transpose(self, *args, **kwargs): """, ) - @property - def _is_homogeneous_type(self): - """ - Whether the object has a single dtype. - - By definition, Series and Index are always considered homogeneous. - A MultiIndex may or may not be homogeneous, depending on the - dtypes of the levels. - - See Also - -------- - DataFrame._is_homogeneous_type : Whether all the columns in a - DataFrame have the same dtype. - MultiIndex._is_homogeneous_type : Whether all the levels of a - MultiIndex have the same dtype. - """ - return True - @property def shape(self): """ @@ -703,7 +635,7 @@ def shape(self): return self._values.shape @property - def ndim(self): + def ndim(self) -> int: """ Number of dimensions of the underlying data, by definition 1. """ @@ -713,49 +645,27 @@ def item(self): """ Return the first element of the underlying data as a python scalar. - .. deprecated:: 0.25.0 - Returns ------- scalar The first element of %(klass)s. - """ - warnings.warn( - "`item` has been deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - return self.values.item() - - @property - def data(self): - """ - Return the data pointer of the underlying data. - - .. deprecated:: 0.23.0 - """ - warnings.warn( - "{obj}.data is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, - stacklevel=2, - ) - return self.values.data - @property - def itemsize(self): + Raises + ------ + ValueError + If the data is not length-1. """ - Return the size of the dtype of the item of the underlying data. + if not ( + is_extension_array_dtype(self.dtype) or needs_i8_conversion(self.dtype) + ): + # numpy returns ints instead of datetime64/timedelta64 objects, + # which we need to wrap in Timestamp/Timedelta/Period regardless. + return self.values.item() - .. deprecated:: 0.23.0 - """ - warnings.warn( - "{obj}.itemsize is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, - stacklevel=2, - ) - return self._ndarray_values.itemsize + if len(self) == 1: + return next(iter(self)) + else: + raise ValueError("can only convert an array of size 1 to a Python scalar") @property def nbytes(self): @@ -764,21 +674,6 @@ def nbytes(self): """ return self._values.nbytes - @property - def strides(self): - """ - Return the strides of the underlying data. - - .. deprecated:: 0.23.0 - """ - warnings.warn( - "{obj}.strides is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, - stacklevel=2, - ) - return self._ndarray_values.strides - @property def size(self): """ @@ -786,36 +681,6 @@ def size(self): """ return len(self._values) - @property - def flags(self): - """ - Return the ndarray.flags for the underlying data. - - .. deprecated:: 0.23.0 - """ - warnings.warn( - "{obj}.flags is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, - stacklevel=2, - ) - return self.values.flags - - @property - def base(self): - """ - Return the base object if the memory of the underlying data is shared. - - .. 
deprecated:: 0.23.0 - """ - warnings.warn( - "{obj}.base is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, - stacklevel=2, - ) - return self.values.base - @property def array(self) -> ExtensionArray: """ @@ -850,6 +715,8 @@ def array(self) -> ExtensionArray: period PeriodArray interval IntervalArray IntegerNA IntegerArray + string StringArray + boolean BooleanArray datetime64[ns, tz] DatetimeArray ================== ============================= @@ -901,7 +768,7 @@ def array(self) -> ExtensionArray: return result - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): """ A NumPy ndarray representing the values in this Series or Index. @@ -916,6 +783,17 @@ def to_numpy(self, dtype=None, copy=False): another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + .. versionadded:: 1.0.0 + + **kwargs + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). + + .. versionadded:: 1.0.0 Returns ------- @@ -985,16 +863,21 @@ def to_numpy(self, dtype=None, copy=False): array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ - if is_datetime64tz_dtype(self.dtype) and dtype is None: - # note: this is going to change very soon. - # I have a WIP PR making this unnecessary, but it's - # a bit out of scope for the DatetimeArray PR. - dtype = "object" + if is_extension_array_dtype(self.dtype): + return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) + else: + if kwargs: + msg = "to_numpy() got an unexpected keyword argument '{}'".format( + list(kwargs.keys())[0] + ) + raise TypeError(msg) result = np.asarray(self._values, dtype=dtype) # TODO(GH-24345): Avoid potential double copy - if copy: + if copy or na_value is not lib.no_default: result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value return result @property @@ -1066,7 +949,7 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- axis : {None} - Dummy argument for consistency with Series + Dummy argument for consistency with Series. skipna : bool, default True Returns @@ -1089,7 +972,7 @@ def min(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- axis : {None} - Dummy argument for consistency with Series + Dummy argument for consistency with Series. skipna : bool, default True Returns @@ -1130,7 +1013,7 @@ def argmin(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- axis : {None} - Dummy argument for consistency with Series + Dummy argument for consistency with Series. skipna : bool, default True Returns @@ -1161,7 +1044,7 @@ def tolist(self): -------- numpy.ndarray.tolist """ - if is_datetimelike(self._values): + if self.dtype.kind in ["m", "M"]: return [com.maybe_box_datetimelike(x) for x in self._values] elif is_extension_array_dtype(self._values): return list(self._values) @@ -1183,7 +1066,7 @@ def __iter__(self): iterator """ # We are explicitly making element iterators. 
- if is_datetimelike(self._values): + if self.dtype.kind in ["m", "M"]: return map(com.maybe_box_datetimelike, self._values) elif is_extension_array_dtype(self._values): return iter(self._values) @@ -1204,9 +1087,7 @@ def _reduce( func = getattr(self, name, None) if func is None: raise TypeError( - "{klass} cannot perform the operation {op}".format( - klass=self.__class__.__name__, op=name - ) + f"{type(self).__name__} cannot perform the operation {name}" ) return func(skipna=skipna, **kwds) @@ -1235,8 +1116,8 @@ def _map_values(self, mapper, na_action=None): # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types - if isinstance(mapper, dict): - if hasattr(mapper, "__missing__"): + if is_dict_like(mapper): + if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper @@ -1246,9 +1127,14 @@ def _map_values(self, mapper, na_action=None): # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples - from pandas import Series - mapper = Series(mapper) + # The return value of mapping with an empty mapper is + # expected to be pd.Series(np.nan, ...). As np.nan is + # of dtype float64 the return value of this method should + # be float64 as well + mapper = create_series_with_explicit_dtype( + mapper, dtype_if_empty=np.float64 + ) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either @@ -1257,7 +1143,7 @@ def _map_values(self, mapper, na_action=None): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values return self._values.map(mapper) - if is_extension_type(self.dtype): + if is_extension_array_dtype(self.dtype): values = self._values else: values = self.values @@ -1268,7 +1154,8 @@ def _map_values(self, mapper, na_action=None): return new_values # we must convert to python types - if is_extension_type(self.dtype): + if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"): + # GH#23179 some EAs do not have `map` values = self._values if na_action is not None: raise NotImplementedError @@ -1458,7 +1345,7 @@ def is_monotonic(self): is_monotonic_increasing = is_monotonic @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ Return boolean if values in the object are monotonic_decreasing. @@ -1479,7 +1366,7 @@ def memory_usage(self, deep=False): ---------- deep : bool Introspect the data deeply, interrogate - `object` dtypes for system-level memory consumption + `object` dtypes for system-level memory consumption. Returns ------- @@ -1509,7 +1396,7 @@ def memory_usage(self, deep=False): sort=textwrap.dedent( """\ sort : bool, default False - Sort `uniques` and shuffle `labels` to maintain the + Sort `uniques` and shuffle `codes` to maintain the relationship. 
""" ), diff --git a/pandas/core/common.py b/pandas/core/common.py index 565f5076fdddb..f0fcb736586d6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -5,16 +5,16 @@ """ import collections -from collections import OrderedDict, abc +from collections import abc from datetime import datetime, timedelta from functools import partial import inspect -from typing import Any, Iterable, Union +from typing import Any, Collection, Iterable, Union import numpy as np from pandas._libs import lib, tslibs -from pandas.compat import PY36 +from pandas._typing import T from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -111,14 +111,20 @@ def is_bool_indexer(key: Any) -> bool: Returns ------- bool + Whether `key` is a valid boolean indexer. Raises ------ ValueError When the array is an object-dtype ndarray or ExtensionArray and contains missing values. + + See Also + -------- + check_bool_array_indexer : Check that `key` + is a valid mask for an array, and convert to an ndarray. """ - na_msg = "cannot index with vector containing NA / NaN values" + na_msg = "cannot mask with array containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): @@ -215,16 +221,6 @@ def try_sort(iterable): return listed -def dict_keys_to_ordered_list(mapping): - # when pandas drops support for Python < 3.6, this function - # can be replaced by a simple list(mapping.keys()) - if PY36 or isinstance(mapping, OrderedDict): - keys = list(mapping.keys()) - else: - keys = try_sort(mapping) - return keys - - def asarray_tuplesafe(values, dtype=None): if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")): @@ -281,7 +277,7 @@ def maybe_make_list(obj): return obj -def maybe_iterable_to_list(obj: Union[Iterable, Any]) -> Union[list, Any]: +def maybe_iterable_to_list(obj: Union[Iterable[T], T]) -> Union[Collection[T], T]: """ If obj is Iterable but not list-like, consume into list. 
""" @@ -328,7 +324,7 @@ def get_callable_name(obj): return get_callable_name(obj.func) # fall back to class name if hasattr(obj, "__call__"): - return obj.__class__.__name__ + return type(obj).__name__ # everything failed (probably because the argument # wasn't actually callable); we return None # instead of the empty string in this case to allow @@ -399,7 +395,7 @@ def standardize_mapping(into): return partial(collections.defaultdict, into.default_factory) into = type(into) if not issubclass(into, abc.Mapping): - raise TypeError("unsupported type: {into}".format(into=into)) + raise TypeError(f"unsupported type: {into}") elif into == collections.defaultdict: raise TypeError("to_dict() only accepts initialized defaultdicts") return into @@ -462,7 +458,7 @@ def pipe(obj, func, *args, **kwargs): if isinstance(func, tuple): func, target = func if target in kwargs: - msg = "%s is both the pipe target and a keyword argument" % target + msg = f"{target} is both the pipe target and a keyword argument" raise ValueError(msg) kwargs[target] = obj return func(*args, **kwargs) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 3e1e5ed89d877..a1b1cffdd1d76 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -2,39 +2,49 @@ """ from functools import partial, wraps +from typing import Dict, Optional, Sequence, Tuple, Type, Union import warnings import numpy as np +from pandas._typing import FrameOrSeries from pandas.errors import PerformanceWarning -import pandas as pd +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.computation.common import _result_type_many +from pandas.core.computation.common import result_type_many + + +def _align_core_single_unary_op( + term, +) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]: + typ: Union[partial, Type[FrameOrSeries]] + axes: Optional[Dict[str, int]] = None -def _align_core_single_unary_op(term): if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) else: typ = type(term.value) - ret = (typ,) + if hasattr(term.value, "axes"): + axes = _zip_axes_from_type(typ, term.value.axes) - if not hasattr(term.value, "axes"): - ret += (None,) - else: - ret += (_zip_axes_from_type(typ, term.value.axes),) - return ret + return typ, axes -def _zip_axes_from_type(typ, new_axes): - axes = {ax_name: new_axes[ax_ind] for ax_ind, ax_name in typ._AXIS_NAMES.items()} +def _zip_axes_from_type( + typ: Type[FrameOrSeries], new_axes: Sequence[int] +) -> Dict[str, int]: + axes = {name: new_axes[i] for i, name in typ._AXIS_NAMES.items()} return axes -def _any_pandas_objects(terms): - """Check a sequence of terms for instances of PandasObject.""" +def _any_pandas_objects(terms) -> bool: + """ + Check a sequence of terms for instances of PandasObject. 
+ """ return any(isinstance(term.value, PandasObject) for term in terms) @@ -49,7 +59,7 @@ def wrapper(terms): # we don't have any pandas objects if not _any_pandas_objects(terms): - return _result_type_many(*term_values), None + return result_type_many(*term_values), None return f(terms) @@ -60,7 +70,10 @@ def wrapper(terms): def _align_core(terms): term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] term_dims = [terms[i].value.ndim for i in term_index] - ndims = pd.Series(dict(zip(term_index, term_dims))) + + from pandas import Series + + ndims = Series(dict(zip(term_index, term_dims))) # initial axes are the axes of the largest-axis'd term biggest = terms[ndims.idxmax()].value @@ -70,7 +83,7 @@ def _align_core(terms): gt_than_one_axis = naxes > 1 for value in (terms[i].value for i in term_index): - is_series = isinstance(value, pd.Series) + is_series = isinstance(value, ABCSeries) is_series_and_gt_one_axis = is_series and gt_than_one_axis for axis, items in enumerate(value.axes): @@ -87,7 +100,7 @@ def _align_core(terms): ti = terms[i].value if hasattr(ti, "reindex"): - transpose = isinstance(ti, pd.Series) and naxes > 1 + transpose = isinstance(ti, ABCSeries) and naxes > 1 reindexer = axes[naxes - 1] if transpose else items term_axis_size = len(ti.axes[axis]) @@ -96,10 +109,10 @@ def _align_core(terms): ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) if ordm >= 1 and reindexer_size >= 10000: w = ( - "Alignment difference on axis {axis} is larger " - "than an order of magnitude on term {term!r}, by " - "more than {ordm:.4g}; performance may suffer" - ).format(axis=axis, term=terms[i].name, ordm=ordm) + f"Alignment difference on axis {axis} is larger " + f"than an order of magnitude on term {repr(terms[i].name)}, " + f"by more than {ordm:.4g}; performance may suffer" + ) warnings.warn(w, category=PerformanceWarning, stacklevel=6) f = partial(ti.reindex, reindexer, axis=axis, copy=False) @@ -111,28 +124,30 @@ def _align_core(terms): return typ, _zip_axes_from_type(typ, axes) -def _align(terms): - """Align a set of terms""" +def align_terms(terms): + """ + Align a set of terms. + """ try: # flatten the parse tree (a nested list, really) terms = list(com.flatten(terms)) except TypeError: # can't iterate so it must just be a constant or single variable - if isinstance(terms.value, pd.core.generic.NDFrame): + if isinstance(terms.value, (ABCSeries, ABCDataFrame)): typ = type(terms.value) return typ, _zip_axes_from_type(typ, terms.value.axes) return np.result_type(terms.type), None # if all resolved variables are numeric scalars if all(term.is_scalar for term in terms): - return _result_type_many(*(term.value for term in terms)).type, None + return result_type_many(*(term.value for term in terms)).type, None # perform the main alignment typ, axes = _align_core(terms) return typ, axes -def _reconstruct_object(typ, obj, axes, dtype): +def reconstruct_object(typ, obj, axes, dtype): """ Reconstruct an object given its type, raw value, and possibly empty (None) axes. diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index bd32c8bee1cdf..19a8898a2987c 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -4,20 +4,21 @@ from pandas._config import get_option -# A token value Python's tokenizer probably will never use. -_BACKTICK_QUOTED_STRING = 100 - def _ensure_decoded(s): - """ if we have bytes, decode them to unicode """ + """ + If we have bytes, decode them to unicode. 
+ """ if isinstance(s, (np.bytes_, bytes)): s = s.decode(get_option("display.encoding")) return s -def _result_type_many(*arrays_and_dtypes): - """ wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) - argument limit """ +def result_type_many(*arrays_and_dtypes): + """ + Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) + argument limit. + """ try: return np.result_type(*arrays_and_dtypes) except ValueError: @@ -25,14 +26,5 @@ def _result_type_many(*arrays_and_dtypes): return reduce(np.result_type, arrays_and_dtypes) -def _remove_spaces_column_name(name): - """Check if name contains any spaces, if it contains any spaces - the spaces will be removed and an underscore suffix is added.""" - if not isinstance(name, str) or " " not in name: - return name - - return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING" - - class NameResolutionError(NameError): pass diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index dc6378e83d229..9c5388faae1bd 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -3,9 +3,10 @@ """ import abc +from typing import Dict, Type -from pandas.core.computation.align import _align, _reconstruct_object -from pandas.core.computation.ops import UndefinedVariableError, _mathops, _reductions +from pandas.core.computation.align import align_terms, reconstruct_object +from pandas.core.computation.ops import _mathops, _reductions import pandas.io.formats.printing as printing @@ -31,8 +32,7 @@ def _check_ne_builtin_clash(expr): if overlap: s = ", ".join(repr(x) for x in overlap) raise NumExprClobberingError( - 'Variables in expression "{expr}" ' - "overlap with builtins: ({s})".format(expr=expr, s=s) + f'Variables in expression "{expr}" overlap with builtins: ({s})' ) @@ -46,14 +46,15 @@ def __init__(self, expr): self.aligned_axes = None self.result_type = None - def convert(self): - """Convert an expression for evaluation. + def convert(self) -> str: + """ + Convert an expression for evaluation. Defaults to return the expression as a string. """ return printing.pprint_thing(self.expr) - def evaluate(self): + def evaluate(self) -> object: """ Run the engine on the expression. @@ -62,20 +63,20 @@ def evaluate(self): Returns ------- - obj : object + object The result of the passed expression. 
""" if not self._is_aligned: - self.result_type, self.aligned_axes = _align(self.expr.terms) + self.result_type, self.aligned_axes = align_terms(self.expr.terms) # make sure no names in resolvers and locals/globals clash res = self._evaluate() - return _reconstruct_object( + return reconstruct_object( self.result_type, res, self.aligned_axes, self.expr.terms.return_type ) @property - def _is_aligned(self): + def _is_aligned(self) -> bool: return self.aligned_axes is not None and self.result_type is not None @abc.abstractmethod @@ -101,31 +102,16 @@ class NumExprEngine(AbstractEngine): has_neg_frac = True - def __init__(self, expr): - super().__init__(expr) - - def convert(self): - return str(super().convert()) - def _evaluate(self): import numexpr as ne # convert the expression to a valid numexpr expression s = self.convert() - try: - env = self.expr.env - scope = env.full_scope - truediv = scope["truediv"] - _check_ne_builtin_clash(self.expr) - return ne.evaluate(s, local_dict=scope, truediv=truediv) - except KeyError as e: - # python 3 compat kludge - try: - msg = e.message - except AttributeError: - msg = str(e) - raise UndefinedVariableError(msg) + env = self.expr.env + scope = env.full_scope + _check_ne_builtin_clash(self.expr) + return ne.evaluate(s, local_dict=scope) class PythonEngine(AbstractEngine): @@ -137,14 +123,14 @@ class PythonEngine(AbstractEngine): has_neg_frac = False - def __init__(self, expr): - super().__init__(expr) - def evaluate(self): return self.expr() - def _evaluate(self): + def _evaluate(self) -> None: pass -_engines = {"numexpr": NumExprEngine, "python": PythonEngine} +_engines: Dict[str, Type[AbstractEngine]] = { + "numexpr": NumExprEngine, + "python": PythonEngine, +} diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 430daa4708001..51892b8c02d87 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -5,17 +5,21 @@ """ import tokenize +from typing import Optional import warnings +from pandas._libs.lib import no_default from pandas.util._validators import validate_bool_kwarg from pandas.core.computation.engines import _engines -from pandas.core.computation.scope import _ensure_scope +from pandas.core.computation.expr import Expr, _parsers +from pandas.core.computation.parsing import tokenize_string +from pandas.core.computation.scope import ensure_scope from pandas.io.formats.printing import pprint_thing -def _check_engine(engine): +def _check_engine(engine: Optional[str]) -> str: """ Make sure a valid engine is passed. @@ -45,8 +49,7 @@ def _check_engine(engine): if engine not in _engines: valid = list(_engines.keys()) raise KeyError( - "Invalid engine {engine!r} passed, valid engines are" - " {valid}".format(engine=engine, valid=valid) + f"Invalid engine {repr(engine)} passed, valid engines are {valid}" ) # TODO: validate this in a more general way (thinking of future engines @@ -64,7 +67,7 @@ def _check_engine(engine): return engine -def _check_parser(parser): +def _check_parser(parser: str): """ Make sure a valid parser is passed. 
@@ -77,12 +80,11 @@ def _check_parser(parser): KeyError * If an invalid parser is passed """ - from pandas.core.computation.expr import _parsers if parser not in _parsers: raise KeyError( - "Invalid parser {parser!r} passed, valid parsers are" - " {valid}".format(parser=parser, valid=_parsers.keys()) + f"Invalid parser {repr(parser)} passed, " + f"valid parsers are {_parsers.keys()}" ) @@ -92,8 +94,8 @@ def _check_resolvers(resolvers): if not hasattr(resolver, "__getitem__"): name = type(resolver).__name__ raise TypeError( - "Resolver of type {name!r} does not implement " - "the __getitem__ method".format(name=name) + f"Resolver of type {repr(name)} does not " + f"implement the __getitem__ method" ) @@ -115,7 +117,7 @@ def _check_expression(expr): raise ValueError("expr cannot be an empty string") -def _convert_expression(expr): +def _convert_expression(expr) -> str: """ Convert an object to an expression. @@ -131,7 +133,7 @@ def _convert_expression(expr): Returns ------- - s : unicode + str The string representation of an object. Raises @@ -144,8 +146,7 @@ def _convert_expression(expr): return s -def _check_for_locals(expr, stack_level, parser): - from pandas.core.computation.expr import tokenize_string +def _check_for_locals(expr: str, stack_level: int, parser: str): at_top_of_stack = stack_level == 0 not_pandas_parser = parser != "pandas" @@ -169,8 +170,8 @@ def _check_for_locals(expr, stack_level, parser): def eval( expr, parser="pandas", - engine=None, - truediv=True, + engine: Optional[str] = None, + truediv=no_default, local_dict=None, global_dict=None, resolvers=(), @@ -192,7 +193,7 @@ def eval( Parameters ---------- - expr : str or unicode + expr : str The expression to evaluate. This string cannot contain any Python `statements `__, @@ -219,7 +220,9 @@ def eval( More backends may be available in the future. truediv : bool, optional - Whether to use true division, like in Python >= 3 + Whether to use true division, like in Python >= 3. + deprecated:: 1.0.0 + local_dict : dict or None, optional A dictionary of local variables, taken from locals() by default. global_dict : dict or None, optional @@ -282,10 +285,17 @@ def eval( See the :ref:`enhancing performance ` documentation for more details. 
""" - from pandas.core.computation.expr import Expr inplace = validate_bool_kwarg(inplace, "inplace") + if truediv is not no_default: + warnings.warn( + "The `truediv` parameter in pd.eval is deprecated and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) + if isinstance(expr, str): _check_expression(expr) exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""] @@ -298,6 +308,9 @@ def eval( "multi-line expressions are only valid in the " "context of data, use DataFrame.eval" ) + engine = _check_engine(engine) + _check_parser(parser) + _check_resolvers(resolvers) ret = None first_expr = True @@ -305,13 +318,10 @@ def eval( for expr in exprs: expr = _convert_expression(expr) - engine = _check_engine(engine) - _check_parser(parser) - _check_resolvers(resolvers) _check_for_locals(expr, level, parser) # get our (possibly passed-in) scope - env = _ensure_scope( + env = ensure_scope( level + 1, global_dict=global_dict, local_dict=local_dict, @@ -319,7 +329,7 @@ def eval( target=target, ) - parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv) + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) # construct the engine and evaluate the parsed expression eng = _engines[engine] @@ -329,8 +339,8 @@ def eval( if parsed_expr.assigner is None: if multi_line: raise ValueError( - "Multi-line expressions are only valid" - " if all expressions contain an assignment" + "Multi-line expressions are only valid " + "if all expressions contain an assignment" ) elif inplace: raise ValueError("Cannot operate inplace if there is no assignment") diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 45319a4d63d94..1350587b5ca90 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -3,20 +3,13 @@ import ast from functools import partial, reduce -from io import StringIO -import itertools as it -import operator +from keyword import iskeyword import tokenize -from typing import Type +from typing import Optional, Type import numpy as np -import pandas as pd -from pandas.core import common as com -from pandas.core.computation.common import ( - _BACKTICK_QUOTED_STRING, - _remove_spaces_column_name, -) +import pandas.core.common as com from pandas.core.computation.ops import ( _LOCAL_TAG, BinOp, @@ -35,38 +28,12 @@ _unary_ops_syms, is_term, ) +from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string from pandas.core.computation.scope import Scope import pandas.io.formats.printing as printing -def tokenize_string(source): - """ - Tokenize a Python source code string. - - Parameters - ---------- - source : str - A Python source code string - """ - line_reader = StringIO(source).readline - token_generator = tokenize.generate_tokens(line_reader) - - # Loop over all tokens till a backtick (`) is found. - # Then, take all tokens till the next backtick to form a backtick quoted - # string. - for toknum, tokval, _, _, _ in token_generator: - if tokval == "`": - tokval = " ".join( - it.takewhile( - lambda tokval: tokval != "`", - map(operator.itemgetter(1), token_generator), - ) - ) - toknum = _BACKTICK_QUOTED_STRING - yield toknum, tokval - - def _rewrite_assign(tok): """Rewrite the assignment operator for PyTables expressions that use ``=`` as a substitute for ``==``. @@ -134,31 +101,6 @@ def _replace_locals(tok): return toknum, tokval -def _clean_spaces_backtick_quoted_names(tok): - """Clean up a column name if surrounded by backticks. 
- - Backtick quoted string are indicated by a certain tokval value. If a string - is a backtick quoted token it will processed by - :func:`_remove_spaces_column_name` so that the parser can find this - string when the query is executed. - See also :meth:`NDFrame._get_space_character_free_column_resolver`. - - Parameters - ---------- - tok : tuple of int, str - ints correspond to the all caps constants in the tokenize module - - Returns - ------- - t : tuple of int, str - Either the input or token or the replacement values - """ - toknum, tokval = tok - if toknum == _BACKTICK_QUOTED_STRING: - return tokenize.NAME, _remove_spaces_column_name(tokval) - return toknum, tokval - - def _compose2(f, g): """Compose 2 callables""" return lambda *args, **kwargs: f(g(*args, **kwargs)) @@ -171,12 +113,9 @@ def _compose(*funcs): def _preparse( - source, + source: str, f=_compose( - _replace_locals, - _replace_booleans, - _rewrite_assign, - _clean_spaces_backtick_quoted_names, + _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks ), ): """Compose a collection of tokenization functions @@ -283,10 +222,9 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): # and we don't want `stmt` and friends in their so get only the class whose # names are capitalized _base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes -_msg = "cannot both support and not support {intersection}".format( - intersection=_unsupported_nodes & _base_supported_nodes -) -assert not _unsupported_nodes & _base_supported_nodes, _msg +intersection = _unsupported_nodes & _base_supported_nodes +_msg = f"cannot both support and not support {intersection}" +assert not intersection, _msg def _node_not_implemented(node_name, cls): @@ -295,9 +233,7 @@ def _node_not_implemented(node_name, cls): """ def f(self, *args, **kwargs): - raise NotImplementedError( - "{name!r} nodes are not implemented".format(name=node_name) - ) + raise NotImplementedError(f"{repr(node_name)} nodes are not implemented") return f @@ -315,7 +251,7 @@ def disallowed(cls): cls.unsupported_nodes = () for node in nodes: new_method = _node_not_implemented(node, cls) - name = "visit_{node}".format(node=node) + name = f"visit_{node}" cls.unsupported_nodes += (name,) setattr(cls, name, new_method) return cls @@ -352,13 +288,13 @@ def add_ops(op_classes): def f(cls): for op_attr_name, op_class in op_classes.items(): - ops = getattr(cls, "{name}_ops".format(name=op_attr_name)) - ops_map = getattr(cls, "{name}_op_nodes_map".format(name=op_attr_name)) + ops = getattr(cls, f"{op_attr_name}_ops") + ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map") for op in ops: op_node = ops_map[op] if op_node is not None: made_op = _op_maker(op_class, op) - setattr(cls, "visit_{node}".format(node=op_node), made_op) + setattr(cls, f"visit_{op_node}", made_op) return cls return f @@ -379,7 +315,7 @@ class BaseExprVisitor(ast.NodeVisitor): preparser : callable """ - const_type = Constant # type: Type[Term] + const_type: Type[Term] = Constant term_type = Term binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms @@ -430,13 +366,11 @@ def visit(self, node, **kwargs): try: node = ast.fix_missing_locations(ast.parse(clean)) except SyntaxError as e: - from keyword import iskeyword - if any(iskeyword(x) for x in clean.split()): e.msg = "Python keyword not valid identifier in numexpr query" raise e - method = "visit_" + node.__class__.__name__ + method = "visit_" + type(node).__name__ visitor = getattr(self, method) return visitor(node, **kwargs) @@ 
-532,8 +466,8 @@ def _maybe_evaluate_binop( if res.has_invalid_return_type: raise TypeError( - "unsupported operand type(s) for {op}:" - " '{lhs}' and '{rhs}'".format(op=res.op, lhs=lhs.type, rhs=rhs.type) + f"unsupported operand type(s) for {res.op}:" + f" '{lhs.type}' and '{rhs.type}'" ) if self.engine != "pytables": @@ -565,8 +499,7 @@ def visit_BinOp(self, node, **kwargs): return self._maybe_evaluate_binop(op, op_class, left, right) def visit_Div(self, node, **kwargs): - truediv = self.env.scope["truediv"] - return lambda lhs, rhs: Div(lhs, rhs, truediv) + return lambda lhs, rhs: Div(lhs, rhs) def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) @@ -600,6 +533,8 @@ def visit_Index(self, node, **kwargs): return self.visit(node.value) def visit_Subscript(self, node, **kwargs): + import pandas as pd + value = self.visit(node.value) slobj = self.visit(node.slice) result = pd.eval( @@ -679,7 +614,7 @@ def visit_Attribute(self, node, **kwargs): if isinstance(value, ast.Name) and value.id == attr: return resolved - raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__)) + raise ValueError(f"Invalid Attribute context {ctx.__name__}") def visit_Call(self, node, side=None, **kwargs): @@ -699,7 +634,7 @@ def visit_Call(self, node, side=None, **kwargs): raise if res is None: - raise ValueError("Invalid function call {func}".format(func=node.func.id)) + raise ValueError(f"Invalid function call {node.func.id}") if hasattr(res, "value"): res = res.value @@ -709,8 +644,7 @@ def visit_Call(self, node, side=None, **kwargs): if node.keywords: raise TypeError( - 'Function "{name}" does not support keyword ' - "arguments".format(name=res.name) + f'Function "{res.name}" does not support keyword arguments' ) return res(*new_args, **kwargs) @@ -721,10 +655,7 @@ def visit_Call(self, node, side=None, **kwargs): for key in node.keywords: if not isinstance(key, ast.keyword): - raise ValueError( - "keyword error in function call " - "'{func}'".format(func=node.func.id) - ) + raise ValueError(f"keyword error in function call '{node.func.id}'") if key.arg: kwargs[key.arg] = self.visit(key.value).value @@ -788,9 +719,7 @@ def __init__( parser, preparser=partial( _preparse, - f=_compose( - _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names - ), + f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks), ), ): super().__init__(env, engine, parser, preparser) @@ -812,18 +741,25 @@ class Expr: engine : str, optional, default 'numexpr' parser : str, optional, default 'pandas' env : Scope, optional, default None - truediv : bool, optional, default True level : int, optional, default 2 """ + env: Scope + engine: str + parser: str + def __init__( - self, expr, engine="numexpr", parser="pandas", env=None, truediv=True, level=0 + self, + expr, + engine: str = "numexpr", + parser: str = "pandas", + env: Optional[Scope] = None, + level: int = 0, ): self.expr = expr self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser - self.env.scope["truediv"] = truediv self._visitor = _parsers[parser](self.env, self.engine, self.parser) self.terms = self.parse() @@ -834,10 +770,10 @@ def assigner(self): def __call__(self): return self.terms(self.env) - def __repr__(self): + def __repr__(self) -> str: return printing.pprint_thing(self.terms) - def __len__(self): + def __len__(self) -> int: return len(self.expr) def parse(self): diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 
46bc762e1a0b3..7e959889ee997 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -62,9 +62,8 @@ def set_numexpr_threads(n=None): ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b, reversed=False): +def _evaluate_standard(op, op_str, a, b): """ standard evaluation """ - # `reversed` kwarg is included for compatibility with _evaluate_numexpr if _TEST_MODE: _store_test_result(False) with np.errstate(all="ignore"): @@ -97,11 +96,12 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, reversed=False): +def _evaluate_numexpr(op, op_str, a, b): result = None if _can_use_numexpr(op, op_str, a, b, "evaluate"): - if reversed: + is_reversed = op.__name__.strip("_").startswith("r") + if is_reversed: # we were originally called by a reversed op method a, b = b, a @@ -109,7 +109,7 @@ def _evaluate_numexpr(op, op_str, a, b, reversed=False): b_value = getattr(b, "values", b) result = ne.evaluate( - "a_value {op} b_value".format(op=op_str), + f"a_value {op_str} b_value", local_dict={"a_value": a_value, "b_value": b_value}, casting="safe", ) @@ -175,22 +175,20 @@ def _bool_arith_check( if _has_bool_dtype(a) and _has_bool_dtype(b): if op_str in unsupported: warnings.warn( - "evaluating in Python space because the {op!r} " - "operator is not supported by numexpr for " - "the bool dtype, use {alt_op!r} instead".format( - op=op_str, alt_op=unsupported[op_str] - ) + f"evaluating in Python space because the {repr(op_str)} " + f"operator is not supported by numexpr for " + f"the bool dtype, use {repr(unsupported[op_str])} instead" ) return False if op_str in not_allowed: raise NotImplementedError( - "operator {op!r} not implemented for bool dtypes".format(op=op_str) + f"operator {repr(op_str)} not implemented for bool dtypes" ) return True -def evaluate(op, op_str, a, b, use_numexpr=True, reversed=False): +def evaluate(op, op_str, a, b, use_numexpr=True): """ Evaluate and return the expression of the op on a and b. @@ -203,12 +201,11 @@ def evaluate(op, op_str, a, b, use_numexpr=True, reversed=False): b : right operand use_numexpr : bool, default True Whether to try to use numexpr. - reversed : bool, default False """ use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: - return _evaluate(op, op_str, a, b, reversed=reversed) + return _evaluate(op, op_str, a, b) return _evaluate_standard(op, op_str, a, b) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index dc0f381414970..cb166ba65152b 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -13,7 +13,7 @@ from pandas.core.dtypes.common import is_list_like, is_scalar import pandas.core.common as com -from pandas.core.computation.common import _ensure_decoded, _result_type_many +from pandas.core.computation.common import _ensure_decoded, result_type_many from pandas.core.computation.scope import _DEFAULT_GLOBALS from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded @@ -55,12 +55,13 @@ class UndefinedVariableError(NameError): NameError subclass for local variables. 
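The _evaluate_numexpr change above drops the reversed keyword and infers it from the op function's name instead; a small sketch of that check (the radd helper below is only a stand-in for the reflected-op functions pandas actually passes in):

import operator


def is_reversed_op(op) -> bool:
    # mirrors the check in _evaluate_numexpr: reflected ops are named r<op>
    return op.__name__.strip("_").startswith("r")


def radd(left, right):  # hypothetical stand-in for a reflected __radd__ method
    return right + left


assert is_reversed_op(radd)
assert not is_reversed_op(operator.add)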
""" - def __init__(self, name, is_local): + def __init__(self, name, is_local: bool): + base_msg = f"{repr(name)} is not defined" if is_local: - msg = "local variable {0!r} is not defined" + msg = f"local variable {base_msg}" else: - msg = "name {0!r} is not defined" - super().__init__(msg.format(name)) + msg = f"name {base_msg}" + super().__init__(msg) class Term: @@ -69,7 +70,10 @@ def __new__(cls, name, env, side=None, encoding=None): supr_new = super(Term, klass).__new__ return supr_new(klass) + is_local: bool + def __init__(self, name, env, side=None, encoding=None): + # name is a str for Term, but may be something else for subclasses self._name = name self.env = env self.side = side @@ -79,10 +83,10 @@ def __init__(self, name, env, side=None, encoding=None): self.encoding = encoding @property - def local_name(self): + def local_name(self) -> str: return self.name.replace(_LOCAL_TAG, "") - def __repr__(self): + def __repr__(self) -> str: return pprint_thing(self.name) def __call__(self, *args, **kwargs): @@ -120,7 +124,7 @@ def update(self, value): self.value = value @property - def is_scalar(self): + def is_scalar(self) -> bool: return is_scalar(self._value) @property @@ -139,14 +143,11 @@ def type(self): return_type = type @property - def raw(self): - return pprint_thing( - "{0}(name={1!r}, type={2})" - "".format(self.__class__.__name__, self.name, self.type) - ) + def raw(self) -> str: + return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})" @property - def is_datetime(self): + def is_datetime(self) -> bool: try: t = self.type.type except AttributeError: @@ -167,7 +168,7 @@ def name(self): return self._name @property - def ndim(self): + def ndim(self) -> int: return self._value.ndim @@ -182,7 +183,7 @@ def _resolve_name(self): def name(self): return self.value - def __repr__(self): + def __repr__(self) -> str: # in python 2 str() of float # can truncate shorter than repr() return repr(self.name) @@ -196,7 +197,9 @@ class Op: Hold an operator of arbitrary arity. """ - def __init__(self, op, operands, *args, **kwargs): + op: str + + def __init__(self, op: str, operands, *args, **kwargs): self.op = _bool_op_map.get(op, op) self.operands = operands self.encoding = kwargs.get("encoding", None) @@ -204,23 +207,23 @@ def __init__(self, op, operands, *args, **kwargs): def __iter__(self): return iter(self.operands) - def __repr__(self): + def __repr__(self) -> str: """ Print a generic n-ary operator and its operands using infix notation. 
""" # recurse over the operands - parened = ("({0})".format(pprint_thing(opr)) for opr in self.operands) - return pprint_thing(" {0} ".format(self.op).join(parened)) + parened = (f"({pprint_thing(opr)})" for opr in self.operands) + return pprint_thing(f" {self.op} ".join(parened)) @property def return_type(self): # clobber types to bool if the op is a boolean operator if self.op in (_cmp_ops_syms + _bool_ops_syms): return np.bool_ - return _result_type_many(*(term.type for term in com.flatten(self))) + return result_type_many(*(term.type for term in com.flatten(self))) @property - def has_invalid_return_type(self): + def has_invalid_return_type(self) -> bool: types = self.operand_types obj_dtype_set = frozenset([np.dtype("object")]) return self.return_type == object and types - obj_dtype_set @@ -230,11 +233,11 @@ def operand_types(self): return frozenset(term.type for term in com.flatten(self)) @property - def is_scalar(self): + def is_scalar(self) -> bool: return all(operand.is_scalar for operand in self.operands) @property - def is_datetime(self): + def is_datetime(self) -> bool: try: t = self.return_type.type except AttributeError: @@ -339,7 +342,7 @@ def _cast_inplace(terms, acceptable_dtypes, dtype): term.update(new_value) -def is_term(obj): +def is_term(obj) -> bool: return isinstance(obj, Term) @@ -354,7 +357,7 @@ class BinOp(Op): right : Term or Op """ - def __init__(self, op, lhs, rhs, **kwargs): + def __init__(self, op: str, lhs, rhs, **kwargs): super().__init__(op, (lhs, rhs)) self.lhs = lhs self.rhs = rhs @@ -369,8 +372,7 @@ def __init__(self, op, lhs, rhs, **kwargs): # has to be made a list for python3 keys = list(_binary_ops_dict.keys()) raise ValueError( - "Invalid binary operator {0!r}, valid" - " operators are {1}".format(op, keys) + f"Invalid binary operator {repr(op)}, valid operators are {keys}" ) def __call__(self, env): @@ -386,9 +388,6 @@ def __call__(self, env): object The result of an evaluated expression. """ - # handle truediv - if self.op == "/" and env.scope["truediv"]: - self.func = operator.truediv # recurse over the left/right nodes left = self.lhs(env) @@ -396,7 +395,7 @@ def __call__(self, env): return self.func(left, right) - def evaluate(self, env, engine, parser, term_type, eval_in_python): + def evaluate(self, env, engine: str, parser, term_type, eval_in_python): """ Evaluate a binary operation *before* being passed to the engine. @@ -488,7 +487,7 @@ def _disallow_scalar_only_bool_ops(self): raise NotImplementedError("cannot evaluate scalar only bool ops") -def isnumeric(dtype): +def isnumeric(dtype) -> bool: return issubclass(np.dtype(dtype).type, np.number) @@ -500,18 +499,15 @@ class Div(BinOp): ---------- lhs, rhs : Term or Op The Terms or Ops in the ``/`` expression. - truediv : bool - Whether or not to use true division. With Python 3 this happens - regardless of the value of ``truediv``. """ - def __init__(self, lhs, rhs, truediv, *args, **kwargs): - super().__init__("/", lhs, rhs, *args, **kwargs) + def __init__(self, lhs, rhs, **kwargs): + super().__init__("/", lhs, rhs, **kwargs) if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): raise TypeError( - "unsupported operand type(s) for {0}:" - " '{1}' and '{2}'".format(self.op, lhs.return_type, rhs.return_type) + f"unsupported operand type(s) for {self.op}: " + f"'{lhs.return_type}' and '{rhs.return_type}'" ) # do not upcast float32s to float64 un-necessarily @@ -541,7 +537,7 @@ class UnaryOp(Op): * If no function associated with the passed operator token is found. 
""" - def __init__(self, op, operand): + def __init__(self, op: str, operand): super().__init__(op, (operand,)) self.operand = operand @@ -549,19 +545,19 @@ def __init__(self, op, operand): self.func = _unary_ops_dict[op] except KeyError: raise ValueError( - "Invalid unary operator {0!r}, valid operators " - "are {1}".format(op, _unary_ops_syms) + f"Invalid unary operator {repr(op)}, " + f"valid operators are {_unary_ops_syms}" ) def __call__(self, env): operand = self.operand(env) return self.func(operand) - def __repr__(self): - return pprint_thing("{0}({1})".format(self.op, self.operand)) + def __repr__(self) -> str: + return pprint_thing(f"{self.op}({self.operand})") @property - def return_type(self): + def return_type(self) -> np.dtype: operand = self.operand if operand.return_type == np.dtype("bool"): return np.dtype("bool") @@ -582,13 +578,13 @@ def __call__(self, env): with np.errstate(all="ignore"): return self.func.func(*operands) - def __repr__(self): + def __repr__(self) -> str: operands = map(str, self.operands) - return pprint_thing("{0}({1})".format(self.op, ",".join(operands))) + return pprint_thing(f"{self.op}({','.join(operands)})") class FuncNode: - def __init__(self, name): + def __init__(self, name: str): from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION if name not in _mathops or ( @@ -596,7 +592,7 @@ def __init__(self, name): and _NUMEXPR_VERSION < LooseVersion("2.6.9") and name in ("floor", "ceil") ): - raise ValueError('"{0}" is not a supported function'.format(name)) + raise ValueError(f'"{name}" is not a supported function') self.name = name self.func = getattr(np, name) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py new file mode 100644 index 0000000000000..ce213c8532834 --- /dev/null +++ b/pandas/core/computation/parsing.py @@ -0,0 +1,190 @@ +""":func:`~pandas.eval` source string parsing functions +""" + +from io import StringIO +from keyword import iskeyword +import token +import tokenize +from typing import Iterator, Tuple + +# A token value Python's tokenizer probably will never use. +BACKTICK_QUOTED_STRING = 100 + + +def create_valid_python_identifier(name: str) -> str: + """ + Create valid Python identifiers from any string. + + Check if name contains any special characters. If it contains any + special characters, the special characters will be replaced by + a special string and a prefix is added. + + Raises + ------ + SyntaxError + If the returned name is not a Python valid identifier, raise an exception. + This can happen if there is a hashtag in the name, as the tokenizer will + than terminate and not find the backtick. + But also for characters that fall out of the range of (U+0001..U+007F). + """ + if name.isidentifier() and not iskeyword(name): + return name + + # Create a dict with the special characters and their replacement string. + # EXACT_TOKEN_TYPES contains these special characters + # toke.tok_name contains a readable description of the replacement string. + special_characters_replacements = { + char: f"_{token.tok_name[tokval]}_" + # The ignore here is because of a bug in mypy that is resolved in 0.740 + for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore + } + special_characters_replacements.update( + { + " ": "_", + "?": "_QUESTIONMARK_", + "!": "_EXCLAMATIONMARK_", + "$": "_DOLLARSIGN_", + "€": "_EUROSIGN_", + # Including quotes works, but there are exceptions. + "'": "_SINGLEQUOTE_", + '"': "_DOUBLEQUOTE_", + # Currently not possible. 
Terminates parser and won't find backtick. + # "#": "_HASH_", + } + ) + + name = "".join(special_characters_replacements.get(char, char) for char in name) + name = "BACKTICK_QUOTED_STRING_" + name + + if not name.isidentifier(): + raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.") + + return name + + +def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]: + """ + Clean up a column name if surrounded by backticks. + + Backtick quoted strings are indicated by a certain tokval value. If a string + is a backtick quoted token it will be processed by + :func:`create_valid_python_identifier` so that the parser can find this + string when the query is executed. + In this case the tok will get the NAME tokval. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tok : Tuple[int, str] + Either the input token or the replacement values + """ + toknum, tokval = tok + if toknum == BACKTICK_QUOTED_STRING: + return tokenize.NAME, create_valid_python_identifier(tokval) + return toknum, tokval + + +def clean_column_name(name: str) -> str: + """ + Function to emulate the cleaning of a backtick quoted name. + + The purpose of this function is to see what happens to the name of an + identifier if it goes through the process of being parsed as Python code + inside a backtick quoted string and then being cleaned + (stripped of any special characters). + + Parameters + ---------- + name : str + Name to be cleaned. + + Returns + ------- + name : str + Returns the name after tokenizing and cleaning. + + Notes + ----- + For some cases, a name cannot be converted to a valid Python identifier. + In that case :func:`tokenize_string` raises a SyntaxError. + In that case, we just return the name unmodified. + + If this name was used in the query string (this makes the query call impossible) + an error will be raised by :func:`tokenize_backtick_quoted_string` instead, + which is not caught and propagates to the user level. + """ + try: + tokenized = tokenize_string(f"`{name}`") + tokval = next(tokenized)[1] + return create_valid_python_identifier(tokval) + except SyntaxError: + return name + + +def tokenize_backtick_quoted_string( + token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int +) -> Tuple[int, str]: + """ + Creates a token from a backtick quoted string. + + Moves the token_generator forward until right after the next backtick. + + Parameters + ---------- + token_generator : Iterator[tokenize.TokenInfo] + The generator that yields the tokens of the source string (Tuple[int, str]). + The generator is at the first token after the backtick (`). + + source : str + The Python source code string. + + string_start : int + This is the start of the backtick quoted string inside the source string. + + Returns + ------- + tok: Tuple[int, str] + The token that represents the backtick quoted string. + The integer is equal to BACKTICK_QUOTED_STRING (100). + """ + for _, tokval, start, _, _ in token_generator: + if tokval == "`": + string_end = start[1] + break + + return BACKTICK_QUOTED_STRING, source[string_start:string_end] + + +def tokenize_string(source: str) -> Iterator[Tuple[int, str]]: + """ + Tokenize a Python source code string. + + Parameters + ---------- + source : str + The Python source code string. + + Returns + ------- + tok_generator : Iterator[Tuple[int, str]] + An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]). 
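At the user level, these helpers are what let DataFrame.query and DataFrame.eval accept backtick-quoted column names containing spaces or other special characters; a hedged sketch of the effect:

import pandas as pd

df = pd.DataFrame({"total sales": [1, 5, 10], "region!": ["a", "b", "a"]})

df.query("`total sales` > 2")   # columns with spaces work via backticks
df.query("`region!` == 'a'")    # special characters are cleaned to identifiers internally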
+ """ + line_reader = StringIO(source).readline + token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted string + for toknum, tokval, start, _, _ in token_generator: + if tokval == "`": + try: + yield tokenize_backtick_quoted_string( + token_generator, source, string_start=start[1] + 1 + ) + except Exception: + raise SyntaxError(f"Failed to parse backticks in '{source}'.") + else: + yield toknum, tokval diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 81658ab23ba46..be652ca0e6a36 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -2,6 +2,7 @@ import ast from functools import partial +from typing import Any, Dict, Optional, Tuple import numpy as np @@ -12,7 +13,7 @@ import pandas as pd import pandas.core.common as com -from pandas.core.computation import expr, ops +from pandas.core.computation import expr, ops, scope as _scope from pandas.core.computation.common import _ensure_decoded from pandas.core.computation.expr import BaseExprVisitor from pandas.core.computation.ops import UndefinedVariableError, is_term @@ -20,27 +21,38 @@ from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded -class Scope(expr.Scope): +class PyTablesScope(_scope.Scope): __slots__ = ("queryables",) - def __init__(self, level, global_dict=None, local_dict=None, queryables=None): + queryables: Dict[str, Any] + + def __init__( + self, + level: int, + global_dict=None, + local_dict=None, + queryables: Optional[Dict[str, Any]] = None, + ): super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) self.queryables = queryables or dict() class Term(ops.Term): + env: PyTablesScope + def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls return object.__new__(klass) - def __init__(self, name, env, side=None, encoding=None): + def __init__(self, name, env: PyTablesScope, side=None, encoding=None): super().__init__(name, env, side=side, encoding=encoding) def _resolve_name(self): # must be a queryables if self.side == "left": + # Note: The behavior of __new__ ensures that self.name is a str here if self.name not in self.env.queryables: - raise NameError("name {name!r} is not defined".format(name=self.name)) + raise NameError(f"name {repr(self.name)} is not defined") return self.name # resolve the rhs (and allow it to be None) @@ -56,7 +68,8 @@ def value(self): class Constant(Term): - def __init__(self, value, env, side=None, encoding=None): + def __init__(self, value, env: PyTablesScope, side=None, encoding=None): + assert isinstance(env, PyTablesScope), type(env) super().__init__(value, env, side=side, encoding=encoding) def _resolve_name(self): @@ -67,11 +80,13 @@ class BinOp(ops.BinOp): _max_selectors = 31 - def __init__(self, op, lhs, rhs, queryables, encoding): + op: str + queryables: Dict[str, Any] + + def __init__(self, op: str, lhs, rhs, queryables: Dict[str, Any], encoding): super().__init__(op, lhs, rhs) self.queryables = queryables self.encoding = encoding - self.filter = None self.condition = None def _disallow_scalar_only_bool_ops(self): @@ -129,12 +144,12 @@ def conform(self, rhs): return rhs @property - def is_valid(self): + def is_valid(self) -> bool: """ return True if this is a valid field """ return self.lhs in self.queryables @property - def is_in_table(self): + def is_in_table(self) -> bool: """ 
return True if this is a valid column name for generation (e.g. an actual column in the table) """ return self.queryables.get(self.lhs) is not None @@ -154,12 +169,12 @@ def metadata(self): """ the metadata of my field """ return getattr(self.queryables.get(self.lhs), "metadata", None) - def generate(self, v): + def generate(self, v) -> str: """ create and return the op string for this TermValue """ val = v.tostring(self.encoding) - return "({lhs} {op} {val})".format(lhs=self.lhs, op=self.op, val=val) + return f"({self.lhs} {self.op} {val})" - def convert_value(self, v): + def convert_value(self, v) -> "TermValue": """ convert the expression that is in the term to something that is accepted by pytables """ @@ -218,21 +233,19 @@ def stringify(value): # string quoting return TermValue(v, stringify(v), "string") else: - raise TypeError( - "Cannot compare {v} of type {typ} to {kind} column".format( - v=v, typ=type(v), kind=kind - ) - ) + raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column") def convert_values(self): pass class FilterBinOp(BinOp): - def __repr__(self): - return pprint_thing( - "[Filter : [{lhs}] -> [{op}]".format(lhs=self.filter[0], op=self.filter[1]) - ) + filter: Optional[Tuple[Any, Any, pd.Index]] = None + + def __repr__(self) -> str: + if self.filter is None: + return "Filter: Not Initialized" + return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]") def invert(self): """ invert the filter """ @@ -249,10 +262,10 @@ def format(self): def evaluate(self): if not self.is_valid: - raise ValueError("query term is not valid [{slf}]".format(slf=self)) + raise ValueError(f"query term is not valid [{self}]") rhs = self.conform(self.rhs) - values = [TermValue(v, v, self.kind).value for v in rhs] + values = list(rhs) if self.is_in_table: @@ -273,13 +286,12 @@ def evaluate(self): else: raise TypeError( - "passing a filterable condition to a non-table " - "indexer [{slf}]".format(slf=self) + f"passing a filterable condition to a non-table indexer [{self}]" ) return self - def generate_filter_op(self, invert=False): + def generate_filter_op(self, invert: bool = False): if (self.op == "!=" and not invert) or (self.op == "==" and invert): return lambda axis, vals: ~axis.isin(vals) else: @@ -295,8 +307,8 @@ def evaluate(self): class ConditionBinOp(BinOp): - def __repr__(self): - return pprint_thing("[Condition : [{cond}]]".format(cond=self.condition)) + def __repr__(self) -> str: + return pprint_thing(f"[Condition : [{self.condition}]]") def invert(self): """ invert the condition """ @@ -314,7 +326,7 @@ def format(self): def evaluate(self): if not self.is_valid: - raise ValueError("query term is not valid [{slf}]".format(slf=self)) + raise ValueError(f"query term is not valid [{self}]") # convert values if we are in the table if not self.is_in_table: @@ -329,7 +341,7 @@ def evaluate(self): # too many values to create the expression? 
if len(values) <= self._max_selectors: vs = [self.generate(v) for v in values] - self.condition = "({cond})".format(cond=" | ".join(vs)) + self.condition = f"({' | '.join(vs)})" # use a filter after reading else: @@ -342,9 +354,7 @@ def evaluate(self): class JointConditionBinOp(ConditionBinOp): def evaluate(self): - self.condition = "({lhs} {op} {rhs})".format( - lhs=self.lhs.condition, op=self.op, rhs=self.rhs.condition - ) + self.condition = f"({self.lhs.condition} {self.op} {self.rhs.condition})" return self @@ -368,10 +378,7 @@ def prune(self, klass): return None -_op_classes = {"unary": UnaryOp} - - -class ExprVisitor(BaseExprVisitor): +class PyTablesExprVisitor(BaseExprVisitor): const_type = Constant term_type = Term @@ -381,7 +388,7 @@ def __init__(self, env, engine, parser, **kwargs): bin_node = self.binary_op_nodes_map[bin_op] setattr( self, - "visit_{node}".format(node=bin_node), + f"visit_{bin_node}", lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs), ) @@ -415,16 +422,13 @@ def visit_Subscript(self, node, **kwargs): try: return self.const_type(value[slobj], self.env) except TypeError: - raise ValueError( - "cannot subscript {value!r} with " - "{slobj!r}".format(value=value, slobj=slobj) - ) + raise ValueError(f"cannot subscript {repr(value)} with {repr(slobj)}") def visit_Attribute(self, node, **kwargs): attr = node.attr value = node.value - ctx = node.ctx.__class__ + ctx = type(node.ctx) if ctx == ast.Load: # resolve the value resolved = self.visit(value) @@ -443,7 +447,7 @@ def visit_Attribute(self, node, **kwargs): if isinstance(value, ast.Name) and value.id == attr: return resolved - raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__)) + raise ValueError(f"Invalid Attribute context {ctx.__name__}") def translate_In(self, op): return ast.Eq() if isinstance(op, ast.In) else op @@ -471,25 +475,29 @@ def _validate_where(w): TypeError : An invalid data type was passed in for w (e.g. dict). """ - if not (isinstance(w, (Expr, str)) or is_list_like(w)): - raise TypeError("where must be passed as a string, Expr, or list-like of Exprs") + if not (isinstance(w, (PyTablesExpr, str)) or is_list_like(w)): + raise TypeError( + "where must be passed as a string, PyTablesExpr, " + "or list-like of PyTablesExpr" + ) return w -class Expr(expr.Expr): - """ hold a pytables like expression, comprised of possibly multiple 'terms' +class PyTablesExpr(expr.Expr): + """ + Hold a pytables-like expression, comprised of possibly multiple 'terms'. 
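For users, the renamed PyTablesExpr is what parses the `where` strings passed to HDFStore; a small sketch, assuming the optional PyTables ("tables") dependency is installed and using a purely illustrative file name:

import pandas as pd

df = pd.DataFrame({"A": range(5)}, index=pd.date_range("2013-01-01", periods=5))
cutoff = pd.Timestamp("2013-01-03")

with pd.HDFStore("example_store.h5") as store:
    store.append("df", df, data_columns=["A"])
    subset = store.select("df", where="index >= cutoff & A > 2")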
Parameters ---------- - where : string term expression, Expr, or list-like of Exprs + where : string term expression, PyTablesExpr, or list-like of PyTablesExprs queryables : a "kinds" map (dict of column name -> kind), or None if column is non-indexable encoding : an encoding that will encode the query terms Returns ------- - an Expr object + a PyTablesExpr object Examples -------- @@ -505,7 +513,16 @@ class Expr(expr.Expr): "major_axis>=20130101" """ - def __init__(self, where, queryables=None, encoding=None, scope_level=0): + _visitor: Optional[PyTablesExprVisitor] + env: PyTablesScope + + def __init__( + self, + where, + queryables: Optional[Dict[str, Any]] = None, + encoding=None, + scope_level: int = 0, + ): where = _validate_where(where) @@ -516,27 +533,30 @@ def __init__(self, where, queryables=None, encoding=None, scope_level=0): self._visitor = None # capture the environment if needed - local_dict = DeepChainMap() + local_dict: DeepChainMap[Any, Any] = DeepChainMap() - if isinstance(where, Expr): + if isinstance(where, PyTablesExpr): local_dict = where.env.scope - where = where.expr + _where = where.expr elif isinstance(where, (list, tuple)): + where = list(where) for idx, w in enumerate(where): - if isinstance(w, Expr): + if isinstance(w, PyTablesExpr): local_dict = w.env.scope else: w = _validate_where(w) where[idx] = w - where = " & ".join(map("({})".format, com.flatten(where))) # noqa + _where = " & ".join((f"({w})" for w in com.flatten(where))) + else: + _where = where - self.expr = where - self.env = Scope(scope_level + 1, local_dict=local_dict) + self.expr = _where + self.env = PyTablesScope(scope_level + 1, local_dict=local_dict) if queryables is not None and isinstance(self.expr, str): self.env.queryables.update(queryables) - self._visitor = ExprVisitor( + self._visitor = PyTablesExprVisitor( self.env, queryables=queryables, parser="pytables", @@ -545,7 +565,7 @@ def __init__(self, where, queryables=None, encoding=None, scope_level=0): ) self.terms = self.parse() - def __repr__(self): + def __repr__(self) -> str: if self.terms is not None: return pprint_thing(self.terms) return pprint_thing(self.expr) @@ -557,15 +577,15 @@ def evaluate(self): self.condition = self.terms.prune(ConditionBinOp) except AttributeError: raise ValueError( - "cannot process expression [{expr}], [{slf}] " - "is not a valid condition".format(expr=self.expr, slf=self) + f"cannot process expression [{self.expr}], [{self}] " + "is not a valid condition" ) try: self.filter = self.terms.prune(FilterBinOp) except AttributeError: raise ValueError( - "cannot process expression [{expr}], [{slf}] " - "is not a valid filter".format(expr=self.expr, slf=self) + f"cannot process expression [{self.expr}], [{self}] " + "is not a valid filter" ) return self.condition, self.filter @@ -574,30 +594,31 @@ def evaluate(self): class TermValue: """ hold a term value the we use to construct a condition/filter """ - def __init__(self, value, converted, kind): + def __init__(self, value, converted, kind: str): + assert isinstance(kind, str), kind self.value = value self.converted = converted self.kind = kind - def tostring(self, encoding): + def tostring(self, encoding) -> str: """ quote the string if not encoded else encode and return """ if self.kind == "string": if encoding is not None: - return self.converted - return '"{converted}"'.format(converted=self.converted) + return str(self.converted) + return f'"{self.converted}"' elif self.kind == "float": # python 2 str(float) is not always # round-trippable so use repr() 
return repr(self.converted) - return self.converted + return str(self.converted) -def maybe_expression(s): +def maybe_expression(s) -> bool: """ loose checking if s is a pytables-acceptable expression """ if not isinstance(s, str): return False - ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ("=",) + ops = PyTablesExprVisitor.binary_ops + PyTablesExprVisitor.unary_ops + ("=",) # make sure we have an op at least return any(op in s for op in ops) diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index b11411eb2dc66..70dcf4defdb52 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -9,6 +9,7 @@ import pprint import struct import sys +from typing import List import numpy as np @@ -16,9 +17,9 @@ from pandas.compat.chainmap import DeepChainMap -def _ensure_scope( - level, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs -): +def ensure_scope( + level: int, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs +) -> "Scope": """Ensure that we are grabbing the correct scope.""" return Scope( level + 1, @@ -29,7 +30,7 @@ def _ensure_scope( ) -def _replacer(x): +def _replacer(x) -> str: """Replace a number with its hexadecimal representation. Used to tag temporary variables with their calling scope's id. """ @@ -44,11 +45,11 @@ def _replacer(x): return hex(hexin) -def _raw_hex_id(obj): +def _raw_hex_id(obj) -> str: """Return the padded hexadecimal id of ``obj``.""" # interpret as a pointer since that's what really what id returns packed = struct.pack("@P", id(obj)) - return "".join(map(_replacer, packed)) + return "".join(_replacer(x) for x in packed) _DEFAULT_GLOBALS = { @@ -63,7 +64,7 @@ def _raw_hex_id(obj): } -def _get_pretty_string(obj): +def _get_pretty_string(obj) -> str: """ Return a prettier version of obj. @@ -74,7 +75,7 @@ def _get_pretty_string(obj): Returns ------- - s : str + str Pretty print object repr """ sio = StringIO() @@ -119,7 +120,7 @@ def __init__( self.scope.update(local_dict.scope) if local_dict.target is not None: self.target = local_dict.target - self.update(local_dict.level) + self._update(local_dict.level) frame = sys._getframe(self.level) @@ -139,17 +140,16 @@ def __init__( self.resolvers = DeepChainMap(*resolvers) self.temps = {} - def __repr__(self): + def __repr__(self) -> str: scope_keys = _get_pretty_string(list(self.scope.keys())) res_keys = _get_pretty_string(list(self.resolvers.keys())) - unicode_str = "{name}(scope={scope_keys}, resolvers={res_keys})" - return unicode_str.format( - name=type(self).__name__, scope_keys=scope_keys, res_keys=res_keys - ) + unicode_str = f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})" + return unicode_str @property - def has_resolvers(self): - """Return whether we have any extra scope. + def has_resolvers(self) -> bool: + """ + Return whether we have any extra scope. For example, DataFrames pass Their columns as resolvers during calls to ``DataFrame.eval()`` and ``DataFrame.query()``. @@ -160,7 +160,7 @@ def has_resolvers(self): """ return bool(len(self.resolvers)) - def resolve(self, key, is_local): + def resolve(self, key: str, is_local: bool): """ Resolve a variable name in a possibly local context. @@ -202,7 +202,7 @@ def resolve(self, key, is_local): raise UndefinedVariableError(key, is_local) - def swapkey(self, old_key, new_key, new_value=None): + def swapkey(self, old_key: str, new_key: str, new_value=None): """ Replace a variable name, with a potentially new value. 
@@ -227,7 +227,7 @@ def swapkey(self, old_key, new_key, new_value=None): mapping[new_key] = new_value return - def _get_vars(self, stack, scopes): + def _get_vars(self, stack, scopes: List[str]): """ Get specifically scoped variables from a list of stack frames. @@ -250,13 +250,13 @@ def _get_vars(self, stack, scopes): # scope after the loop del frame - def update(self, level): + def _update(self, level: int): """ Update the current scope by going back `level` levels. Parameters ---------- - level : int or None, optional, default None + level : int """ sl = level + 1 @@ -270,7 +270,7 @@ def update(self, level): finally: del stack[:], stack - def add_tmp(self, value): + def add_tmp(self, value) -> str: """ Add a temporary variable to the scope. @@ -281,12 +281,10 @@ def add_tmp(self, value): Returns ------- - name : basestring + str The name of the temporary variable created. """ - name = "{name}_{num}_{hex_id}".format( - name=type(value).__name__, num=self.ntemps, hex_id=_raw_hex_id(self) - ) + name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}" # add to inner most scope assert name not in self.temps @@ -297,7 +295,7 @@ def add_tmp(self, value): return name @property - def ntemps(self): + def ntemps(self) -> int: """The number of temporary variables in this scope""" return len(self.temps) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index ba0a4d81a88d3..afdd8a01ee003 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -300,14 +300,15 @@ def table_schema_cb(key): _enable_data_resource_formatter(cf.get_option(key)) -def is_terminal(): +def is_terminal() -> bool: """ Detect if Python is running in a terminal. Returns True if Python is running in a terminal or False if not. """ try: - ip = get_ipython() + # error: Name 'get_ipython' is not defined + ip = get_ipython() # type: ignore except NameError: # assume standard Python interpreter in a terminal return True else: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5e8b28267f24f..203ef3ec75c8f 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -4,13 +4,14 @@ These should not depend on core.internals. """ -from typing import Optional, Sequence, Union, cast +from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast import numpy as np import numpy.ma as ma from pandas._libs import lib from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime +from pandas._typing import ArrayLike, Dtype from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -27,7 +28,6 @@ is_categorical_dtype, is_datetime64_ns_dtype, is_extension_array_dtype, - is_extension_type, is_float_dtype, is_integer_dtype, is_iterator, @@ -47,6 +47,10 @@ import pandas.core.common as com +if TYPE_CHECKING: + from pandas.core.series import Series # noqa: F401 + from pandas.core.indexes.api import Index # noqa: F401 + def array( data: Sequence[object], @@ -95,10 +99,19 @@ def array( :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` + :class:`int` :class:`pandas.arrays.IntegerArray` + :class:`str` :class:`pandas.arrays.StringArray` + :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== For all other cases, NumPy's usual inference rules will be used. + .. 
versionchanged:: 1.0.0 + + Pandas infers nullable-integer dtype for integer data, + string dtype for string data, and nullable-boolean dtype + for boolean data. + copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require @@ -155,14 +168,6 @@ def array( ['a', 'b'] Length: 2, dtype: str32 - Or use the dedicated constructor for the array you're expecting, and - wrap that in a PandasArray - - >>> pd.array(np.array(['a', 'b'], dtype=' - ['a', 'b'] - Length: 2, dtype: str32 - Finally, Pandas has arrays that mostly overlap with NumPy * :class:`arrays.DatetimeArray` @@ -185,20 +190,28 @@ def array( Examples -------- - If a dtype is not specified, `data` is passed through to - :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned. + If a dtype is not specified, pandas will infer the best dtype from the values. + See the description of `dtype` for the types pandas infers for. >>> pd.array([1, 2]) - + [1, 2] - Length: 2, dtype: int64 + Length: 2, dtype: Int64 - Or the NumPy dtype can be specified + >>> pd.array([1, 2, np.nan]) + + [1, 2, NaN] + Length: 3, dtype: Int64 - >>> pd.array([1, 2], dtype=np.dtype("int32")) - - [1, 2] - Length: 2, dtype: int32 + >>> pd.array(["a", None, "c"]) + + ['a', nan, 'c'] + Length: 3, dtype: string + + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) + + ['2000-01-01', '2000-01-01'] + Length: 2, dtype: period[D] You can use the string alias for `dtype` @@ -213,29 +226,24 @@ def array( [a, b, a] Categories (3, object): [a < b < c] - Because omitting the `dtype` passes the data through to NumPy, - a mixture of valid integers and NA will return a floating-point - NumPy array. + If pandas does not infer a dedicated extension type a + :class:`arrays.PandasArray` is returned. - >>> pd.array([1, 2, np.nan]) + >>> pd.array([1.1, 2.2]) - [1.0, 2.0, nan] - Length: 3, dtype: float64 - - To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify - the dtype: + [1.1, 2.2] + Length: 2, dtype: float64 - >>> pd.array([1, 2, np.nan], dtype='Int64') - - [1, 2, NaN] - Length: 3, dtype: Int64 - - Pandas will infer an ExtensionArray for some types of data: + As mentioned in the "Notes" section, new extension types may be added + in the future (by pandas or 3rd party libraries), causing the return + value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype` + as a NumPy dtype if you need to ensure there's no future change in + behavior. - >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) - - ['2000-01-01', '2000-01-01'] - Length: 2, dtype: period[D] + >>> pd.array([1, 2], dtype=np.dtype("int32")) + + [1, 2] + Length: 2, dtype: int32 `data` must be 1-dimensional. A ValueError is raised when the input has the wrong dimensionality. @@ -247,21 +255,26 @@ def array( """ from pandas.core.arrays import ( period_array, + BooleanArray, + IntegerArray, IntervalArray, PandasArray, DatetimeArray, TimedeltaArray, + StringArray, ) if lib.is_scalar(data): - msg = "Cannot pass scalar '{}' to 'pandas.array'." - raise ValueError(msg.format(data)) + msg = f"Cannot pass scalar '{data}' to 'pandas.array'." + raise ValueError(msg) - data = extract_array(data, extract_numpy=True) - - if dtype is None and isinstance(data, ABCExtensionArray): + if dtype is None and isinstance( + data, (ABCSeries, ABCIndexClass, ABCExtensionArray) + ): dtype = data.dtype + data = extract_array(data, extract_numpy=True) + # this returns None for not-found dtypes. 
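A compact check of the inference rules described above (assuming pandas >= 1.0):

import numpy as np
import pandas as pd

pd.array([1, 2, np.nan]).dtype                    # Int64 (nullable integer)
pd.array(["a", None, "c"]).dtype                  # string
pd.array([True, None]).dtype                      # boolean
pd.array([1.1, 2.2]).dtype                        # float64 (PandasArray fallback)
pd.array([1, 2], dtype=np.dtype("int32")).dtype   # int32, opting out of inference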
if isinstance(dtype, str): dtype = registry.find(dtype) or dtype @@ -271,7 +284,7 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=False) + inferred_dtype = lib.infer_dtype(data, skipna=True) if inferred_dtype == "period": try: return period_array(data, copy=copy) @@ -299,7 +312,14 @@ def array( # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) - # TODO(BooleanArray): handle this type + elif inferred_dtype == "string": + return StringArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "integer": + return IntegerArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "boolean": + return BooleanArray._from_sequence(data, copy=copy) # Pandas overrides NumPy for # 1. datetime64[ns] @@ -477,13 +497,8 @@ def sanitize_array( if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == "period": - from pandas.core.arrays import period_array - - try: - subarr = period_array(subarr) - except IncompatibleFrequency: - pass + if inferred in {"interval", "period"}: + subarr = array(subarr) return subarr @@ -527,7 +542,7 @@ def _try_cast( and not (is_iterator(subarr) or isinstance(subarr, np.ndarray)) ): subarr = construct_1d_object_array_from_listlike(subarr) - elif not is_extension_type(subarr): + elif not is_extension_array_dtype(subarr): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) except OutOfBoundsDatetime: # in case of out of bound datetime64 -> always raise @@ -538,7 +553,7 @@ def _try_cast( # that Categorical is the only array type for 'category'. dtype = cast(CategoricalDtype, dtype) subarr = dtype.construct_array_type()( - arr, dtype.categories, ordered=dtype._ordered + arr, dtype.categories, ordered=dtype.ordered ) elif is_extension_array_dtype(dtype): # create an extension array from its dtype @@ -550,3 +565,62 @@ def _try_cast( else: subarr = np.array(arr, dtype=object, copy=copy) return subarr + + +def is_empty_data(data: Any) -> bool: + """ + Utility to check if a Series is instantiated with empty data, + which does not contain dtype information. + + Parameters + ---------- + data : array-like, Iterable, dict, or scalar value + Contains data stored in Series. + + Returns + ------- + bool + """ + is_none = data is None + is_list_like_without_dtype = is_list_like(data) and not hasattr(data, "dtype") + is_simple_empty = is_list_like_without_dtype and not data + return is_none or is_simple_empty + + +def create_series_with_explicit_dtype( + data: Any = None, + index: Optional[Union[ArrayLike, "Index"]] = None, + dtype: Optional[Dtype] = None, + name: Optional[str] = None, + copy: bool = False, + fastpath: bool = False, + dtype_if_empty: Dtype = object, +) -> "Series": + """ + Helper to pass an explicit dtype when instantiating an empty Series. + + This silences a DeprecationWarning described in GitHub-17261. + + Parameters + ---------- + data : Mirrored from Series.__init__ + index : Mirrored from Series.__init__ + dtype : Mirrored from Series.__init__ + name : Mirrored from Series.__init__ + copy : Mirrored from Series.__init__ + fastpath : Mirrored from Series.__init__ + dtype_if_empty : str, numpy.dtype, or ExtensionDtype + This dtype will be passed explicitly if an empty Series will + be instantiated. 
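A minimal sketch of how the helper above behaves; it is an internal function, so the import path may change between versions:

from pandas.core.construction import create_series_with_explicit_dtype

s = create_series_with_explicit_dtype(data=None, dtype_if_empty=object)
s.dtype   # object -- empty data receives the explicit fallback dtype

t = create_series_with_explicit_dtype(data=[1, 2, 3], dtype_if_empty=object)
t.dtype   # int64 -- non-empty data keeps normal inference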
+ + Returns + ------- + Series + """ + from pandas.core.series import Series + + if is_empty_data(data) and dtype is None: + dtype = dtype_if_empty + return Series( + data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath + ) diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index 2b527e1fb5890..051affd0af1f9 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -1,6 +1,6 @@ # flake8: noqa -from .common import ( +from pandas.core.dtypes.common import ( is_array_like, is_bool, is_bool_dtype, @@ -12,7 +12,6 @@ is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetimetz, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -32,7 +31,6 @@ is_number, is_numeric_dtype, is_object_dtype, - is_period, is_period_dtype, is_re, is_re_compilable, diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 59ef17e3d121f..1b4e7062b38e5 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -1,5 +1,5 @@ """Extend pandas with custom array types""" -from typing import List, Optional, Tuple, Type +from typing import Any, List, Optional, Tuple, Type import numpy as np @@ -63,19 +63,32 @@ class property**. Added ``_metadata``, ``__hash__``, and changed the default definition of ``__eq__``. + For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method + can be implemented: this method receives a pyarrow Array or ChunkedArray + as only argument and is expected to return the appropriate pandas + ExtensionArray for this dtype and the passed values:: + + class ExtensionDtype: + + def __from_arrow__( + self, array: pyarrow.Array/ChunkedArray + ) -> ExtensionArray: + ... + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is provided for registering virtual subclasses. """ - _metadata = () # type: Tuple[str, ...] + _metadata: Tuple[str, ...] = () - def __str__(self): + def __str__(self) -> str: return self.name - def __eq__(self, other): - """Check whether 'other' is equal to self. + def __eq__(self, other: Any) -> bool: + """ + Check whether 'other' is equal to self. By default, 'other' is considered equal if either @@ -103,10 +116,10 @@ def __eq__(self, other): ) return False - def __hash__(self): + def __hash__(self) -> int: return hash(tuple(getattr(self, attr) for attr in self._metadata)) - def __ne__(self, other): + def __ne__(self, other) -> bool: return not self.__eq__(other) @property @@ -159,7 +172,8 @@ def name(self) -> str: @property def names(self) -> Optional[List[str]]: - """Ordered list of field names, or None if there are no fields. + """ + Ordered list of field names, or None if there are no fields. This is for compatibility with NumPy arrays, and may be removed in the future. @@ -169,7 +183,7 @@ def names(self) -> Optional[List[str]]: @classmethod def construct_array_type(cls): """ - Return the array type associated with this dtype + Return the array type associated with this dtype. Returns ------- @@ -217,20 +231,23 @@ def construct_from_string(cls, string: str): ... if match: ... return cls(**match.groupdict()) ... else: - ... raise TypeError("Cannot construct a '{}' from " - ... "'{}'".format(cls.__name__, string)) + ... raise TypeError(f"Cannot construct a '{cls.__name__}' from + ... 
" "'{string}'") """ if not isinstance(string, str): - raise TypeError("Expects a string, got {}".format(type(string))) + raise TypeError(f"Expects a string, got {type(string).__name__}") + + # error: Non-overlapping equality check (left operand type: "str", right + # operand type: "Callable[[ExtensionDtype], str]") [comparison-overlap] + assert isinstance(cls.name, str), (cls, type(cls.name)) if string != cls.name: - raise TypeError( - "Cannot construct a '{}' from '{}'".format(cls.__name__, string) - ) + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") return cls() @classmethod def is_dtype(cls, dtype) -> bool: - """Check if we match 'dtype'. + """ + Check if we match 'dtype'. Parameters ---------- @@ -263,10 +280,12 @@ def is_dtype(cls, dtype) -> bool: return False elif isinstance(dtype, cls): return True - try: - return cls.construct_from_string(dtype) is not None - except TypeError: - return False + if isinstance(dtype, str): + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False + return False @property def _is_numeric(self) -> bool: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 3e92906be706c..1dbdb8dbba48b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,9 +6,10 @@ from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT +from pandas._libs.tslibs.timezones import tz_compare from pandas.util._validators import validate_bool_kwarg -from .common import ( +from pandas.core.dtypes.common import ( _INT64_DTYPE, _NS_DTYPE, _POSSIBLY_CAST_DTYPES, @@ -27,10 +28,8 @@ is_datetime64_ns_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, - is_datetimelike, is_dtype_equal, is_extension_array_dtype, - is_extension_type, is_float, is_float_dtype, is_integer, @@ -43,8 +42,13 @@ is_unsigned_integer_dtype, pandas_dtype, ) -from .dtypes import DatetimeTZDtype, ExtensionDtype, PeriodDtype -from .generic import ( +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, @@ -52,8 +56,8 @@ ABCPeriodIndex, ABCSeries, ) -from .inference import is_list_like -from .missing import isna, notna +from pandas.core.dtypes.inference import is_list_like +from pandas.core.dtypes.missing import isna, notna _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -74,7 +78,7 @@ def maybe_convert_platform(values): return values -def is_nested_object(obj): +def is_nested_object(obj) -> bool: """ return a boolean if we have a nested object, e.g. a Series with 1 or more Series elements @@ -232,7 +236,7 @@ def trans(x): return result -def maybe_upcast_putmask(result, mask, other): +def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): """ A safe version of putmask that potentially upcasts the result. The result is replaced with the first N elements of other, @@ -245,14 +249,14 @@ def maybe_upcast_putmask(result, mask, other): The destination array. This will be mutated in-place if no upcasting is necessary. mask : boolean ndarray - other : ndarray or scalar - The source array or value + other : scalar + The source value. Returns ------- result : ndarray - changed : boolean - Set to true if the result array was upcasted + changed : bool + Set to true if the result array was upcasted. 
Examples -------- @@ -264,13 +268,17 @@ def maybe_upcast_putmask(result, mask, other): if not isinstance(result, np.ndarray): raise ValueError("The result input must be a ndarray.") + if not is_scalar(other): + # We _could_ support non-scalar other, but until we have a compelling + # use case, we assume away the possibility. + raise ValueError("other must be a scalar") if mask.any(): # Two conversions for date-like dtypes that can't be done automatically # in np.place: # NaN -> NaT # integer or integer array -> date-like array - if is_datetimelike(result.dtype): + if result.dtype.kind in ["m", "M"]: if is_scalar(other): if isna(other): other = result.dtype.type("nat") @@ -335,6 +343,26 @@ def changeit(): def maybe_promote(dtype, fill_value=np.nan): + """ + Find the minimal dtype that can hold both the given dtype and fill_value. + + Parameters + ---------- + dtype : np.dtype or ExtensionDtype + fill_value : scalar, default np.nan + + Returns + ------- + dtype + Upcasted from dtype argument if necessary. + fill_value + Upcasted from fill_value argument if necessary. + """ + if not is_scalar(fill_value) and not is_object_dtype(dtype): + # with object dtype there is nothing to promote, and the user can + # pass pretty much any weird fill_value they like + raise ValueError("fill_value must be a scalar") + # if we passed an array here, determine the fill value by dtype if isinstance(fill_value, np.ndarray): if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): @@ -387,6 +415,14 @@ def maybe_promote(dtype, fill_value=np.nan): elif is_datetime64tz_dtype(dtype): if isna(fill_value): fill_value = NaT + elif not isinstance(fill_value, datetime): + dtype = np.dtype(np.object_) + elif fill_value.tzinfo is None: + dtype = np.dtype(np.object_) + elif not tz_compare(fill_value.tzinfo, dtype.tz): + # TODO: sure we want to cast here? + dtype = np.dtype(np.object_) + elif is_extension_array_dtype(dtype) and isna(fill_value): fill_value = dtype.na_value @@ -491,13 +527,13 @@ def _ensure_dtype_type(value, dtype): return dtype.type(value) -def infer_dtype_from(val, pandas_dtype=False): +def infer_dtype_from(val, pandas_dtype: bool = False): """ - interpret the dtype from a scalar or array. This is a convenience - routines to infer dtype from a scalar or an array + Interpret the dtype from a scalar or array. Parameters ---------- + val : object pandas_dtype : bool, default False whether to infer dtype including pandas extension types. If False, scalar/array belongs to pandas extension types is inferred as @@ -508,9 +544,9 @@ def infer_dtype_from(val, pandas_dtype=False): return infer_dtype_from_array(val, pandas_dtype=pandas_dtype) -def infer_dtype_from_scalar(val, pandas_dtype=False): +def infer_dtype_from_scalar(val, pandas_dtype: bool = False): """ - interpret the dtype from a scalar + Interpret the dtype from a scalar. Parameters ---------- @@ -579,17 +615,20 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): if lib.is_period(val): dtype = PeriodDtype(freq=val.freq) val = val.ordinal + elif lib.is_interval(val): + subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0] + dtype = IntervalDtype(subtype=subtype) return dtype, val -def infer_dtype_from_array(arr, pandas_dtype=False): +def infer_dtype_from_array(arr, pandas_dtype: bool = False): """ - infer the dtype from a scalar or array + Infer the dtype from an array. Parameters ---------- - arr : scalar or array + arr : array pandas_dtype : bool, default False whether to infer dtype including pandas extension types. 
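The newly documented maybe_promote contract, as a small sketch (internal API, so the exact signature may shift between versions):

import numpy as np

from pandas.core.dtypes.cast import maybe_promote

maybe_promote(np.dtype("int64"), fill_value=np.nan)   # (dtype('float64'), nan)

# Non-scalar fill values are now rejected up front for non-object dtypes:
# maybe_promote(np.dtype("int64"), fill_value=np.array([1]))  # raises ValueError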
If False, array belongs to pandas extension types @@ -615,7 +654,6 @@ def infer_dtype_from_array(arr, pandas_dtype=False): >>> infer_dtype_from_array([1, '1']) (numpy.object_, [1, '1']) - """ if isinstance(arr, np.ndarray): @@ -624,7 +662,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False): if not is_list_like(arr): arr = [arr] - if pandas_dtype and is_extension_type(arr): + if pandas_dtype and is_extension_array_dtype(arr): return arr.dtype, arr elif isinstance(arr, ABCSeries): @@ -640,7 +678,8 @@ def infer_dtype_from_array(arr, pandas_dtype=False): def maybe_infer_dtype_type(element): - """Try to infer an object's dtype, for use in arithmetic ops + """ + Try to infer an object's dtype, for use in arithmetic ops. Uses `element.dtype` if that's available. Objects implementing the iterator protocol are cast to a NumPy array, @@ -672,18 +711,24 @@ def maybe_infer_dtype_type(element): return tipo -def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): - """ provide explicit type promotion and coercion +def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): + """ + Provide explicit type promotion and coercion. Parameters ---------- - values : the ndarray that we want to maybe upcast + values : ndarray or ExtensionArray + The array that we want to maybe upcast. fill_value : what we want to fill with dtype : if None, then use the dtype of the values, else coerce to this type - copy : if True always make a copy even if no upcast is required + copy : bool, default True + If True always make a copy even if no upcast is required. """ + if not is_scalar(fill_value) and not is_object_dtype(values.dtype): + # We allow arbitrary fill values for object dtype + raise ValueError("fill_value must be a scalar") - if is_extension_type(values): + if is_extension_array_dtype(values): if copy: values = values.copy() else: @@ -749,7 +794,7 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def astype_nansafe(arr, dtype, copy=True, skipna=False): +def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): """ Cast the elements of an array to a given dtype a nan-safe manner. 
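For a quick sense of the check that now gates the extension-dtype branch above, here is a small sketch using the public re-export in pandas.api.types; the example values are illustrative and not taken from the patch:

    import pandas as pd
    from pandas.api.types import is_extension_array_dtype

    cat = pd.Categorical(["a", "b", "a"])
    print(is_extension_array_dtype(cat))       # True -> the array's own dtype is kept
    print(is_extension_array_dtype([1, "1"]))  # False -> falls back to ndarray-style inference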
@@ -783,21 +828,22 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): if is_object_dtype(dtype): return tslib.ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) # allow frequency conversions if dtype.kind == "M": return arr.astype(dtype) - raise TypeError( - "cannot astype a datetimelike from [{from_dtype}] " - "to [{to_dtype}]".format(from_dtype=arr.dtype, to_dtype=dtype) - ) + raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") elif is_timedelta64_dtype(arr): if is_object_dtype(dtype): return tslibs.ints_to_pytimedelta(arr.view(np.int64)) elif dtype == np.int64: + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) if dtype not in [_INT64_DTYPE, _TD_DTYPE]: @@ -812,10 +858,7 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): elif dtype == _TD_DTYPE: return arr.astype(_TD_DTYPE, copy=copy) - raise TypeError( - "cannot astype a timedelta from [{from_dtype}] " - "to [{to_dtype}]".format(from_dtype=arr.dtype, to_dtype=dtype) - ) + raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer): @@ -841,8 +884,11 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy) if dtype.name in ("datetime64", "timedelta64"): - msg = "The '{dtype}' dtype has no unit. Please pass in '{dtype}[ns]' instead." - raise ValueError(msg.format(dtype=dtype.name)) + msg = ( + f"The '{dtype.name}' dtype has no unit. Please pass in " + f"'{dtype.name}[ns]' instead." + ) + raise ValueError(msg) if copy or is_object_dtype(arr) or is_object_dtype(dtype): # Explicit copy, or required since NumPy can't view from / to object. @@ -972,7 +1018,7 @@ def soft_convert_objects( return values -def maybe_castable(arr): +def maybe_castable(arr) -> bool: # return False to force a non-fastpath # check datetime64[ns]/timedelta64[ns] are valid @@ -986,7 +1032,7 @@ def maybe_castable(arr): return arr.dtype.name not in _POSSIBLY_CAST_DTYPES -def maybe_infer_to_datetimelike(value, convert_dates=False): +def maybe_infer_to_datetimelike(value, convert_dates: bool = False): """ we might have a array (or single object) that is datetime like, and no dtype is passed don't change the value unless we find a @@ -1093,7 +1139,7 @@ def try_timedelta(v): return value -def maybe_cast_to_datetime(value, dtype, errors="raise"): +def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ @@ -1112,8 +1158,8 @@ def maybe_cast_to_datetime(value, dtype, errors="raise"): # Force the dtype if needed. msg = ( - "The '{dtype}' dtype has no unit. " - "Please pass in '{dtype}[ns]' instead." + f"The '{dtype.name}' dtype has no unit. " + f"Please pass in '{dtype.name}[ns]' instead." 
) if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): @@ -1122,13 +1168,10 @@ def maybe_cast_to_datetime(value, dtype, errors="raise"): # e.g., [ps], [fs], [as] if dtype <= np.dtype("M8[ns]"): if dtype.name == "datetime64": - raise ValueError(msg.format(dtype=dtype.name)) + raise ValueError(msg) dtype = _NS_DTYPE else: - raise TypeError( - "cannot convert datetimelike to " - "dtype [{dtype}]".format(dtype=dtype) - ) + raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]") elif is_datetime64tz: # our NaT doesn't support tz's @@ -1143,13 +1186,10 @@ def maybe_cast_to_datetime(value, dtype, errors="raise"): # e.g., [ps], [fs], [as] if dtype <= np.dtype("m8[ns]"): if dtype.name == "timedelta64": - raise ValueError(msg.format(dtype=dtype.name)) + raise ValueError(msg) dtype = _TD_DTYPE else: - raise TypeError( - "cannot convert timedeltalike to " - "dtype [{dtype}]".format(dtype=dtype) - ) + raise TypeError(f"cannot convert timedeltalike to dtype [{dtype}]") if is_scalar(value): if value == iNaT or isna(value): @@ -1201,7 +1241,7 @@ def maybe_cast_to_datetime(value, dtype, errors="raise"): return tslib.ints_to_pydatetime(ints) # we have a non-castable dtype that was passed - raise TypeError("Cannot cast datetime64 to {dtype}".format(dtype=dtype)) + raise TypeError(f"Cannot cast datetime64 to {dtype}") else: @@ -1282,7 +1322,7 @@ def find_common_type(types): def cast_scalar_to_array(shape, value, dtype=None): """ - create np.ndarray of specified shape and dtype, filled with values + Create np.ndarray of specified shape and dtype, filled with values. Parameters ---------- @@ -1308,7 +1348,7 @@ def cast_scalar_to_array(shape, value, dtype=None): return values -def construct_1d_arraylike_from_scalar(value, length, dtype): +def construct_1d_arraylike_from_scalar(value, length: int, dtype): """ create a np.ndarray / pandas type of specified shape and dtype filled with values @@ -1373,7 +1413,7 @@ def construct_1d_object_array_from_listlike(values): return result -def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): +def construct_1d_ndarray_preserving_na(values, dtype=None, copy: bool = False): """ Construct a new ndarray, coercing `values` to `dtype`, preserving NA. @@ -1414,7 +1454,7 @@ def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): return subarr -def maybe_cast_to_integer_array(arr, dtype, copy=False): +def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. 
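The integer-casting guard whose docstring begins above rejects lossy conversions by round-tripping the values; a minimal sketch of that idea in plain NumPy (an analogy, not the pandas helper itself):

    import numpy as np

    values = np.array([1.0, 2.5])
    casted = values.astype("int64")
    # If the round trip changes any value, the cast would silently lose
    # information, which is the situation the helper raises for.
    print(np.array_equal(values, casted))                          # False -> lossy, reject
    print(np.array_equal(np.array([1.0, 2.0]), np.array([1, 2])))  # True -> safe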
@@ -1465,7 +1505,7 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): except OverflowError: raise OverflowError( "The elements provided in the data cannot all be " - "casted to the dtype {dtype}".format(dtype=dtype) + f"casted to the dtype {dtype}" ) if np.array_equal(arr, casted): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3f4ebc88c1c8a..5a007f28d63cb 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -6,7 +6,7 @@ from pandas._libs import algos, lib from pandas._libs.tslibs import conversion -from pandas.compat import PY36 +from pandas._typing import ArrayLike from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -18,7 +18,6 @@ ) from pandas.core.dtypes.generic import ( ABCCategorical, - ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, ABCPeriodArray, @@ -45,11 +44,8 @@ is_re_compilable, is_scalar, is_sequence, - is_string_like, ) -from pandas._typing import ArrayLike - _POSSIBLY_CAST_DTYPES = { np.dtype(t).name for t in [ @@ -175,6 +171,8 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: try: return arr.astype("uint64", copy=copy, casting="safe") # type: ignore except TypeError: + if is_extension_array_dtype(arr.dtype): + return arr.to_numpy(dtype="float64", na_value=np.nan) return arr.astype("float64", copy=copy) @@ -195,9 +193,7 @@ def ensure_python_int(value: Union[int, np.integer]) -> int: TypeError: if the value isn't an int or can't be converted to one. """ if not is_scalar(value): - raise TypeError( - "Value needs to be a scalar value, was type {}".format(type(value)) - ) + raise TypeError(f"Value needs to be a scalar value, was type {type(value)}") msg = "Wrong type {} for value {}" try: new_value = int(value) @@ -223,7 +219,7 @@ def classes_and_not_datetimelike(*klasses) -> Callable: ) -def is_object_dtype(arr_or_dtype): +def is_object_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the object dtype. @@ -253,7 +249,7 @@ def is_object_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes(np.object_)) -def is_sparse(arr): +def is_sparse(arr) -> bool: """ Check whether an array-like is a 1-D pandas sparse array. @@ -271,17 +267,13 @@ def is_sparse(arr): bool Whether or not the array-like is a pandas sparse array. - See Also - -------- - Series.to_dense : Return dense representation of a Series. - Examples -------- Returns `True` if the parameter is a 1-D pandas sparse array. - >>> is_sparse(pd.SparseArray([0, 0, 1, 0])) + >>> is_sparse(pd.arrays.SparseArray([0, 0, 1, 0])) True - >>> is_sparse(pd.Series(pd.SparseArray([0, 0, 1, 0]))) + >>> is_sparse(pd.Series(pd.arrays.SparseArray([0, 0, 1, 0]))) True Returns `False` if the parameter is not sparse. @@ -305,7 +297,7 @@ def is_sparse(arr): return isinstance(dtype, SparseDtype) -def is_scipy_sparse(arr): +def is_scipy_sparse(arr) -> bool: """ Check whether an array-like is a scipy.sparse.spmatrix instance. 
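The doctests above now spell the sparse array as pd.arrays.SparseArray; for reference, a short usage sketch via the public pandas.api.types check:

    import numpy as np
    import pandas as pd
    from pandas.api.types import is_sparse

    sparse = pd.arrays.SparseArray([0, 0, 1, 0])
    print(is_sparse(sparse))              # True
    print(is_sparse(pd.Series(sparse)))   # True
    print(is_sparse(np.array([0, 1])))    # False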
@@ -328,7 +320,7 @@ def is_scipy_sparse(arr): >>> from scipy.sparse import bsr_matrix >>> is_scipy_sparse(bsr_matrix([1, 2, 3])) True - >>> is_scipy_sparse(pd.SparseArray([1, 2, 3])) + >>> is_scipy_sparse(pd.arrays.SparseArray([1, 2, 3])) False """ @@ -340,6 +332,7 @@ def is_scipy_sparse(arr): except ImportError: _is_scipy_sparse = lambda _: False + assert _is_scipy_sparse is not None return _is_scipy_sparse(arr) @@ -376,125 +369,7 @@ def is_categorical(arr) -> bool: return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) -def is_datetimetz(arr): - """ - Check whether an array-like is a datetime array-like with a timezone - component in its dtype. - - .. deprecated:: 0.24.0 - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a datetime array-like with a - timezone component in its dtype. - - Examples - -------- - >>> is_datetimetz([1, 2, 3]) - False - - Although the following examples are both DatetimeIndex objects, - the first one returns False because it has no timezone component - unlike the second one, which returns True. - - >>> is_datetimetz(pd.DatetimeIndex([1, 2, 3])) - False - >>> is_datetimetz(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) - True - - The object need not be a DatetimeIndex object. It just needs to have - a dtype which has a timezone component. - - >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern") - >>> s = pd.Series([], dtype=dtype) - >>> is_datetimetz(s) - True - """ - - warnings.warn( - "'is_datetimetz' is deprecated and will be removed in a " - "future version. Use 'is_datetime64tz_dtype' instead.", - FutureWarning, - stacklevel=2, - ) - return is_datetime64tz_dtype(arr) - - -def is_offsetlike(arr_or_obj): - """ - Check if obj or all elements of list-like is DateOffset - - Parameters - ---------- - arr_or_obj : object - - Returns - ------- - boolean - Whether the object is a DateOffset or listlike of DatetOffsets - - Examples - -------- - >>> is_offsetlike(pd.DateOffset(days=1)) - True - >>> is_offsetlike('offset') - False - >>> is_offsetlike([pd.offsets.Minute(4), pd.offsets.MonthEnd()]) - True - >>> is_offsetlike(np.array([pd.DateOffset(months=3), pd.Timestamp.now()])) - False - """ - if isinstance(arr_or_obj, ABCDateOffset): - return True - elif is_list_like(arr_or_obj) and len(arr_or_obj) and is_object_dtype(arr_or_obj): - return all(isinstance(x, ABCDateOffset) for x in arr_or_obj) - return False - - -def is_period(arr): - """ - Check whether an array-like is a periodical index. - - .. deprecated:: 0.24.0 - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a periodical index. - - Examples - -------- - >>> is_period([1, 2, 3]) - False - >>> is_period(pd.Index([1, 2, 3])) - False - >>> is_period(pd.PeriodIndex(["2017-01-01"], freq="D")) - True - """ - - warnings.warn( - "'is_period' is deprecated and will be removed in a future " - "version. Use 'is_period_dtype' or is_period_arraylike' " - "instead.", - FutureWarning, - stacklevel=2, - ) - - return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr) - - -def is_datetime64_dtype(arr_or_dtype): +def is_datetime64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the datetime64 dtype. 
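With the deprecated helpers removed above, the dtype-based checks are the remaining spelling; a brief sketch using the public pandas.api.types functions, with made-up example data:

    import pandas as pd
    from pandas.api.types import is_datetime64tz_dtype, is_period_dtype

    aware = pd.DatetimeIndex(["2019-12-31"], tz="US/Eastern")
    naive = pd.DatetimeIndex(["2019-12-31"])
    print(is_datetime64tz_dtype(aware))   # True
    print(is_datetime64tz_dtype(naive))   # False
    print(is_period_dtype(pd.PeriodIndex(["2019-12"], freq="M")))  # True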
@@ -525,7 +400,7 @@ def is_datetime64_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes(np.datetime64)) -def is_datetime64tz_dtype(arr_or_dtype): +def is_datetime64tz_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of a DatetimeTZDtype dtype. @@ -563,7 +438,7 @@ def is_datetime64tz_dtype(arr_or_dtype): return DatetimeTZDtype.is_dtype(arr_or_dtype) -def is_timedelta64_dtype(arr_or_dtype): +def is_timedelta64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the timedelta64 dtype. @@ -594,7 +469,7 @@ def is_timedelta64_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes(np.timedelta64)) -def is_period_dtype(arr_or_dtype): +def is_period_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Period dtype. @@ -628,7 +503,7 @@ def is_period_dtype(arr_or_dtype): return PeriodDtype.is_dtype(arr_or_dtype) -def is_interval_dtype(arr_or_dtype): +def is_interval_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Interval dtype. @@ -697,7 +572,7 @@ def is_categorical_dtype(arr_or_dtype) -> bool: return CategoricalDtype.is_dtype(arr_or_dtype) -def is_string_dtype(arr_or_dtype): +def is_string_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the string dtype. @@ -727,13 +602,20 @@ def is_string_dtype(arr_or_dtype): """ # TODO: gh-15585: consider making the checks stricter. - def condition(dtype): - return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype) + def condition(dtype) -> bool: + return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype) + + def is_excluded_dtype(dtype) -> bool: + """ + These have kind = "O" but aren't string dtypes so need to be explicitly excluded + """ + is_excluded_checks = (is_period_dtype, is_interval_dtype) + return any(is_excluded(dtype) for is_excluded in is_excluded_checks) return _is_dtype(arr_or_dtype, condition) -def is_period_arraylike(arr): +def is_period_arraylike(arr) -> bool: """ Check whether an array-like is a periodical array-like or PeriodIndex. @@ -765,7 +647,7 @@ def is_period_arraylike(arr): return getattr(arr, "inferred_type", None) == "period" -def is_datetime_arraylike(arr): +def is_datetime_arraylike(arr) -> bool: """ Check whether an array-like is a datetime array-like or DatetimeIndex. @@ -800,55 +682,7 @@ def is_datetime_arraylike(arr): return getattr(arr, "inferred_type", None) == "datetime" -def is_datetimelike(arr): - """ - Check whether an array-like is a datetime-like array-like. - - Acceptable datetime-like objects are (but not limited to) datetime - indices, periodic indices, and timedelta indices. - - Parameters - ---------- - arr : array-like - The array-like to check. - - Returns - ------- - boolean - Whether or not the array-like is a datetime-like array-like. 
- - Examples - -------- - >>> is_datetimelike([1, 2, 3]) - False - >>> is_datetimelike(pd.Index([1, 2, 3])) - False - >>> is_datetimelike(pd.DatetimeIndex([1, 2, 3])) - True - >>> is_datetimelike(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) - True - >>> is_datetimelike(pd.PeriodIndex([], freq="A")) - True - >>> is_datetimelike(np.array([], dtype=np.datetime64)) - True - >>> is_datetimelike(pd.Series([], dtype="timedelta64[ns]")) - True - >>> - >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern") - >>> s = pd.Series([], dtype=dtype) - >>> is_datetimelike(s) - True - """ - - return ( - is_datetime64_dtype(arr) - or is_datetime64tz_dtype(arr) - or is_timedelta64_dtype(arr) - or isinstance(arr, ABCPeriodIndex) - ) - - -def is_dtype_equal(source, target): +def is_dtype_equal(source, target) -> bool: """ Check if two dtypes are equal. @@ -938,7 +772,7 @@ def is_any_int_dtype(arr_or_dtype) -> bool: return _is_dtype_type(arr_or_dtype, classes(np.integer, np.timedelta64)) -def is_integer_dtype(arr_or_dtype): +def is_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of an integer dtype. @@ -993,7 +827,7 @@ def is_integer_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.integer)) -def is_signed_integer_dtype(arr_or_dtype): +def is_signed_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a signed integer dtype. @@ -1050,7 +884,7 @@ def is_signed_integer_dtype(arr_or_dtype): return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)) -def is_unsigned_integer_dtype(arr_or_dtype): +def is_unsigned_integer_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of an unsigned integer dtype. @@ -1099,7 +933,7 @@ def is_unsigned_integer_dtype(arr_or_dtype): ) -def is_int64_dtype(arr_or_dtype): +def is_int64_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the int64 dtype. @@ -1190,7 +1024,7 @@ def is_datetime64_any_dtype(arr_or_dtype) -> bool: return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) -def is_datetime64_ns_dtype(arr_or_dtype): +def is_datetime64_ns_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the datetime64[ns] dtype. @@ -1240,7 +1074,7 @@ def is_datetime64_ns_dtype(arr_or_dtype): return tipo == _NS_DTYPE or getattr(tipo, "base", None) == _NS_DTYPE -def is_timedelta64_ns_dtype(arr_or_dtype): +def is_timedelta64_ns_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of the timedelta64[ns] dtype. @@ -1271,7 +1105,7 @@ def is_timedelta64_ns_dtype(arr_or_dtype): return _is_dtype(arr_or_dtype, lambda dtype: dtype == _TD_DTYPE) -def is_datetime_or_timedelta_dtype(arr_or_dtype): +def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a timedelta64 or datetime64 dtype. @@ -1314,9 +1148,6 @@ def _is_unorderable_exception(e: TypeError) -> bool: """ Check if the exception raised is an unorderable exception. - The error message differs for 3 <= PY <= 3.5 and PY >= 3.6, so - we need to condition based on Python version. - Parameters ---------- e : Exception or sub-class @@ -1324,20 +1155,16 @@ def _is_unorderable_exception(e: TypeError) -> bool: Returns ------- - boolean + bool Whether or not the exception raised is an unorderable exception. 
""" - - if PY36: - return "'>' not supported between instances of" in str(e) - - return "unorderable" in str(e) + return "'>' not supported between instances of" in str(e) +# This exists to silence numpy deprecation warnings, see GH#29553 def is_numeric_v_string_like(a, b): """ Check if we are comparing a string-like object to a numeric ndarray. - NumPy doesn't like to compare such objects, especially numeric arrays and scalar string-likes. @@ -1383,8 +1210,8 @@ def is_numeric_v_string_like(a, b): is_a_string_array = is_a_array and is_string_like_dtype(a) is_b_string_array = is_b_array and is_string_like_dtype(b) - is_a_scalar_string_like = not is_a_array and is_string_like(a) - is_b_scalar_string_like = not is_b_array and is_string_like(b) + is_a_scalar_string_like = not is_a_array and isinstance(a, str) + is_b_scalar_string_like = not is_b_array and isinstance(b, str) return ( (is_a_numeric_array and is_b_scalar_string_like) @@ -1394,10 +1221,10 @@ def is_numeric_v_string_like(a, b): ) +# This exists to silence numpy deprecation warnings, see GH#29553 def is_datetimelike_v_numeric(a, b): """ Check if we are comparing a datetime-like object to a numeric object. - By "numeric," we mean an object that is either of an int or float dtype. Parameters @@ -1447,13 +1274,12 @@ def is_numeric(x): """ return is_integer_dtype(x) or is_float_dtype(x) - is_datetimelike = needs_i8_conversion - return (is_datetimelike(a) and is_numeric(b)) or ( - is_datetimelike(b) and is_numeric(a) + return (needs_i8_conversion(a) and is_numeric(b)) or ( + needs_i8_conversion(b) and is_numeric(a) ) -def needs_i8_conversion(arr_or_dtype): +def needs_i8_conversion(arr_or_dtype) -> bool: """ Check whether the array or dtype should be converted to int64. @@ -1497,7 +1323,7 @@ def needs_i8_conversion(arr_or_dtype): ) -def is_numeric_dtype(arr_or_dtype): +def is_numeric_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a numeric dtype. @@ -1540,7 +1366,7 @@ def is_numeric_dtype(arr_or_dtype): ) -def is_string_like_dtype(arr_or_dtype): +def is_string_like_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a string-like dtype. @@ -1572,7 +1398,7 @@ def is_string_like_dtype(arr_or_dtype): return _is_dtype(arr_or_dtype, lambda dtype: dtype.kind in ("S", "U")) -def is_float_dtype(arr_or_dtype): +def is_float_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a float dtype. @@ -1643,7 +1469,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: True >>> is_bool_dtype(pd.Categorical([True, False])) True - >>> is_bool_dtype(pd.SparseArray([True, False])) + >>> is_bool_dtype(pd.arrays.SparseArray([True, False])) True """ if arr_or_dtype is None: @@ -1671,10 +1497,13 @@ def is_bool_dtype(arr_or_dtype) -> bool: return issubclass(dtype.type, np.bool_) -def is_extension_type(arr): +def is_extension_type(arr) -> bool: """ Check whether an array-like is of a pandas extension class instance. + .. deprecated:: 1.0.0 + Use ``is_extension_array_dtype`` instead. + Extension classes include categoricals, pandas sparse objects (i.e. classes represented within the pandas library and not ones external to it like scipy sparse matrices), and datetime-like arrays. 
@@ -1702,7 +1531,7 @@ def is_extension_type(arr): True >>> is_extension_type(pd.Series(cat)) True - >>> is_extension_type(pd.SparseArray([1, 2, 3])) + >>> is_extension_type(pd.arrays.SparseArray([1, 2, 3])) True >>> from scipy.sparse import bsr_matrix >>> is_extension_type(bsr_matrix([1, 2, 3])) @@ -1717,6 +1546,12 @@ def is_extension_type(arr): >>> is_extension_type(s) True """ + warnings.warn( + "'is_extension_type' is deprecated and will be removed in a future " + "version. Use 'is_extension_array_dtype' instead.", + FutureWarning, + stacklevel=2, + ) if is_categorical(arr): return True @@ -1727,7 +1562,7 @@ def is_extension_type(arr): return False -def is_extension_array_dtype(arr_or_dtype): +def is_extension_array_dtype(arr_or_dtype) -> bool: """ Check if an object is a pandas extension array type. @@ -1998,10 +1833,12 @@ def _validate_date_like_dtype(dtype) -> None: try: typ = np.datetime_data(dtype)[0] except ValueError as e: - raise TypeError("{error}".format(error=e)) + raise TypeError(e) if typ != "generic" and typ != "ns": - msg = "{name!r} is too specific of a frequency, try passing {type!r}" - raise ValueError(msg.format(name=dtype.name, type=dtype.type.__name__)) + raise ValueError( + f"{repr(dtype.name)} is too specific of a frequency, " + f"try passing {repr(dtype.type.__name__)}" + ) def pandas_dtype(dtype): @@ -2037,7 +1874,7 @@ def pandas_dtype(dtype): npdtype = np.dtype(dtype) except SyntaxError: # np.dtype uses `eval` which can raise SyntaxError - raise TypeError("data type '{}' not understood".format(dtype)) + raise TypeError(f"data type '{dtype}' not understood") # Any invalid dtype (such as pd.Timestamp) should raise an error. # np.dtype(invalid_type).kind = 0 for such objects. However, this will @@ -2049,6 +1886,6 @@ def pandas_dtype(dtype): # here and `dtype` is an array return npdtype elif npdtype.kind == "O": - raise TypeError("dtype '{}' not understood".format(dtype)) + raise TypeError(f"dtype '{dtype}' not understood") return npdtype diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f2176f573207c..cd4b5af4588e5 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -69,7 +69,7 @@ def get_dtype_kinds(l): return typs -def concat_compat(to_concat, axis=0): +def concat_compat(to_concat, axis: int = 0): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a @@ -88,7 +88,7 @@ def concat_compat(to_concat, axis=0): # filter empty arrays # 1-d dtypes always are included here - def is_nonempty(x): + def is_nonempty(x) -> bool: if x.ndim <= axis: return True return x.shape[axis] > 0 @@ -137,7 +137,7 @@ def is_nonempty(x): return np.concatenate(to_concat, axis=axis) -def concat_categorical(to_concat, axis=0): +def concat_categorical(to_concat, axis: int = 0): """Concatenate an object/categorical array of arrays, each of which is a single dtype @@ -183,15 +183,18 @@ def concat_categorical(to_concat, axis=0): return result -def union_categoricals(to_union, sort_categories=False, ignore_order=False): +def union_categoricals( + to_union, sort_categories: bool = False, ignore_order: bool = False +): """ - Combine list-like of Categorical-like, unioning categories. All - categories must have the same dtype. + Combine list-like of Categorical-like, unioning categories. + + All categories must have the same dtype. 
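For reference, a minimal use of the function documented above, assuming the usual public import from pandas.api.types:

    import pandas as pd
    from pandas.api.types import union_categoricals

    a = pd.Categorical(["b", "c"])
    b = pd.Categorical(["a", "b"])
    combined = union_categoricals([a, b])
    print(list(combined))             # ['b', 'c', 'a', 'b']
    print(list(combined.categories))  # ['b', 'c', 'a'], in order of appearance
    print(list(union_categoricals([a, b], sort_categories=True).categories))
    # ['a', 'b', 'c']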
Parameters ---------- - to_union : list-like of Categorical, CategoricalIndex, - or Series with dtype='category' + to_union : list-like + Categorical, CategoricalIndex, or Series with dtype='category'. sort_categories : bool, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. @@ -201,7 +204,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): Returns ------- - result : Categorical + Categorical Raises ------ @@ -217,7 +220,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): ----- To learn more about categories, see `link - `__ + `__ Examples -------- @@ -354,7 +357,7 @@ def _maybe_unwrap(x): return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) -def _concatenate_2d(to_concat, axis): +def _concatenate_2d(to_concat, axis: int): # coerce to 2d if needed & concatenate if axis == 1: to_concat = [np.atleast_2d(x) for x in to_concat] diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 7dca588e33839..466ed815e8e5a 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1,27 +1,20 @@ """ define extension dtypes """ import re from typing import Any, Dict, List, MutableMapping, Optional, Tuple, Type, Union, cast -import warnings import numpy as np import pytz from pandas._libs.interval import Interval from pandas._libs.tslibs import NaT, Period, Timestamp, timezones - -from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass - from pandas._typing import Ordered -from .base import ExtensionDtype -from .inference import is_bool, is_list_like +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass +from pandas.core.dtypes.inference import is_bool, is_list_like str_type = str -# GH26403: sentinel value used for the default value of ordered in the -# CategoricalDtype constructor to detect when ordered=None is explicitly passed -ordered_sentinel = object() # type: object - def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: """ @@ -51,7 +44,7 @@ def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: class Registry: """ - Registry for dtype inference + Registry for dtype inference. The registry allows one to map a string repr of a extension dtype to an extension dtype. The string alias can be used in several @@ -66,7 +59,7 @@ class Registry: """ def __init__(self): - self.dtypes = [] # type: List[Type[ExtensionDtype]] + self.dtypes: List[Type[ExtensionDtype]] = [] def register(self, dtype: Type[ExtensionDtype]) -> None: """ @@ -119,21 +112,21 @@ class PandasExtensionDtype(ExtensionDtype): THIS IS NOT A REAL NUMPY DTYPE """ - type = None # type: Any - kind = None # type: Any + type: Any + kind: Any # The Any type annotations above are here only because mypy seems to have a # problem dealing with with multiple inheritance from PandasExtensionDtype # and ExtensionDtype's @properties in the subclasses below. The kind and # type variables in those subclasses are explicitly typed below. subdtype = None - str = None # type: Optional[str_type] + str: Optional[str_type] = None num = 100 - shape = tuple() # type: Tuple[int, ...] + shape: Tuple[int, ...] 
= tuple() itemsize = 8 base = None isbuiltin = 0 isnative = 0 - _cache = {} # type: Dict[str_type, 'PandasExtensionDtype'] + _cache: Dict[str_type, "PandasExtensionDtype"] = {} def __str__(self) -> str_type: """ @@ -179,7 +172,11 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): ---------- categories : sequence, optional Must be unique, and must not contain any nulls. - ordered : bool, default False + ordered : bool or None, default False + Whether or not this categorical is treated as a ordered categorical. + None can be used to maintain the ordered value of existing categoricals when + used in operations that combine categoricals, e.g. astype, and will resolve to + False if there is no existing ordered to maintain. Attributes ---------- @@ -214,18 +211,14 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): # TODO: Document public vs. private API name = "category" - type = CategoricalDtypeType # type: Type[CategoricalDtypeType] - kind = "O" # type: str_type + type: Type[CategoricalDtypeType] = CategoricalDtypeType + kind: str_type = "O" str = "|O08" base = np.dtype("O") - _metadata = ("categories", "ordered", "_ordered_from_sentinel") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] - - def __init__( - self, categories=None, ordered: Union[Ordered, object] = ordered_sentinel - ): - # TODO(GH26403): Set type of ordered to Ordered - ordered = cast(Ordered, ordered) + _metadata = ("categories", "ordered") + _cache: Dict[str_type, PandasExtensionDtype] = {} + + def __init__(self, categories=None, ordered: Ordered = False): self._finalize(categories, ordered, fastpath=False) @classmethod @@ -319,8 +312,7 @@ def _from_values_or_dtype( if dtype == "category": dtype = CategoricalDtype(categories, ordered) else: - msg = "Unknown dtype {dtype!r}" - raise ValueError(msg.format(dtype=dtype)) + raise ValueError(f"Unknown dtype {repr(dtype)}") elif categories is not None or ordered is not None: raise ValueError( "Cannot specify `categories` or `ordered` together with `dtype`." @@ -339,17 +331,45 @@ def _from_values_or_dtype( return dtype + @classmethod + def construct_from_string(cls, string: str_type) -> "CategoricalDtype": + """ + Construct a CategoricalDtype from a string. + + Parameters + ---------- + string : str + Must be the string "category" in order to be successfully constructed. + + Returns + ------- + CategoricalDtype + Instance of the dtype. + + Raises + ------ + TypeError + If a CategoricalDtype cannot be constructed from the input. + """ + if not isinstance(string, str): + raise TypeError(f"Expects a string, got {type(string)}") + if string != cls.name: + raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'") + + # need ordered=None to ensure that operations specifying dtype="category" don't + # override the ordered value for existing categoricals + return cls(ordered=None) + def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None: - if ordered is not None and ordered is not ordered_sentinel: + if ordered is not None: self.validate_ordered(ordered) if categories is not None: categories = self.validate_categories(categories, fastpath=fastpath) self._categories = categories - self._ordered = ordered if ordered is not ordered_sentinel else None - self._ordered_from_sentinel = ordered is ordered_sentinel + self._ordered = ordered def __setstate__(self, state: MutableMapping[str_type, Any]) -> None: # for pickle compat. 
__get_state__ is defined in the @@ -357,18 +377,17 @@ def __setstate__(self, state: MutableMapping[str_type, Any]) -> None: # pickle -> need to set the settable private ones here (see GH26067) self._categories = state.pop("categories", None) self._ordered = state.pop("ordered", False) - self._ordered_from_sentinel = state.pop("_ordered_from_sentinel", False) def __hash__(self) -> int: # _hash_categories returns a uint64, so use the negative # space for when we have unknown categories to avoid a conflict if self.categories is None: - if self._ordered: + if self.ordered: return -1 else: return -2 # We *do* want to include the real self.ordered here - return int(self._hash_categories(self.categories, self._ordered)) + return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other: Any) -> bool: """ @@ -387,7 +406,7 @@ def __eq__(self, other: Any) -> bool: return other == self.name elif other is self: return True - elif not (hasattr(other, "_ordered") and hasattr(other, "categories")): + elif not (hasattr(other, "ordered") and hasattr(other, "categories")): return False elif self.categories is None or other.categories is None: # We're forced into a suboptimal corner thanks to math and @@ -396,10 +415,10 @@ def __eq__(self, other: Any) -> bool: # CDT(., .) = CDT(None, False) and *all* # CDT(., .) = CDT(None, True). return True - elif self._ordered or other._ordered: + elif self.ordered or other.ordered: # At least one has ordered=True; equal if both have ordered=True # and the same values for categories in the same order. - return (self._ordered == other._ordered) and self.categories.equals( + return (self.ordered == other.ordered) and self.categories.equals( other.categories ) else: @@ -415,13 +434,13 @@ def __eq__(self, other: Any) -> bool: return True return hash(self) == hash(other) - def __repr__(self): - tpl = "CategoricalDtype(categories={}ordered={})" + def __repr__(self) -> str_type: + tpl = "CategoricalDtype(categories={data}ordered={ordered})" if self.categories is None: data = "None, " else: - data = self.categories._format_data(name=self.__class__.__name__) - return tpl.format(data, self._ordered) + data = self.categories._format_data(name=type(self).__name__) + return tpl.format(data=data, ordered=self.ordered) @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: @@ -465,7 +484,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: @classmethod def construct_array_type(cls): """ - Return the array type associated with this dtype + Return the array type associated with this dtype. 
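A compact illustration of the equality rules encoded above (comparison against the string name, and order-insensitive comparison when neither side is ordered); the dtypes below are made up for the example:

    import pandas as pd

    ordered = pd.CategoricalDtype(["a", "b"], ordered=True)
    print(ordered == "category")                                     # True
    print(ordered == pd.CategoricalDtype(["b", "a"], ordered=True))  # False, order matters
    print(pd.CategoricalDtype(["a", "b"]) == pd.CategoricalDtype(["b", "a"]))  # True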
Returns ------- @@ -512,8 +531,9 @@ def validate_categories(categories, fastpath: bool = False): from pandas.core.indexes.base import Index if not fastpath and not is_list_like(categories): - msg = "Parameter 'categories' must be list-like, was {!r}" - raise TypeError(msg.format(categories)) + raise TypeError( + f"Parameter 'categories' must be list-like, was {repr(categories)}" + ) elif not isinstance(categories, ABCIndexClass): categories = Index(categories, tupleize_cols=False) @@ -549,35 +569,19 @@ def update_dtype( # dtype='category' should not change anything return self elif not self.is_dtype(dtype): - msg = ( - "a CategoricalDtype must be passed to perform an update, " - "got {dtype!r}" - ).format(dtype=dtype) - raise ValueError(msg) + raise ValueError( + f"a CategoricalDtype must be passed to perform an update, " + f"got {repr(dtype)}" + ) else: # from here on, dtype is a CategoricalDtype dtype = cast(CategoricalDtype, dtype) - # dtype is CDT: keep current categories/ordered if None - new_categories = dtype.categories - if new_categories is None: - new_categories = self.categories - - new_ordered = dtype._ordered - new_ordered_from_sentinel = dtype._ordered_from_sentinel - if new_ordered is None: - # maintain existing ordered if new dtype has ordered=None - new_ordered = self._ordered - if self._ordered and new_ordered_from_sentinel: - # only warn if we'd actually change the existing behavior - msg = ( - "Constructing a CategoricalDtype without specifying " - "`ordered` will default to `ordered=False` in a future " - "version, which will cause the resulting categorical's " - "`ordered` attribute to change to False; `ordered=True`" - " must be explicitly passed in order to be retained" - ) - warnings.warn(msg, FutureWarning, stacklevel=3) + # update categories/ordered unless they've been explicitly passed as None + new_categories = ( + dtype.categories if dtype.categories is not None else self.categories + ) + new_ordered = dtype.ordered if dtype.ordered is not None else self.ordered return CategoricalDtype(new_categories, new_ordered) @@ -593,16 +597,6 @@ def ordered(self) -> Ordered: """ Whether the categories have an ordered relationship. """ - # TODO: remove if block when ordered=None as default is deprecated - if self._ordered_from_sentinel and self._ordered is None: - # warn when accessing ordered if ordered=None and None was not - # explicitly passed to the constructor - msg = ( - "Constructing a CategoricalDtype without specifying " - "`ordered` will default to `ordered=False` in a future " - "version; `ordered=None` must be explicitly passed." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) return self._ordered @property @@ -650,15 +644,15 @@ class DatetimeTZDtype(PandasExtensionDtype): datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')] """ - type = Timestamp # type: Type[Timestamp] - kind = "M" # type: str_type + type: Type[Timestamp] = Timestamp + kind: str_type = "M" str = "|M8[ns]" num = 101 base = np.dtype("M8[ns]") na_value = NaT _metadata = ("unit", "tz") _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __init__(self, unit="ns", tz=None): if isinstance(unit, DatetimeTZDtype): @@ -672,11 +666,11 @@ def __init__(self, unit="ns", tz=None): unit = result.unit tz = result.tz msg = ( - "Passing a dtype alias like 'datetime64[ns, {tz}]' " - "to DatetimeTZDtype is deprecated. 
Use " + f"Passing a dtype alias like 'datetime64[ns, {tz}]' " + "to DatetimeTZDtype is no longer supported. Use " "'DatetimeTZDtype.construct_from_string()' instead." ) - warnings.warn(msg.format(tz=tz), FutureWarning, stacklevel=2) + raise ValueError(msg) else: raise ValueError("DatetimeTZDtype only supports ns units") @@ -708,7 +702,7 @@ def tz(self): @classmethod def construct_array_type(cls): """ - Return the array type associated with this dtype + Return the array type associated with this dtype. Returns ------- @@ -719,7 +713,7 @@ def construct_array_type(cls): return DatetimeArray @classmethod - def construct_from_string(cls, string): + def construct_from_string(cls, string: str_type): """ Construct a DatetimeTZDtype from a string. @@ -736,7 +730,7 @@ def construct_from_string(cls, string): datetime64[ns, UTC] """ if isinstance(string, str): - msg = "Could not construct DatetimeTZDtype from '{}'" + msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'" match = cls._match.match(string) if match: d = match.groupdict() @@ -747,25 +741,25 @@ def construct_from_string(cls, string): # pytz timezone (actually pytz.UnknownTimeZoneError). # TypeError if we pass a nonsense tz; # ValueError if we pass a unit other than "ns" - raise TypeError(msg.format(string)) from err - raise TypeError(msg.format(string)) + raise TypeError(msg) from err + raise TypeError(msg) - raise TypeError("Could not construct DatetimeTZDtype") + raise TypeError("Cannot construct a 'DatetimeTZDtype'") - def __str__(self): - return "datetime64[{unit}, {tz}]".format(unit=self.unit, tz=self.tz) + def __str__(self) -> str_type: + return f"datetime64[{self.unit}, {self.tz}]" @property - def name(self): + def name(self) -> str_type: """A string representation of the dtype.""" return str(self) - def __hash__(self): + def __hash__(self) -> int: # make myself hashable # TODO: update this. 
return hash(str(self)) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if isinstance(other, str): return other == self.name @@ -812,14 +806,14 @@ class PeriodDtype(PandasExtensionDtype): period[M] """ - type = Period # type: Type[Period] - kind = "O" # type: str_type + type: Type[Period] = Period + kind: str_type = "O" str = "|O08" base = np.dtype("O") num = 102 _metadata = ("freq",) _match = re.compile(r"(P|p)eriod\[(?P.+)\]") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __new__(cls, freq=None): """ @@ -887,24 +881,28 @@ def construct_from_string(cls, string): return cls(freq=string) except ValueError: pass - raise TypeError("could not construct PeriodDtype") + if isinstance(string, str): + msg = f"Cannot construct a 'PeriodDtype' from '{string}'" + else: + msg = f"'construct_from_string' expects a string, got {type(string)}" + raise TypeError(msg) - def __str__(self): + def __str__(self) -> str_type: return self.name @property - def name(self): - return "period[{freq}]".format(freq=self.freq.freqstr) + def name(self) -> str_type: + return f"period[{self.freq.freqstr}]" @property def na_value(self): return NaT - def __hash__(self): + def __hash__(self) -> int: # make myself hashable return hash(str(self)) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if isinstance(other, str): return other == self.name or other == self.name.title() @@ -917,7 +915,7 @@ def __setstate__(self, state): self._freq = state["freq"] @classmethod - def is_dtype(cls, dtype): + def is_dtype(cls, dtype) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) @@ -940,10 +938,37 @@ def is_dtype(cls, dtype): @classmethod def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ from pandas.core.arrays import PeriodArray return PeriodArray + def __from_arrow__(self, array): + """Construct PeriodArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import PeriodArray + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64") + parr = PeriodArray(data.copy(), freq=self.freq, copy=False) + parr[~mask] = NaT + results.append(parr) + + return PeriodArray._concat_same_type(results) + @register_extension_dtype class IntervalDtype(PandasExtensionDtype): @@ -972,13 +997,13 @@ class IntervalDtype(PandasExtensionDtype): """ name = "interval" - kind = None # type: Optional[str_type] + kind: str_type = "O" str = "|O08" base = np.dtype("O") num = 103 _metadata = ("subtype",) _match = re.compile(r"(I|i)nterval\[(?P.+)\]") - _cache = {} # type: Dict[str_type, PandasExtensionDtype] + _cache: Dict[str_type, PandasExtensionDtype] = {} def __new__(cls, subtype=None): from pandas.core.dtypes.common import ( @@ -1034,7 +1059,7 @@ def subtype(self): @classmethod def construct_array_type(cls): """ - Return the array type associated with this dtype + Return the array type associated with this dtype. 
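The stricter construct_from_string messages above still accept the usual aliases; a short sketch of the public constructors and string round-trips, assuming pandas_dtype is taken from the pandas.api.types re-export:

    import pandas as pd
    from pandas.api.types import pandas_dtype

    print(pd.DatetimeTZDtype(tz="UTC"))   # datetime64[ns, UTC]
    print(pd.PeriodDtype("M"))            # period[M]
    print(pd.IntervalDtype("int64"))      # interval[int64]
    print(pandas_dtype("period[M]") == pd.PeriodDtype("M"))                     # True
    print(pandas_dtype("datetime64[ns, UTC]") == pd.DatetimeTZDtype(tz="UTC"))  # True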
Returns ------- @@ -1051,13 +1076,13 @@ def construct_from_string(cls, string): if its not possible """ if not isinstance(string, str): - msg = "a string needs to be passed, got type {typ}" - raise TypeError(msg.format(typ=type(string))) + raise TypeError(f"a string needs to be passed, got type {type(string)}") if string.lower() == "interval" or cls._match.search(string) is not None: return cls(string) msg = ( + f"Cannot construct a 'IntervalDtype' from '{string}'.\n\n" "Incorrectly formatted string passed to constructor. " "Valid formats include Interval or Interval[dtype] " "where dtype is numeric, datetime, or timedelta" @@ -1068,16 +1093,16 @@ def construct_from_string(cls, string): def type(self): return Interval - def __str__(self): + def __str__(self) -> str_type: if self.subtype is None: return "interval" - return "interval[{subtype}]".format(subtype=self.subtype) + return f"interval[{self.subtype}]" - def __hash__(self): + def __hash__(self) -> int: # make myself hashable return hash(str(self)) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if isinstance(other, str): return other.lower() in (self.name.lower(), str(self).lower()) elif not isinstance(other, IntervalDtype): @@ -1097,7 +1122,7 @@ def __setstate__(self, state): self._subtype = state["subtype"] @classmethod - def is_dtype(cls, dtype): + def is_dtype(cls, dtype) -> bool: """ Return a boolean if we if the passed type is an actual dtype that we can match (via string or type) @@ -1115,3 +1140,22 @@ def is_dtype(cls, dtype): else: return False return super().is_dtype(dtype) + + def __from_arrow__(self, array): + """Construct IntervalArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import IntervalArray + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + left = np.asarray(arr.storage.field("left"), dtype=self.subtype) + right = np.asarray(arr.storage.field("right"), dtype=self.subtype) + iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed) + results.append(iarr) + + return IntervalArray._concat_same_type(results) diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 2518f330b26a3..4c3f8b7374465 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -4,8 +4,11 @@ # define abstract base classes to enable isinstance type checking on our # objects def create_pandas_abc_type(name, attr, comp): - @classmethod - def _check(cls, inst): + + # https://github.com/python/mypy/issues/1006 + # error: 'classmethod' used with a non-method + @classmethod # type: ignore + def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp dct = dict(__instancecheck__=_check, __subclasscheck__=_check) @@ -74,7 +77,7 @@ def _check(cls, inst): class _ABCGeneric(type): - def __instancecheck__(cls, inst): + def __instancecheck__(cls, inst) -> bool: return hasattr(inst, "_data") diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index e69e703f3a96c..9e9278052e35d 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -26,7 +26,7 @@ is_list_like = lib.is_list_like -def is_number(obj): +def is_number(obj) -> bool: """ Check if the object is a number. @@ -67,31 +67,7 @@ def is_number(obj): return isinstance(obj, (Number, np.number)) -def is_string_like(obj): - """ - Check if the object is a string. 
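As an example of the typed inference helpers above, a quick sketch of is_number via the public re-export; booleans count as numbers because bool is a subclass of int:

    from pandas.api.types import is_number

    print(is_number(5))      # True
    print(is_number(5.5))    # True
    print(is_number("5"))    # False
    print(is_number(True))   # True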
- - Parameters - ---------- - obj : The object to check - - Examples - -------- - >>> is_string_like("foo") - True - >>> is_string_like(1) - False - - Returns - ------- - is_str_like : bool - Whether `obj` is a string or not. - """ - - return isinstance(obj, str) - - -def _iterable_not_string(obj): +def _iterable_not_string(obj) -> bool: """ Check if the object is an iterable but not a string. @@ -117,7 +93,7 @@ def _iterable_not_string(obj): return isinstance(obj, abc.Iterable) and not isinstance(obj, str) -def is_iterator(obj): +def is_iterator(obj) -> bool: """ Check if the object is an iterator. @@ -151,7 +127,7 @@ def is_iterator(obj): return hasattr(obj, "__next__") -def is_file_like(obj): +def is_file_like(obj) -> bool: """ Check if the object is a file-like object. @@ -189,7 +165,7 @@ def is_file_like(obj): return True -def is_re(obj): +def is_re(obj) -> bool: """ Check if the object is a regex pattern instance. @@ -212,7 +188,7 @@ def is_re(obj): return isinstance(obj, Pattern) -def is_re_compilable(obj): +def is_re_compilable(obj) -> bool: """ Check if the object can be compiled into a regex pattern instance. @@ -241,7 +217,7 @@ def is_re_compilable(obj): return True -def is_array_like(obj): +def is_array_like(obj) -> bool: """ Check if the object is array-like. @@ -274,7 +250,7 @@ def is_array_like(obj): return is_list_like(obj) and hasattr(obj, "dtype") -def is_nested_list_like(obj): +def is_nested_list_like(obj) -> bool: """ Check if the object is list-like, and that all of its elements are also list-like. @@ -320,7 +296,7 @@ def is_nested_list_like(obj): ) -def is_dict_like(obj): +def is_dict_like(obj) -> bool: """ Check if the object is dict-like. @@ -352,7 +328,7 @@ def is_dict_like(obj): ) -def is_named_tuple(obj): +def is_named_tuple(obj) -> bool: """ Check if the object is a named tuple. @@ -379,7 +355,7 @@ def is_named_tuple(obj): return isinstance(obj, tuple) and hasattr(obj, "_fields") -def is_hashable(obj): +def is_hashable(obj) -> bool: """ Return True if hash(obj) will succeed, False otherwise. @@ -416,7 +392,7 @@ def is_hashable(obj): return True -def is_sequence(obj): +def is_sequence(obj) -> bool: """ Check if the object is a sequence of objects. String types are not included as sequences here. diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 322011eb8e263..fb579f2f58a57 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -9,7 +9,7 @@ import pandas._libs.missing as libmissing from pandas._libs.tslibs import NaT, iNaT -from .common import ( +from pandas.core.dtypes.common import ( _NS_DTYPE, _TD_DTYPE, ensure_object, @@ -17,7 +17,6 @@ is_complex_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetimelike, is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, @@ -32,7 +31,7 @@ needs_i8_conversion, pandas_dtype, ) -from .generic import ( +from pandas.core.dtypes.generic import ( ABCDatetimeArray, ABCExtensionArray, ABCGeneric, @@ -41,7 +40,7 @@ ABCSeries, ABCTimedeltaArray, ) -from .inference import is_list_like +from pandas.core.dtypes.inference import is_list_like isposinf_scalar = libmissing.isposinf_scalar isneginf_scalar = libmissing.isneginf_scalar @@ -81,6 +80,9 @@ def isna(obj): >>> pd.isna('dog') False + >>> pd.isna(pd.NA) + True + >>> pd.isna(np.nan) True @@ -159,7 +161,8 @@ def _isna_new(obj): def _isna_old(obj): - """Detect missing values. Treat None, NaN, INF, -INF as null. + """ + Detect missing values, treating None, NaN, INF, -INF as null. 
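The new doctest lines above add pd.NA to the missing-value checks; a compact usage sketch:

    import numpy as np
    import pandas as pd

    print(pd.isna(pd.NA))    # True
    print(pd.isna(np.nan))   # True
    print(pd.isna(None))     # True
    print(pd.isna("dog"))    # False
    print(pd.notna(pd.NA))   # False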
Parameters ---------- @@ -176,7 +179,7 @@ def _isna_old(obj): raise NotImplementedError("isna is not defined for MultiIndex") elif isinstance(obj, type): return False - elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)): return _isna_ndarraylike_old(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isna(func=_isna_old)) @@ -192,7 +195,9 @@ def _isna_old(obj): def _use_inf_as_na(key): - """Option change callback for na/inf behaviour + """ + Option change callback for na/inf behaviour. + Choose which replacement for numpy.isnan / -numpy.isfinite is used. Parameters @@ -207,7 +212,7 @@ def _use_inf_as_na(key): This approach to setting global module values is discussed and approved here: - * http://stackoverflow.com/questions/4859217/ + * https://stackoverflow.com/questions/4859217/ programmatically-creating-variables-in-python/4859312#4859312 """ flag = get_option(key) @@ -325,6 +330,9 @@ def notna(obj): >>> pd.notna('dog') True + >>> pd.notna(pd.NA) + False + >>> pd.notna(np.nan) False @@ -374,7 +382,7 @@ def notna(obj): notnull = notna -def _isna_compat(arr, fill_value=np.nan): +def _isna_compat(arr, fill_value=np.nan) -> bool: """ Parameters ---------- @@ -391,7 +399,7 @@ def _isna_compat(arr, fill_value=np.nan): return True -def array_equivalent(left, right, strict_nan=False): +def array_equivalent(left, right, strict_nan: bool = False) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and @@ -442,17 +450,22 @@ def array_equivalent(left, right, strict_nan=False): if left_value is NaT and right_value is not NaT: return False + elif left_value is libmissing.NA and right_value is not libmissing.NA: + return False + elif isinstance(left_value, float) and np.isnan(left_value): if not isinstance(right_value, float) or not np.isnan(right_value): return False else: try: - if np.any(left_value != right_value): + if np.any(np.asarray(left_value != right_value)): return False except TypeError as err: if "Cannot compare tz-naive" in str(err): # tzawareness compat failure, see GH#28507 return False + elif "boolean value of NA is ambiguous" in str(err): + return False raise return True @@ -464,12 +477,12 @@ def array_equivalent(left, right, strict_nan=False): return True return ((left == right) | (isna(left) & isna(right))).all() - # numpy will will not allow this type of datetimelike vs integer comparison elif is_datetimelike_v_numeric(left, right): + # GH#29553 avoid numpy deprecation warning return False - # M8/m8 - elif needs_i8_conversion(left) and needs_i8_conversion(right): + elif needs_i8_conversion(left) or needs_i8_conversion(right): + # datetime64, timedelta64, Period if not is_dtype_equal(left.dtype, right.dtype): return False @@ -494,7 +507,7 @@ def _infer_fill_value(val): if not is_list_like(val): val = [val] val = np.array(val, copy=False) - if is_datetimelike(val): + if needs_i8_conversion(val): return np.array("NaT", dtype=val.dtype) elif is_object_dtype(val.dtype): dtype = lib.infer_dtype(ensure_object(val), skipna=False) @@ -514,7 +527,7 @@ def _maybe_fill(arr, fill_value=np.nan): return arr -def na_value_for_dtype(dtype, compat=True): +def na_value_for_dtype(dtype, compat: bool = True): """ Return a dtype compat na value @@ -572,7 +585,7 @@ def remove_na_arraylike(arr): return arr[notna(lib.values_from_object(arr))] -def is_valid_nat_for_dtype(obj, dtype): +def 
is_valid_nat_for_dtype(obj, dtype) -> bool: """ isna check that excludes incompatible dtypes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 40efc4c65476a..5ad133f9e21a4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9,12 +9,15 @@ labeling information """ import collections -from collections import OrderedDict, abc +from collections import abc from io import StringIO import itertools import sys from textwrap import dedent from typing import ( + IO, + TYPE_CHECKING, + Any, FrozenSet, Hashable, Iterable, @@ -25,6 +28,7 @@ Tuple, Type, Union, + cast, ) import warnings @@ -34,7 +38,9 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib -from pandas.compat import PY36 +from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer +from pandas.compat import PY37 +from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -67,11 +73,9 @@ ensure_platform_int, infer_dtype_from_object, is_bool_dtype, - is_datetime64_any_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, - is_extension_type, is_float_dtype, is_hashable, is_integer, @@ -79,7 +83,6 @@ is_iterator, is_list_like, is_named_tuple, - is_nested_list_like, is_object_dtype, is_scalar, is_sequence, @@ -93,15 +96,15 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas._typing import Axes, Dtype, FilePathOrBuffer from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import Index, ensure_index, ensure_index_from_sequences +from pandas.core.groupby import generic as groupby_generic from pandas.core.indexes import base as ibase +from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.multi import maybe_droplevels from pandas.core.indexes.period import PeriodIndex @@ -120,10 +123,14 @@ from pandas.core.ops.missing import dispatch_fill_zeros from pandas.core.series import Series +from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt from pandas.io.formats.printing import pprint_thing import pandas.plotting +if TYPE_CHECKING: + from pandas.io.formats.style import Styler + # --------------------------------------------------------------------- # Docstring templates @@ -139,11 +146,12 @@ Name or list of names to sort by. - if `axis` is 0 or `'index'` then `by` may contain index - levels and/or column labels + levels and/or column labels. - if `axis` is 1 or `'columns'` then `by` may contain column - levels and/or index labels + levels and/or index labels. .. versionchanged:: 0.23.0 + Allow specifying index or column level names.""", versionadded_to_excel="", optional_labels="""labels : array-like, optional @@ -345,8 +353,9 @@ class DataFrame(NDFrame): -------- DataFrame.from_records : Constructor from tuples, also record arrays. DataFrame.from_dict : From dicts of Series, arrays, or dicts. - DataFrame.from_items : From sequence of (key, value) pairs - read_csv, pandas.read_table, pandas.read_clipboard. 
+ read_csv + read_table + read_clipboard Examples -------- @@ -385,15 +394,15 @@ class DataFrame(NDFrame): 2 7 8 9 """ + _typ = "dataframe" + @property def _constructor(self) -> Type["DataFrame"]: return DataFrame - _constructor_sliced = Series # type: Type[Series] - _deprecations = NDFrame._deprecations | frozenset( - ["from_items"] - ) # type: FrozenSet[str] - _accessors = set() # type: Set[str] + _constructor_sliced: Type[Series] = Series + _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([]) + _accessors: Set[str] = {"sparse"} @property def _constructor_expanddim(self): @@ -456,7 +465,7 @@ def __init__( # For data is list-like, or Iterable (will consume into list) elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): - if not isinstance(data, abc.Sequence): + if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: @@ -485,7 +494,7 @@ def __init__( except (ValueError, TypeError) as e: exc = TypeError( "DataFrame constructor called with " - "incompatible data and dtype: {e}".format(e=e) + f"incompatible data and dtype: {e}" ) raise exc from e @@ -599,7 +608,6 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: users expect. display.max_columns remains in effect. GH3541, GH3573 """ - width, height = console.get_console_size() max_columns = get_option("display.max_columns") nb_columns = len(self.columns) @@ -814,7 +822,7 @@ def to_string( # ---------------------------------------------------------------------- @property - def style(self): + def style(self) -> "Styler": """ Returns a Styler object. @@ -858,9 +866,9 @@ def style(self): ... index=['panda', 'polar', 'koala']) >>> df species population - panda bear 1864 - polar bear 22000 - koala marsupial 80000 + panda bear 1864 + polar bear 22000 + koala marsupial 80000 >>> for label, content in df.items(): ... print('label:', label) ... print('content:', content, sep='\n') @@ -889,10 +897,10 @@ def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield k, self._ixs(i, axis=1) @Appender(_shared_docs["items"]) - def iteritems(self): + def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield from self.items() - def iterrows(self): + def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]: """ Iterate over DataFrame rows as (index, Series) pairs. @@ -973,7 +981,8 @@ def itertuples(self, index=True, name="Pandas"): ----- The column names will be renamed to positional names if they are invalid Python identifiers, repeated, or start with an underscore. - With a large number of columns (>255), regular tuples are returned. + On python versions < 3.7 regular tuples are returned for DataFrames + with a large number of columns (>254). 
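As a quick, illustrative sketch of the itertuples note this hunk rewrites (the 300-column frame and the c0..c299 names are made up, and the behaviour described assumes this patch running on Python >= 3.7):

    import pandas as pd

    # A frame wider than the old 255-argument namedtuple limit.
    wide = pd.DataFrame([[0] * 300], columns=[f"c{i}" for i in range(300)])
    row = next(wide.itertuples(index=True, name="Row"))
    # Namedtuples expose _fields; on interpreters older than 3.7 a frame this
    # wide is expected to fall back to plain tuples instead.
    print(hasattr(row, "_fields"))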
Examples -------- @@ -1016,15 +1025,16 @@ def itertuples(self, index=True, name="Pandas"): # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - # Python 3 supports at most 255 arguments to constructor - if name is not None and len(self.columns) + index < 256: + # Python versions before 3.7 support at most 255 arguments to constructors + can_return_named_tuples = PY37 or len(self.columns) + index < 255 + if name is not None and can_return_named_tuples: itertuple = collections.namedtuple(name, fields, rename=True) return map(itertuple._make, zip(*arrays)) # fallback to regular tuples return zip(*arrays) - def __len__(self): + def __len__(self) -> int: """ Returns length of info axis, but here we use the index. """ @@ -1122,8 +1132,7 @@ def dot(self, other): rvals = np.asarray(other) if lvals.shape[1] != rvals.shape[0]: raise ValueError( - "Dot product shape mismatch, " - "{s} vs {r}".format(s=lvals.shape, r=rvals.shape) + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" ) if isinstance(other, DataFrame): @@ -1139,7 +1148,7 @@ def dot(self, other): else: return Series(result, index=left.index) else: # pragma: no cover - raise TypeError("unsupported type: {oth}".format(oth=type(other))) + raise TypeError(f"unsupported type: {type(other)}") def __matmul__(self, other): """ @@ -1157,7 +1166,7 @@ def __rmatmul__(self, other): # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient="columns", dtype=None, columns=None): + def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame": """ Construct DataFrame from dict of array-like or dicts. @@ -1237,7 +1246,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -1427,7 +1436,7 @@ def to_dict(self, orient="dict", into=dict): for t in self.itertuples(name=None) ) else: - raise ValueError("orient '{o}' not understood".format(o=orient)) + raise ValueError(f"orient '{orient}' not understood") def to_gbq( self, @@ -1441,9 +1450,7 @@ def to_gbq( location=None, progress_bar=True, credentials=None, - verbose=None, - private_key=None, - ): + ) -> None: """ Write a DataFrame to a Google BigQuery table. @@ -1471,7 +1478,7 @@ def to_gbq( Behavior when the destination table exists. Value can be one of: ``'fail'`` - If table exists, do nothing. + If table exists raise pandas_gbq.gbq.TableCreationError. ``'replace'`` If table exists, drop it, recreate it, and insert data. ``'append'`` @@ -1517,21 +1524,6 @@ def to_gbq( *New in version 0.8.0 of pandas-gbq*. .. versionadded:: 0.24.0 - verbose : bool, deprecated - Deprecated in pandas-gbq version 0.4.0. Use the `logging module - to adjust verbosity instead - `__. - private_key : str, deprecated - Deprecated in pandas-gbq version 0.8.0. Use the ``credentials`` - parameter and - :func:`google.oauth2.service_account.Credentials.from_service_account_info` - or - :func:`google.oauth2.service_account.Credentials.from_service_account_file` - instead. - - Service account private key in JSON format. Can be file path - or string contents. This is useful for remote server - authentication (eg. Jupyter/IPython notebook on remote host). 
See Also -------- @@ -1552,8 +1544,6 @@ def to_gbq( location=location, progress_bar=progress_bar, credentials=credentials, - verbose=verbose, - private_key=private_key, ) @classmethod @@ -1565,7 +1555,7 @@ def from_records( columns=None, coerce_float=False, nrows=None, - ): + ) -> "DataFrame": """ Convert structured or record ndarray to DataFrame. @@ -1688,8 +1678,8 @@ def from_records( return cls(mgr) def to_records( - self, index=True, convert_datetime64=None, column_dtypes=None, index_dtypes=None - ): + self, index=True, column_dtypes=None, index_dtypes=None + ) -> np.recarray: """ Convert DataFrame to a NumPy record array. @@ -1701,11 +1691,6 @@ def to_records( index : bool, default True Include index in resulting record array, stored in 'index' field or using the index label, if set. - convert_datetime64 : bool, default None - .. deprecated:: 0.23.0 - - Whether to convert the index to datetime.datetime if it is a - DatetimeIndex. column_dtypes : str, type, dict, default None .. versionadded:: 0.24.0 @@ -1774,30 +1759,18 @@ def to_records( rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], dtype=[('I', 'S2'), ('A', '>> index_dtypes = ">> index_dtypes = f">> df.to_records(index_dtypes=index_dtypes) rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], dtype=[('I', 'S1'), ('A', '` - instead. - :meth:`DataFrame.from_dict(OrderedDict(items)) ` - may be used to preserve the key order. - - Convert (key, value) pairs to DataFrame. The keys will be the axis - index (usually the columns, but depends on the specified - orientation). The values should be arrays or Series. - - Parameters - ---------- - items : sequence of (key, value) pairs - Values should be arrays or Series. - columns : sequence of column labels, optional - Must be passed if orient='index'. - orient : {'columns', 'index'}, default 'columns' - The "orientation" of the data. If the keys of the - input correspond to column labels, pass 'columns' - (default). Otherwise if the keys correspond to the index, - pass 'index'. - - Returns - ------- - DataFrame - """ - - warnings.warn( - "from_items is deprecated. Please use " - "DataFrame.from_dict(dict(items), ...) instead. 
" - "DataFrame.from_dict(OrderedDict(items)) may be used to " - "preserve the key order.", - FutureWarning, - stacklevel=2, - ) - - keys, values = zip(*items) - - if orient == "columns": - if columns is not None: - columns = ensure_index(columns) - - idict = dict(items) - if len(idict) < len(items): - if not columns.equals(ensure_index(keys)): - raise ValueError( - "With non-unique item names, passed " - "columns must be identical" - ) - arrays = values - else: - arrays = [idict[k] for k in columns if k in idict] - else: - columns = ensure_index(keys) - arrays = values - - # GH 17312 - # Provide more informative error msg when scalar values passed - try: - return cls._from_arrays(arrays, columns, None) - - except ValueError: - if not is_nested_list_like(values): - raise ValueError( - "The value in each (key, value) pair " - "must be an array, Series, or dict" - ) - - elif orient == "index": - if columns is None: - raise TypeError("Must pass columns with orient='index'") - - keys = ensure_index(keys) - - # GH 17312 - # Provide more informative error msg when scalar values passed - try: - arr = np.array(values, dtype=object).T - data = [lib.maybe_convert_objects(v) for v in arr] - return cls._from_arrays(data, columns, keys) - - except TypeError: - if not is_nested_list_like(values): - raise ValueError( - "The value in each (key, value) pair " - "must be an array, Series, or dict" - ) - - else: # pragma: no cover - raise ValueError("'orient' must be either 'columns' or 'index'") - - @classmethod - def _from_arrays(cls, arrays, columns, index, dtype=None): + def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame": mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) + @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( self, - fname, + path, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, @@ -1996,11 +1869,16 @@ def to_stata( Parameters ---------- - fname : str, buffer or path object + path : str, buffer or path object String, path object (pathlib.Path or py._path.local.LocalPath) or object implementing a binary write() function. If using a buffer then the buffer will not be automatically closed after the file data has been written. + + .. versionchanged:: 1.0.0 + + Previously this was "fname" + convert_dates : dict Dictionary mapping columns containing datetime types to stata internal format to use when writing the dates. Options are 'tc', @@ -2010,8 +1888,6 @@ def to_stata( a datetime column has timezone information. write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Unicode is not supported. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder`. 
time_stamp : datetime @@ -2064,19 +1940,22 @@ def to_stata( >>> df.to_stata('animals.dta') # doctest: +SKIP """ kwargs = {} - if version not in (114, 117): - raise ValueError("Only formats 114 and 117 supported.") + if version not in (114, 117, 118): + raise ValueError("Only formats 114, 117 and 118 are supported.") if version == 114: if convert_strl is not None: - raise ValueError("strl support is only available when using format 117") + raise ValueError("strl is not supported in format 114") from pandas.io.stata import StataWriter as statawriter else: - from pandas.io.stata import StataWriter117 as statawriter + if version == 117: + from pandas.io.stata import StataWriter117 as statawriter + else: + from pandas.io.stata import StataWriter118 as statawriter kwargs["convert_strl"] = convert_strl writer = statawriter( - fname, + path, self, convert_dates=convert_dates, byteorder=byteorder, @@ -2084,32 +1963,64 @@ def to_stata( data_label=data_label, write_index=write_index, variable_labels=variable_labels, - **kwargs + **kwargs, ) writer.write_file() - def to_feather(self, fname): + @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") + def to_feather(self, path) -> None: """ Write out the binary feather-format for DataFrames. Parameters ---------- - fname : str + path : str String file path. """ from pandas.io.feather_format import to_feather - to_feather(self, fname) + to_feather(self, path) + + @Appender( + """ + Examples + -------- + >>> df = pd.DataFrame( + ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} + ... ) + >>> print(df.to_markdown()) + | | animal_1 | animal_2 | + |---:|:-----------|:-----------| + | 0 | elk | dog | + | 1 | pig | quetzal | + """ + ) + @Substitution(klass="DataFrame") + @Appender(_shared_docs["to_markdown"]) + def to_markdown( + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + ) -> Optional[str]: + kwargs.setdefault("headers", "keys") + kwargs.setdefault("tablefmt", "pipe") + tabulate = import_optional_dependency("tabulate") + result = tabulate.tabulate(self, **kwargs) + if buf is None: + return result + buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) + assert buf is not None # Help mypy. + buf.writelines(result) + return None + @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, - fname, + path, engine="auto", compression="snappy", index=None, partition_cols=None, - **kwargs - ): + **kwargs, + ) -> None: """ Write a DataFrame to the binary parquet format. @@ -2122,11 +2033,13 @@ def to_parquet( Parameters ---------- - fname : str + path : str File path or Root Directory path. Will be used as Root Directory path while writing a partitioned dataset. - .. versionchanged:: 0.24.0 + .. versionchanged:: 1.0.0 + + Previously this was "fname" engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' Parquet library to use. If 'auto', then the option @@ -2183,12 +2096,12 @@ def to_parquet( to_parquet( self, - fname, + path, engine, compression=compression, index=index, partition_cols=partition_cols, - **kwargs + **kwargs, ) @Substitution( @@ -2242,9 +2155,10 @@ def to_html( A ``border=border`` attribute is included in the opening `` tag. Default ``pd.options.display.html.border``. encoding : str, default "utf-8" - Set character encoding + Set character encoding. .. versionadded:: 1.0 + table_id : str, optional A css id is included in the opening `
` tag if specified. @@ -2297,7 +2211,7 @@ def to_html( def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None - ): + ) -> None: """ Print a concise summary of a DataFrame. @@ -2370,9 +2284,11 @@ def info( RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): - int_col 5 non-null int64 - text_col 5 non-null object - float_col 5 non-null float64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 248.0+ bytes @@ -2411,9 +2327,11 @@ def info( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - column_1 1000000 non-null object - column_2 1000000 non-null object - column_3 1000000 non-null object + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB @@ -2421,9 +2339,11 @@ def info( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - column_1 1000000 non-null object - column_2 1000000 non-null object - column_3 1000000 non-null object + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 188.8 MB """ @@ -2437,11 +2357,12 @@ def info( lines.append(self.index._summary()) if len(self.columns) == 0: - lines.append("Empty {name}".format(name=type(self).__name__)) + lines.append(f"Empty {type(self).__name__}") fmt.buffer_put_lines(buf, lines) return cols = self.columns + col_count = len(self.columns) # hack if max_cols is None: @@ -2450,39 +2371,76 @@ def info( max_rows = get_option("display.max_info_rows", len(self) + 1) if null_counts is None: - show_counts = (len(self.columns) <= max_cols) and (len(self) < max_rows) + show_counts = (col_count <= max_cols) and (len(self) < max_rows) else: show_counts = null_counts - exceeds_info_cols = len(self.columns) > max_cols + exceeds_info_cols = col_count > max_cols def _verbose_repr(): - lines.append("Data columns (total %d columns):" % len(self.columns)) - space = max(len(pprint_thing(k)) for k in self.columns) + 4 + lines.append(f"Data columns (total {len(self.columns)} columns):") + + id_head = " # " + column_head = "Column" + col_space = 2 + + max_col = max(len(pprint_thing(k)) for k in cols) + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space + + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space counts = None - tmpl = "{count}{dtype}" + header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: counts = self.count() if len(cols) != len(counts): # pragma: no cover raise AssertionError( - "Columns must equal counts " - "({cols:d} != {counts:d})".format( - cols=len(cols), counts=len(counts) - ) + f"Columns must equal counts ({len(cols)} != {len(counts)})" ) - tmpl = "{count} non-null {dtype}" + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + + 
dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) + + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, space) + + _put_str("-" * len_count, space_count) + + _put_str("-" * len_dtype, space_dtype) + ) - dtypes = self.dtypes for i, col in enumerate(self.columns): - dtype = dtypes.iloc[i] + dtype = self.dtypes.iloc[i] col = pprint_thing(col) + line_no = _put_str(" {num}".format(num=i), space_num) count = "" if show_counts: count = counts.iloc[i] lines.append( - _put_str(col, space) + tmpl.format(count=count, dtype=dtype) + line_no + + _put_str(col, space) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) ) def _non_verbose_repr(): @@ -2492,13 +2450,9 @@ def _sizeof_fmt(num, size_qualifier): # returns size in human readable format for x in ["bytes", "KB", "MB", "GB", "TB"]: if num < 1024.0: - return "{num:3.1f}{size_q} {x}".format( - num=num, size_q=size_qualifier, x=x - ) + return f"{num:3.1f}{size_qualifier} {x}" num /= 1024.0 - return "{num:3.1f}{size_q} {pb}".format( - num=num, size_q=size_qualifier, pb="PB" - ) + return f"{num:3.1f}{size_qualifier} PB" if verbose: _verbose_repr() @@ -2511,8 +2465,8 @@ def _sizeof_fmt(num, size_qualifier): _verbose_repr() counts = self._data.get_dtype_counts() - dtypes = ["{k}({kk:d})".format(k=k[0], kk=k[1]) for k in sorted(counts.items())] - lines.append("dtypes: {types}".format(types=", ".join(dtypes))) + dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] + lines.append(f"dtypes: {', '.join(dtypes)}") if memory_usage is None: memory_usage = get_option("display.memory_usage") @@ -2529,15 +2483,10 @@ def _sizeof_fmt(num, size_qualifier): if "object" in counts or self.index._is_memory_usage_qualified(): size_qualifier = "+" mem_usage = self.memory_usage(index=True, deep=deep).sum() - lines.append( - "memory usage: {mem}\n".format( - mem=_sizeof_fmt(mem_usage, size_qualifier) - ) - ) - + lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") fmt.buffer_put_lines(buf, lines) - def memory_usage(self, index=True, deep=False): + def memory_usage(self, index=True, deep=False) -> Series: """ Return the memory usage of each column in bytes. @@ -2631,7 +2580,7 @@ def memory_usage(self, index=True, deep=False): ) return result - def transpose(self, *args, **kwargs): + def transpose(self, *args, copy: bool = False) -> "DataFrame": """ Transpose index and columns. @@ -2641,12 +2590,14 @@ def transpose(self, *args, **kwargs): Parameters ---------- + *args : tuple, optional + Accepted for compatibility with NumPy. copy : bool, default False - If True, the underlying data is copied. Otherwise (default), no - copy is made if possible. - *args, **kwargs - Additional keywords have no effect but might be accepted for - compatibility with numpy. + Whether to copy the data after transposing, even for DataFrames + with a single dtype. + + Note that a copy is always required for mixed dtype DataFrames, + or for DataFrames with any extension types. 
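A minimal sketch of the homogeneous extension-dtype behaviour this change describes for transpose (it assumes the nullable Int64 dtype is available; column names and values are arbitrary):

    import pandas as pd

    df = pd.DataFrame({"a": pd.array([1, None], dtype="Int64"),
                       "b": pd.array([3, 4], dtype="Int64")})
    print(df.dtypes.tolist())    # both columns are Int64
    # With a single shared extension dtype, the transposed frame is expected
    # to keep that dtype rather than falling back to object.
    print(df.T.dtypes.tolist())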
Returns ------- @@ -2727,41 +2678,31 @@ def transpose(self, *args, **kwargs): dtype: object """ nv.validate_transpose(args, dict()) - return super().transpose(1, 0, **kwargs) + # construct the args - T = property(transpose) + dtypes = list(self.dtypes) + if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]): + # We have EAs with the same dtype. We can preserve that dtype in transpose. + dtype = dtypes[0] + arr_type = dtype.construct_array_type() + values = self.values - # ---------------------------------------------------------------------- - # Picklability + new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] + result = self._constructor( + dict(zip(self.index, new_values)), index=self.columns + ) - # legacy pickle formats - def _unpickle_frame_compat(self, state): # pragma: no cover - if len(state) == 2: # pragma: no cover - series, idx = state - columns = sorted(series) else: - series, cols, idx = state - columns = com._unpickle_array(cols) - - index = com._unpickle_array(idx) - self._data = self._init_dict(series, index, columns, None) - - def _unpickle_matrix_compat(self, state): # pragma: no cover - # old unpickling - (vals, idx, cols), object_state = state - - index = com._unpickle_array(idx) - dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols), copy=False) - - if object_state is not None: - ovals, _, ocols = object_state - objects = DataFrame( - ovals, index=index, columns=com._unpickle_array(ocols), copy=False + new_values = self.values.T + if copy: + new_values = new_values.copy() + result = self._constructor( + new_values, index=self.columns, columns=self.index ) - dm = dm.join(objects) + return result.__finalize__(self) - self._data = dm._data + T = property(transpose) # ---------------------------------------------------------------------- # Indexing Methods @@ -2779,7 +2720,6 @@ def _ixs(self, i: int, axis: int = 0): """ # irow if axis == 0: - label = self.index[i] new_values = self._data.fast_xs(i) # if we are a copy, mark as such @@ -2883,7 +2823,7 @@ def _getitem_bool_array(self, key): ) elif len(key) != len(self.index): raise ValueError( - "Item wrong length %d instead of %d." % (len(key), len(self.index)) + f"Item wrong length {len(key)} instead of {len(self.index)}." ) # check_bool_indexer will throw exception if Series key cannot @@ -2994,7 +2934,7 @@ def _setitem_array(self, key, value): if com.is_bool_indexer(key): if len(key) != len(self.index): raise ValueError( - "Item wrong length %d instead of %d!" % (len(key), len(self.index)) + f"Item wrong length {len(key)} instead of {len(self.index)}!" ) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] @@ -3132,18 +3072,27 @@ def query(self, expr, inplace=False, **kwargs): Parameters ---------- expr : str - The query string to evaluate. You can refer to variables + The query string to evaluate. + + You can refer to variables in the environment by prefixing them with an '@' character like ``@a + b``. - .. versionadded:: 0.25.0 - - You can refer to column names that contain spaces by surrounding - them in backticks. + You can refer to column names that contain spaces or operators by + surrounding them in backticks. This way you can also escape + names that start with a digit, or those that are a Python keyword. + Basically when it is not valid Python identifier. See notes down + for more details. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. + .. 
versionadded:: 0.25.0 + Backtick quoting introduced. + + .. versionadded:: 1.0.0 + Expanding functionality of backtick quoting for more than only spaces. + inplace : bool Whether the query should modify the data in place or return a modified copy. @@ -3198,6 +3147,32 @@ def query(self, expr, inplace=False, **kwargs): For further details and examples see the ``query`` documentation in :ref:`indexing `. + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a Python valid identifier. + This can lead to the following problems. + + During parsing a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed as a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. + Examples -------- >>> df = pd.DataFrame({'A': range(1, 6), @@ -3234,8 +3209,8 @@ def query(self, expr, inplace=False, **kwargs): """ inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): - msg = "expr must be a string to be evaluated, {0} given" - raise ValueError(msg.format(type(expr))) + msg = f"expr must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) kwargs["level"] = kwargs.pop("level", 0) + 1 kwargs["target"] = None res = self.eval(expr, **kwargs) @@ -3268,7 +3243,7 @@ def eval(self, expr, inplace=False, **kwargs): If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, a new DataFrame is returned. - kwargs : dict + **kwargs See the documentation for :func:`eval` for complete details on the keyword arguments accepted by :meth:`~pandas.DataFrame.query`. @@ -3347,14 +3322,15 @@ def eval(self, expr, inplace=False, **kwargs): kwargs["level"] = kwargs.pop("level", 0) + 1 if resolvers is None: index_resolvers = self._get_index_resolvers() - column_resolvers = self._get_space_character_free_column_resolvers() + column_resolvers = self._get_cleaned_column_resolvers() resolvers = column_resolvers, index_resolvers if "target" not in kwargs: kwargs["target"] = self kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) + return _eval(expr, inplace=inplace, **kwargs) - def select_dtypes(self, include=None, exclude=None): + def select_dtypes(self, include=None, exclude=None) -> "DataFrame": """ Return a subset of the DataFrame's columns based on the column dtypes. @@ -3452,11 +3428,7 @@ def select_dtypes(self, include=None, exclude=None): # can't both include AND exclude! 
if not include.isdisjoint(exclude): - raise ValueError( - "include and exclude overlap on {inc_ex}".format( - inc_ex=(include & exclude) - ) - ) + raise ValueError(f"include and exclude overlap on {(include & exclude)}") # We raise when both include and exclude are empty # Hence, we can just shrink the columns we want to keep @@ -3488,7 +3460,7 @@ def extract_unique_dtypes_from_dtypes_set( return self.iloc[:, keep_these.values] - def insert(self, loc, column, value, allow_duplicates=False): + def insert(self, loc, column, value, allow_duplicates=False) -> None: """ Insert column into DataFrame at specified location. @@ -3508,7 +3480,7 @@ def insert(self, loc, column, value, allow_duplicates=False): value = self._sanitize_column(column, value, broadcast=False) self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) - def assign(self, **kwargs): + def assign(self, **kwargs) -> "DataFrame": r""" Assign new columns to a DataFrame. @@ -3534,16 +3506,12 @@ def assign(self, **kwargs): Notes ----- Assigning multiple columns within the same ``assign`` is possible. - For Python 3.6 and above, later items in '\*\*kwargs' may refer to - newly created or modified columns in 'df'; items are computed and - assigned into 'df' in order. For Python 3.5 and below, the order of - keyword arguments is not specified, you cannot refer to newly created - or modified columns. All items are computed first, and then assigned - in alphabetical order. + Later items in '\*\*kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in order. .. versionchanged:: 0.23.0 - Keyword argument order is maintained for Python 3.6 and later. + Keyword argument order is maintained. Examples -------- @@ -3569,9 +3537,8 @@ def assign(self, **kwargs): Portland 17.0 62.6 Berkeley 25.0 77.0 - In Python 3.6+, you can create multiple columns within the same assign - where one of the columns depends on another one defined within the same - assign: + You can create multiple columns within the same assign where one + of the columns depends on another one defined within the same assign: >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) @@ -3581,21 +3548,8 @@ def assign(self, **kwargs): """ data = self.copy() - # >= 3.6 preserve order of kwargs - if PY36: - for k, v in kwargs.items(): - data[k] = com.apply_if_callable(v, data) - else: - # <= 3.5: do all calculations first... - results = OrderedDict() - for k, v in kwargs.items(): - results[k] = com.apply_if_callable(v, data) - - # <= 3.5 and earlier - results = sorted(results.items()) - # ... and then assign - for k, v in results: - data[k] = v + for k, v in kwargs.items(): + data[k] = com.apply_if_callable(v, data) return data def _sanitize_column(self, key, value, broadcast=True): @@ -3690,7 +3644,7 @@ def reindexer(value): value = maybe_cast_to_datetime(value, infer_dtype) # return internal types directly - if is_extension_type(value) or is_extension_array_dtype(value): + if is_extension_array_dtype(value): return value # broadcast across multiple columns if necessary @@ -3709,7 +3663,7 @@ def _series(self): for idx, item in enumerate(self.columns) } - def lookup(self, row_labels, col_labels): + def lookup(self, row_labels, col_labels) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. 
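The hunk below drops the old "Akin to" note from DataFrame.lookup; as a small, hedged reminder of what lookup does (labels and values here are invented):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]},
                      index=["x", "y", "z"])
    # One value is picked per (row label, column label) pair.
    print(df.lookup(["x", "z"], ["B", "A"]))  # expected: [10  3]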
@@ -3727,13 +3681,6 @@ def lookup(self, row_labels, col_labels): ------- numpy.ndarray - Notes - ----- - Akin to:: - - result = [df.get_value(row, col) - for row, col in zip(row_labels, col_labels)] - Examples -------- values : ndarray @@ -3824,7 +3771,7 @@ def _reindex_columns( allow_dups=False, ) - def _reindex_multi(self, axes, copy, fill_value): + def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": """ We are guaranteed non-Nones in the axes. """ @@ -3858,7 +3805,7 @@ def align( limit=None, fill_axis=0, broadcast_axis=None, - ): + ) -> "DataFrame": return super().align( other, join=join, @@ -3885,13 +3832,13 @@ def align( ("tolerance", None), ], ) - def reindex(self, *args, **kwargs): + def reindex(self, *args, **kwargs) -> "DataFrame": axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names kwargs.pop("axis", None) kwargs.pop("labels", None) - return super().reindex(**kwargs) + return self._ensure_type(super().reindex(**kwargs)) def drop( self, @@ -4039,7 +3986,19 @@ def drop( "mapper", [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")], ) - def rename(self, *args, **kwargs): + def rename( + self, + mapper: Optional[Renamer] = None, + *, + index: Optional[Renamer] = None, + columns: Optional[Renamer] = None, + axis: Optional[Axis] = None, + copy: bool = True, + inplace: bool = False, + level: Optional[Level] = None, + errors: str = "ignore", + ) -> Optional["DataFrame"]: + """ Alter axes labels. @@ -4148,12 +4107,16 @@ def rename(self, *args, **kwargs): 2 2 5 4 3 6 """ - axes = validate_axis_style_args(self, args, kwargs, "mapper", "rename") - kwargs.update(axes) - # Pop these, since the values are in `kwargs` under different names - kwargs.pop("axis", None) - kwargs.pop("mapper", None) - return super().rename(**kwargs) + return super().rename( + mapper=mapper, + index=index, + columns=columns, + axis=axis, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.fillna.__doc__) @@ -4165,8 +4128,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs - ): + ) -> Optional["DataFrame"]: return super().fillna( value=value, method=method, @@ -4174,7 +4136,6 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs ) @Appender(_shared_docs["replace"] % _shared_doc_kwargs) @@ -4197,9 +4158,9 @@ def replace( ) @Appender(_shared_docs["shift"] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0, fill_value=None): - return super().shift( - periods=periods, freq=freq, axis=axis, fill_value=fill_value + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": + return self._ensure_type( + super().shift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) ) def set_index( @@ -4304,7 +4265,7 @@ def set_index( "one-dimensional arrays." ) - missing = [] + missing: List[Optional[Hashable]] = [] for col in keys: if isinstance( col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator) @@ -4318,15 +4279,13 @@ def set_index( try: found = col in self.columns except TypeError: - raise TypeError( - err_msg + " Received column of type {}".format(type(col)) - ) + raise TypeError(f"{err_msg}. 
Received column of type {type(col)}") else: if not found: missing.append(col) if missing: - raise KeyError("None of {} are in the columns".format(missing)) + raise KeyError(f"None of {missing} are in the columns") if inplace: frame = self @@ -4336,14 +4295,14 @@ def set_index( arrays = [] names = [] if append: - names = [x for x in self.index.names] + names = list(self.index.names) if isinstance(self.index, ABCMultiIndex): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) else: arrays.append(self.index) - to_remove = [] + to_remove: List[Optional[Hashable]] = [] for col in keys: if isinstance(col, ABCMultiIndex): for n in range(col.nlevels): @@ -4370,17 +4329,15 @@ def set_index( # check newest element against length of calling frame, since # ensure_index_from_sequences would not raise for append=False. raise ValueError( - "Length mismatch: Expected {len_self} rows, " - "received array of length {len_col}".format( - len_self=len(self), len_col=len(arrays[-1]) - ) + f"Length mismatch: Expected {len(self)} rows, " + f"received array of length {len(arrays[-1])}" ) index = ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: duplicates = index[index.duplicated()].unique() - raise ValueError("Index has duplicate keys: {dup}".format(dup=duplicates)) + raise ValueError(f"Index has duplicate keys: {duplicates}") # use set to handle duplicate column names gracefully in case of drop for c in set(to_remove): @@ -4395,8 +4352,13 @@ def set_index( return frame def reset_index( - self, level=None, drop=False, inplace=False, col_level=0, col_fill="" - ): + self, + level: Optional[Union[Hashable, Sequence[Hashable]]] = None, + drop: bool = False, + inplace: bool = False, + col_level: Hashable = 0, + col_fill: Optional[Hashable] = "", + ) -> Optional["DataFrame"]: """ Reset the index, or a level of it. @@ -4424,8 +4386,8 @@ def reset_index( Returns ------- - DataFrame - DataFrame with the new index. + DataFrame or None + DataFrame with the new index or None if ``inplace=True``. 
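A short sketch of the Optional return annotated here: reset_index hands back a new frame by default and None when inplace=True (the data below is made up):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]}, index=pd.Index(["x", "y"], name="idx"))
    print(df.reset_index() is None)              # False: a new DataFrame is returned
    print(df.reset_index(inplace=True) is None)  # True: df was modified in place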
See Also -------- @@ -4574,7 +4536,7 @@ def _maybe_casted_values(index, labels=None): values = values._data if mask.any(): - values, changed = maybe_upcast_putmask(values, mask, np.nan) + values, _ = maybe_upcast_putmask(values, mask, np.nan) if issubclass(values_type, DatetimeLikeArray): values = values_type(values, dtype=values_dtype) @@ -4590,10 +4552,11 @@ def _maybe_casted_values(index, labels=None): new_index = self.index.droplevel(level) if not drop: + to_insert: Iterable[Tuple[Any, Optional[Any]]] if isinstance(self.index, ABCMultiIndex): names = [ - n if n is not None else ("level_%d" % i) - for (i, n) in enumerate(self.index.names) + (n if n is not None else f"level_{i}") + for i, n in enumerate(self.index.names) ] to_insert = zip(self.index.levels, self.index.codes) else: @@ -4612,8 +4575,7 @@ def _maybe_casted_values(index, labels=None): if len(col_name) not in (1, self.columns.nlevels): raise ValueError( "col_fill=None is incompatible " - "with incomplete column name " - "{}".format(name) + f"with incomplete column name {name}" ) col_fill = col_name[0] @@ -4630,23 +4592,25 @@ def _maybe_casted_values(index, labels=None): if not inplace: return new_obj + return None + # ---------------------------------------------------------------------- # Reindex-based selection methods @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): + def isna(self) -> "DataFrame": return super().isna() @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isnull(self): + def isnull(self) -> "DataFrame": return super().isnull() @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): + def notna(self) -> "DataFrame": return super().notna() @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notnull(self): + def notnull(self) -> "DataFrame": return super().notnull() def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): @@ -4665,7 +4629,7 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): * 0, or 'index' : Drop rows which contain missing values. * 1, or 'columns' : Drop columns which contain missing value. - .. deprecated:: 0.23.0 + .. versionchanged:: 1.0.0 Pass tuple or list to drop on multiple axes. Only a single axis is allowed. @@ -4755,50 +4719,48 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): inplace = validate_bool_kwarg(inplace, "inplace") if isinstance(axis, (tuple, list)): # GH20987 - msg = ( - "supplying multiple axes to axis is deprecated and " - "will be removed in a future version." 
- ) - warnings.warn(msg, FutureWarning, stacklevel=2) + raise TypeError("supplying multiple axes to axis is no longer supported.") - result = self - for ax in axis: - result = result.dropna(how=how, thresh=thresh, subset=subset, axis=ax) + axis = self._get_axis_number(axis) + agg_axis = 1 - axis + + agg_obj = self + if subset is not None: + ax = self._get_axis(agg_axis) + indices = ax.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(list(np.compress(check, subset))) + agg_obj = self.take(indices, axis=agg_axis) + + count = agg_obj.count(axis=agg_axis) + + if thresh is not None: + mask = count >= thresh + elif how == "any": + mask = count == len(agg_obj._get_axis(agg_axis)) + elif how == "all": + mask = count > 0 else: - axis = self._get_axis_number(axis) - agg_axis = 1 - axis - - agg_obj = self - if subset is not None: - ax = self._get_axis(agg_axis) - indices = ax.get_indexer_for(subset) - check = indices == -1 - if check.any(): - raise KeyError(list(np.compress(check, subset))) - agg_obj = self.take(indices, axis=agg_axis) - - count = agg_obj.count(axis=agg_axis) - - if thresh is not None: - mask = count >= thresh - elif how == "any": - mask = count == len(agg_obj._get_axis(agg_axis)) - elif how == "all": - mask = count > 0 + if how is not None: + raise ValueError(f"invalid how option: {how}") else: - if how is not None: - raise ValueError("invalid how option: {h}".format(h=how)) - else: - raise TypeError("must specify how or thresh") + raise TypeError("must specify how or thresh") - result = self.loc(axis=axis)[mask] + result = self.loc(axis=axis)[mask] if inplace: self._update_inplace(result) else: return result - def drop_duplicates(self, subset=None, keep="first", inplace=False): + def drop_duplicates( + self, + subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, + keep: Union[str, bool] = "first", + inplace: bool = False, + ignore_index: bool = False, + ) -> Optional["DataFrame"]: """ Return DataFrame with duplicate rows removed. @@ -4817,10 +4779,15 @@ def drop_duplicates(self, subset=None, keep="first", inplace=False): - False : Drop all duplicates. inplace : bool, default False Whether to drop duplicates in place or to return a copy. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 Returns ------- DataFrame + DataFrame with duplicates removed or None if ``inplace=True``. """ if self.empty: return self.copy() @@ -4829,13 +4796,26 @@ def drop_duplicates(self, subset=None, keep="first", inplace=False): duplicated = self.duplicated(subset, keep=keep) if inplace: - inds, = (-duplicated)._ndarray_values.nonzero() + (inds,) = (-duplicated)._ndarray_values.nonzero() new_data = self._data.take(inds) + + if ignore_index: + new_data.axes[1] = ibase.default_index(len(inds)) self._update_inplace(new_data) else: - return self[-duplicated] + result = self[-duplicated] + + if ignore_index: + result.index = ibase.default_index(len(result)) + return result - def duplicated(self, subset=None, keep="first"): + return None + + def duplicated( + self, + subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, + keep: Union[str, bool] = "first", + ) -> "Series": """ Return boolean Series denoting duplicate rows. 
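A minimal sketch of the new ignore_index flag on drop_duplicates (values are arbitrary; behaviour assumes this patch is applied):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2]})
    print(df.drop_duplicates().index.tolist())                   # [0, 2]
    # With ignore_index=True the result is relabeled 0..n-1.
    print(df.drop_duplicates(ignore_index=True).index.tolist())  # [0, 1]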
@@ -4879,6 +4859,9 @@ def f(vals): ): subset = (subset,) + # needed for mypy since can't narrow types using np.iterable + subset = cast(Iterable, subset) + # Verify all columns in subset exist in the queried dataframe # Otherwise, raise a KeyError, same as if you try to __getitem__ with a # key that doesn't exist. @@ -4905,6 +4888,7 @@ def sort_values( inplace=False, kind="quicksort", na_position="last", + ignore_index=False, ): inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) @@ -4913,8 +4897,7 @@ def sort_values( by = [by] if is_sequence(ascending) and len(by) != len(ascending): raise ValueError( - "Length of ascending (%d) != length of by (%d)" - % (len(ascending), len(by)) + f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" ) if len(by) > 1: from pandas.core.sorting import lexsort_indexer @@ -4939,6 +4922,9 @@ def sort_values( indexer, axis=self._get_block_manager_axis(axis), verify=False ) + if ignore_index: + new_data.axes[1] = ibase.default_index(len(indexer)) + if inplace: return self._update_inplace(new_data) else: @@ -4955,24 +4941,13 @@ def sort_index( kind="quicksort", na_position="last", sort_remaining=True, - by=None, + ignore_index: bool = False, ): # TODO: this can be combined with Series.sort_index impl as # almost identical inplace = validate_bool_kwarg(inplace, "inplace") - # 10726 - if by is not None: - warnings.warn( - "by argument to sort_index is deprecated, " - "please use .sort_values(by=...)", - FutureWarning, - stacklevel=2, - ) - if level is not None: - raise ValueError("unable to simultaneously sort by and level") - return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace) axis = self._get_axis_number(axis) labels = self._get_axis(axis) @@ -5017,12 +4992,15 @@ def sort_index( # reconstruct axis if needed new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + if ignore_index: + new_data.axes[1] = ibase.default_index(len(indexer)) + if inplace: return self._update_inplace(new_data) else: return self._constructor(new_data).__finalize__(self) - def nlargest(self, n, columns, keep="first"): + def nlargest(self, n, columns, keep="first") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in descending order. @@ -5131,7 +5109,7 @@ def nlargest(self, n, columns, keep="first"): """ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n, columns, keep="first"): + def nsmallest(self, n, columns, keep="first") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in ascending order. @@ -5232,14 +5210,14 @@ def nsmallest(self, n, columns, keep="first"): self, n=n, keep=keep, columns=columns ).nsmallest() - def swaplevel(self, i=-2, j=-1, axis=0): + def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": """ Swap levels i and j in a MultiIndex on a particular axis. Parameters ---------- - i, j : int, str (can be mixed) - Level of index to be swapped. Can pass level name as string. + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. Returns ------- @@ -5254,7 +5232,7 @@ def swaplevel(self, i=-2, j=-1, axis=0): result.columns = result.columns.swaplevel(i, j) return result - def reorder_levels(self, order, axis=0): + def reorder_levels(self, order, axis=0) -> "DataFrame": """ Rearrange index levels using input order. May not drop or duplicate levels. 
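Likewise for the ignore_index keyword added to sort_values and sort_index; a small sketch with made-up data:

    import pandas as pd

    df = pd.DataFrame({"a": [3, 1, 2]})
    print(df.sort_values("a").index.tolist())                    # [1, 2, 0]
    # ignore_index=True discards the original labels after sorting.
    print(df.sort_values("a", ignore_index=True).index.tolist()) # [0, 1, 2]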
@@ -5268,7 +5246,7 @@ def reorder_levels(self, order, axis=0): Returns ------- - type of caller (new object) + DataFrame """ axis = self._get_axis_number(axis) if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover @@ -5286,7 +5264,7 @@ def reorder_levels(self, order, axis=0): # Arithmetic / combination related def _combine_frame(self, other, func, fill_value=None, level=None): - this, other = self.align(other, join="outer", level=level, copy=False) + # at this point we have `self._indexed_same(other)` if fill_value is None: # since _arith_op may be called in a loop, avoid function call @@ -5302,14 +5280,15 @@ def _arith_op(left, right): left, right = ops.fill_binop(left, right, fill_value) return func(left, right) - if ops.should_series_dispatch(this, other, func): + if ops.should_series_dispatch(self, other, func): # iterate over columns - new_data = ops.dispatch_to_series(this, other, _arith_op) + new_data = ops.dispatch_to_series(self, other, _arith_op) else: with np.errstate(all="ignore"): - res_values = _arith_op(this.values, other.values) - new_data = dispatch_fill_zeros(func, this.values, other.values, res_values) - return this._construct_result(new_data) + res_values = _arith_op(self.values, other.values) + new_data = dispatch_fill_zeros(func, self.values, other.values, res_values) + + return new_data def _combine_match_index(self, other, func): # at this point we have `self.index.equals(other.index)` @@ -5341,7 +5320,9 @@ def _construct_result(self, result) -> "DataFrame": out.columns = self.columns return out - def combine(self, other, func, fill_value=None, overwrite=True): + def combine( + self, other: "DataFrame", func, fill_value=None, overwrite=True + ) -> "DataFrame": """ Perform column-wise combine with another DataFrame. @@ -5508,7 +5489,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first(self, other): + def combine_first(self, other: "DataFrame") -> "DataFrame": """ Update null elements with value in the same location in `other`. @@ -5584,14 +5565,9 @@ def combiner(x, y): return self.combine(other, combiner, overwrite=False) - @deprecate_kwarg( - old_arg_name="raise_conflict", - new_arg_name="errors", - mapping={False: "ignore", True: "raise"}, - ) def update( self, other, join="left", overwrite=True, filter_func=None, errors="ignore" - ): + ) -> None: """ Modify in place using non-NA values from another DataFrame. @@ -5742,6 +5718,82 @@ def update( # ---------------------------------------------------------------------- # Data reshaping + @Appender( + """ +Examples +-------- +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 'Max Speed': [380., 370., 24., 26.]}) +>>> df + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 +>>> df.groupby(['Animal']).mean() + Max Speed +Animal +Falcon 375.0 +Parrot 25.0 + +**Hierarchical Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, +... 
index=index) +>>> df + Max Speed +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +>>> df.groupby(level=0).mean() + Max Speed +Animal +Falcon 370.0 +Parrot 25.0 +>>> df.groupby(level="Type").mean() + Max Speed +Type +Captive 210.0 +Wild 185.0 +""" + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.DataFrameGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.DataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) _shared_docs[ "pivot" @@ -5850,7 +5902,7 @@ def update( @Substitution("") @Appender(_shared_docs["pivot"]) - def pivot(self, index=None, columns=None, values=None): + def pivot(self, index=None, columns=None, values=None) -> "DataFrame": from pandas.core.reshape.pivot import pivot return pivot(self, index=index, columns=columns, values=values) @@ -5881,13 +5933,13 @@ def pivot(self, index=None, columns=None, values=None): hierarchical columns whose top level are the function names (inferred from the function objects themselves) If dict is passed, the key is column to aggregate and value - is function or list of functions + is function or list of functions. fill_value : scalar, default None - Value to replace missing values with + Value to replace missing values with. margins : bool, default False - Add all row / columns (e.g. for subtotal / grand totals) + Add all row / columns (e.g. for subtotal / grand totals). dropna : bool, default True - Do not include columns whose entries are all NaN + Do not include columns whose entries are all NaN. margins_name : str, default 'All' Name of the row / column that will contain the totals when margins is True. @@ -5901,6 +5953,7 @@ def pivot(self, index=None, columns=None, values=None): Returns ------- DataFrame + An Excel style pivot table. See Also -------- @@ -5996,7 +6049,7 @@ def pivot_table( dropna=True, margins_name="All", observed=False, - ): + ) -> "DataFrame": from pandas.core.reshape.pivot import pivot_table return pivot_table( @@ -6245,6 +6298,8 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame": raise ValueError("columns must be unique") df = self.reset_index(drop=True) + # TODO: use overload to refine return type of reset_index + assert df is not None # needed for mypy result = df[column].explode() result = df.drop([column], axis=1).join(result) result.index = self.index.take(result.index) @@ -6328,7 +6383,6 @@ def unstack(self, level=-1, fill_value=None): %(versionadded)s Parameters ---------- - frame : DataFrame id_vars : tuple, list, or ndarray, optional Column(s) to use as identifier variables. value_vars : tuple, list, or ndarray, optional @@ -6425,7 +6479,7 @@ def melt( var_name=None, value_name="value", col_level=None, - ): + ) -> "DataFrame": from pandas.core.reshape.melt import melt return melt( @@ -6440,7 +6494,7 @@ def melt( # ---------------------------------------------------------------------- # Time series-related - def diff(self, periods=1, axis=0): + def diff(self, periods=1, axis=0) -> "DataFrame": """ First discrete difference of element. 
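Since the shared melt docstring no longer documents a frame parameter, here is a small, hedged reminder of the method form (column names are invented):

    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "x": [10, 20], "y": [30, 40]})
    # Wide columns x/y become (variable, value) rows keyed by id.
    print(df.melt(id_vars="id", value_vars=["x", "y"], var_name="col"))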
@@ -6621,7 +6675,7 @@ def _gotitem( see_also=_agg_summary_and_see_also_doc, examples=_agg_examples_doc, versionadded="\n.. versionadded:: 0.20.0\n", - **_shared_doc_kwargs + **_shared_doc_kwargs, ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): @@ -6648,15 +6702,13 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): agg = aggregate @Appender(_shared_docs["transform"] % _shared_doc_kwargs) - def transform(self, func, axis=0, *args, **kwargs): + def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": axis = self._get_axis_number(axis) if axis == 1: return self.T.transform(func, *args, **kwargs).T return super().transform(func, *args, **kwargs) - def apply( - self, func, axis=0, raw=False, reduce=None, result_type=None, args=(), **kwds - ): + def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ Apply a function along an axis of the DataFrame. @@ -6805,7 +6857,7 @@ def apply( ) return op.get_result() - def applymap(self, func): + def applymap(self, func) -> "DataFrame": """ Apply a function to a Dataframe elementwise. @@ -6874,7 +6926,9 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, verify_integrity=False, sort=None): + def append( + self, other, ignore_index=False, verify_integrity=False, sort=False + ) -> "DataFrame": """ Append rows of `other` to the end of caller, returning a new object. @@ -6888,14 +6942,13 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=None): If True, do not use the index labels. verify_integrity : bool, default False If True, raise ValueError on creating index with duplicates. - sort : bool, default None + sort : bool, default False Sort columns if the columns of `self` and `other` are not aligned. - The default sorting is deprecated and will change to not-sorting - in a future version of pandas. Explicitly pass ``sort=True`` to - silence the warning and sort. Explicitly pass ``sort=False`` to - silence the warning and not sort. .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + + Changed to not sort by default. 
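A quick sketch of the append sort default changed in this hunk (the frames are made up; only the column order matters here):

    import pandas as pd

    left = pd.DataFrame({"b": [1], "a": [2]})
    right = pd.DataFrame({"a": [3], "b": [4]})
    print(left.append(right).columns.tolist())             # expected ['b', 'a']: no sorting by default
    print(left.append(right, sort=True).columns.tolist())  # expected ['a', 'b']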
Returns ------- @@ -6973,40 +7026,36 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=None): other = Series(other) if other.name is None and not ignore_index: raise TypeError( - "Can only append a Series if ignore_index=True" - " or if the Series has a name" + "Can only append a Series if ignore_index=True " + "or if the Series has a name" ) - if other.name is None: - index = None - else: - # other must have the same index name as self, otherwise - # index name will be reset - index = Index([other.name], name=self.index.name) - + index = Index([other.name], name=self.index.name) idx_diff = other.index.difference(self.columns) try: combined_columns = self.columns.append(idx_diff) except TypeError: combined_columns = self.columns.astype(object).append(idx_diff) - other = other.reindex(combined_columns, copy=False) - other = DataFrame( - other.values.reshape((1, len(other))), - index=index, - columns=combined_columns, + other = ( + other.reindex(combined_columns, copy=False) + .to_frame() + .T.infer_objects() + .rename_axis(index.names, copy=False) ) - other = other._convert(datetime=True, timedelta=True) if not self.columns.equals(combined_columns): self = self.reindex(columns=combined_columns) - elif isinstance(other, list) and not isinstance(other[0], DataFrame): - other = DataFrame(other) - if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.reindex(columns=self.columns) + elif isinstance(other, list): + if not other: + pass + elif not isinstance(other[0], DataFrame): + other = DataFrame(other) + if (self.columns.get_indexer(other.columns) >= 0).all(): + other = other.reindex(columns=self.columns) from pandas.core.reshape.concat import concat if isinstance(other, (list, tuple)): - to_concat = [self] + other + to_concat = [self, *other] else: to_concat = [self, other] return concat( @@ -7016,7 +7065,9 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=None): sort=sort, ) - def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False): + def join( + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False + ) -> "DataFrame": """ Join columns of another DataFrame. @@ -7207,7 +7258,7 @@ def merge( copy=True, indicator=False, validate=None, - ): + ) -> "DataFrame": from pandas.core.reshape.merge import merge return merge( @@ -7226,7 +7277,7 @@ def merge( validate=validate, ) - def round(self, decimals=0, *args, **kwargs): + def round(self, decimals=0, *args, **kwargs) -> "DataFrame": """ Round a DataFrame to a variable number of decimal places. @@ -7323,7 +7374,7 @@ def _series_round(s, decimals): if isinstance(decimals, Series): if not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") - new_cols = [col for col in _dict_round(self, decimals)] + new_cols = list(_dict_round(self, decimals)) elif is_integer(decimals): # Dispatch to Series.round new_cols = [_series_round(v, decimals) for _, v in self.items()] @@ -7340,7 +7391,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method="pearson", min_periods=1): + def corr(self, method="pearson", min_periods=1) -> "DataFrame": """ Compute pairwise correlation of columns, excluding NA/null values. 
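The Series branch rewritten in this hunk still requires either a Series name or ignore_index=True; a minimal sketch of the named case (data invented):

    import pandas as pd

    df = pd.DataFrame({"a": [1], "b": [2]})
    row = pd.Series({"a": 3, "b": 4}, name="r1")
    # The Series name becomes the label of the appended row.
    print(df.append(row).index.tolist())  # expected: [0, 'r1']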
@@ -7423,12 +7474,12 @@ def corr(self, method="pearson", min_periods=1): raise ValueError( "method must be either 'pearson', " "'spearman', 'kendall', or a callable, " - "'{method}' was supplied".format(method=method) + f"'{method}' was supplied" ) return self._constructor(correl, index=idx, columns=cols) - def cov(self, min_periods=None): + def cov(self, min_periods=None) -> "DataFrame": """ Compute pairwise covariance of columns, excluding NA/null values. @@ -7538,7 +7589,7 @@ def cov(self, min_periods=None): return self._constructor(baseCov, index=idx, columns=cols) - def corrwith(self, other, axis=0, drop=False, method="pearson"): + def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series: """ Compute pairwise correlation. @@ -7614,9 +7665,9 @@ def c(x): else: raise ValueError( - "Invalid method {method} was passed, " + f"Invalid method {method} was passed, " "valid methods are: 'pearson', 'kendall', " - "'spearman', or callable".format(method=method) + "'spearman', or callable" ) if not drop: @@ -7746,8 +7797,7 @@ def _count_level(self, level, axis=0, numeric_only=False): if not isinstance(count_axis, ABCMultiIndex): raise TypeError( - "Can only count levels on hierarchical " - "{ax}.".format(ax=self._get_axis_name(axis)) + f"Can only count levels on hierarchical {self._get_axis_name(axis)}." ) if frame._is_mixed_type: @@ -7794,6 +7844,43 @@ def _reduce( def f(x): return op(x, axis=axis, skipna=skipna, **kwds) + def _get_data(axis_matters): + if filter_type is None or filter_type == "numeric": + data = self._get_numeric_data() + elif filter_type == "bool": + if axis_matters: + # GH#25101, GH#24434 + data = self._get_bool_data() if axis == 0 else self + else: + data = self._get_bool_data() + else: # pragma: no cover + msg = ( + f"Generating numeric_only data with filter_type {filter_type} " + "not supported." + ) + raise NotImplementedError(msg) + return data + + if numeric_only is not None and axis in [0, 1]: + df = self + if numeric_only is True: + df = _get_data(axis_matters=True) + if axis == 1: + df = df.T + axis = 0 + + out_dtype = "bool" if filter_type == "bool" else None + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager._reduce + res = df._data.reduce(op, axis=1, skipna=skipna, **kwds) + assert isinstance(res, dict) + if len(res): + assert len(res) == max(list(res.keys())) + 1, res.keys() + out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) + out.index = df.columns + return out + if numeric_only is None: values = self.values try: @@ -7804,7 +7891,7 @@ def f(x): # TODO: combine with hasattr(result, 'dtype') further down # hard since we don't have `values` down there. result = np.bool_(result) - except TypeError as err: + except TypeError: # e.g. in nanops trying to convert strs to float # try by-column first @@ -7827,31 +7914,15 @@ def f(x): result = result.iloc[0] return result - if filter_type is None or filter_type == "numeric": - data = self._get_numeric_data() - elif filter_type == "bool": - data = self._get_bool_data() - else: # pragma: no cover - raise NotImplementedError( - "Handling exception with filter_type {f} not" - "implemented.".format(f=filter_type) - ) from err + # TODO: why doesnt axis matter here? 
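# Illustrative sketch, not part of the diff: corrwith (annotated above as returning
# Series) correlates matching columns of two frames pairwise.
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 3, 2, 1]})
df2 = pd.DataFrame({"a": [1, 2, 4, 8], "b": [2, 2, 1, 1]})
df1.corrwith(df2)   # Series indexed by the shared column labels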
+ data = _get_data(axis_matters=False) with np.errstate(all="ignore"): result = f(data.values) labels = data._get_agg_axis(axis) else: if numeric_only: - if filter_type is None or filter_type == "numeric": - data = self._get_numeric_data() - elif filter_type == "bool": - # GH 25101, # GH 24434 - data = self._get_bool_data() if axis == 0 else self - else: # pragma: no cover - msg = ( - "Generating numeric_only data with filter_type {f}" - "not supported.".format(f=filter_type) - ) - raise NotImplementedError(msg) + data = _get_data(axis_matters=True) + values = data.values labels = data._get_agg_axis(axis) else: @@ -7874,7 +7945,7 @@ def f(x): result = Series(result, index=labels) return result - def nunique(self, axis=0, dropna=True): + def nunique(self, axis=0, dropna=True) -> Series: """ Count distinct observations over requested axis. @@ -7914,7 +7985,7 @@ def nunique(self, axis=0, dropna=True): """ return self.apply(Series.nunique, axis=axis, dropna=dropna) - def idxmin(self, axis=0, skipna=True): + def idxmin(self, axis=0, skipna=True) -> Series: """ Return index of first occurrence of minimum over requested axis. @@ -7923,7 +7994,7 @@ def idxmin(self, axis=0, skipna=True): Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -7952,7 +8023,7 @@ def idxmin(self, axis=0, skipna=True): result = [index[i] if i >= 0 else np.nan for i in indices] return Series(result, index=self._get_agg_axis(axis)) - def idxmax(self, axis=0, skipna=True): + def idxmax(self, axis=0, skipna=True) -> Series: """ Return index of first occurrence of maximum over requested axis. @@ -7961,7 +8032,7 @@ def idxmax(self, axis=0, skipna=True): Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -7999,9 +8070,9 @@ def _get_agg_axis(self, axis_num): elif axis_num == 1: return self.index else: - raise ValueError("Axis must be 0 or 1 (got %r)" % axis_num) + raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") - def mode(self, axis=0, numeric_only=False, dropna=True): + def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": """ Get the mode(s) of each element along the selected axis. @@ -8168,7 +8239,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): cols = Index([], name=self.columns.name) if is_list_like(q): return self._constructor([], index=q, columns=cols) - return self._constructor_sliced([], index=cols, name=q) + return self._constructor_sliced([], index=cols, name=q, dtype=np.float64) result = data._data.quantile( qs=q, axis=1, interpolation=interpolation, transposed=is_transposed @@ -8184,7 +8255,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): return result - def to_timestamp(self, freq=None, how="start", axis=0, copy=True): + def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame": """ Cast to DatetimeIndex of timestamps, at *beginning* of period. 
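# Illustrative sketch, not part of the diff: nunique, idxmin and idxmax (annotated
# above as returning Series) reduce along the requested axis.
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [7, 9, 8]})
df.nunique()        # a -> 2, b -> 3
df.idxmax()         # row label of the first maximum per column
df.idxmax(axis=1)   # column label of the maximum per row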
@@ -8214,11 +8285,11 @@ def to_timestamp(self, freq=None, how="start", axis=0, copy=True): elif axis == 1: new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how)) else: # pragma: no cover - raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis)) + raise AssertionError(f"Axis must be 0 or 1. Got {axis}") return self._constructor(new_data) - def to_period(self, freq=None, axis=0, copy=True): + def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame": """ Convert DataFrame from DatetimeIndex to PeriodIndex. @@ -8248,11 +8319,11 @@ def to_period(self, freq=None, axis=0, copy=True): elif axis == 1: new_data.set_axis(0, self.columns.to_period(freq=freq)) else: # pragma: no cover - raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis)) + raise AssertionError(f"Axis must be 0 or 1. Got {axis}") return self._constructor(new_data) - def isin(self, values): + def isin(self, values) -> "DataFrame": """ Whether each element in the DataFrame is contained in values. @@ -8319,12 +8390,14 @@ def isin(self, values): from pandas.core.reshape.concat import concat values = collections.defaultdict(list, values) - return concat( - ( - self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns) - ), - axis=1, + return self._ensure_type( + concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, + ) ) elif isinstance(values, Series): if not values.index.is_unique: @@ -8337,10 +8410,9 @@ def isin(self, values): else: if not is_list_like(values): raise TypeError( - "only list-like or dict-like objects are " - "allowed to be passed to DataFrame.isin(), " - "you passed a " - "{0!r}".format(type(values).__name__) + "only list-like or dict-like objects are allowed " + "to be passed to DataFrame.isin(), " + f"you passed a {repr(type(values).__name__)}" ) return DataFrame( algorithms.isin(self.values.ravel(), values).reshape(self.shape), @@ -8358,10 +8430,6 @@ def isin(self, values): DataFrame._setup_axes( ["index", "columns"], - info_axis=1, - stat_axis=0, - axes_are_reversed=True, - aliases={"rows": 0}, docs={ "index": "The index (row labels) of the DataFrame.", "columns": "The column labels of the DataFrame.", @@ -8376,13 +8444,13 @@ def isin(self, values): def _from_nested_dict(data): # TODO: this should be seriously cythonized - new_data = OrderedDict() + new_data = {} for index, s in data.items(): for col, v in s.items(): - new_data[col] = new_data.get(col, OrderedDict()) + new_data[col] = new_data.get(col, {}) new_data[col][index] = v return new_data def _put_str(s, space): - return "{s}".format(s=s)[:space].ljust(space) + return str(s)[:space].ljust(space) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f88c26c7bc782..0116207675889 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8,7 +8,6 @@ import re from textwrap import dedent from typing import ( - TYPE_CHECKING, Any, Callable, Dict, @@ -19,6 +18,8 @@ Optional, Sequence, Set, + Tuple, + Type, Union, ) import warnings @@ -28,7 +29,16 @@ from pandas._config import config -from pandas._libs import Timestamp, iNaT, properties +from pandas._libs import Timestamp, iNaT, lib, properties +from pandas._typing import ( + Axis, + Dtype, + FilePathOrBuffer, + FrameOrSeries, + JSONSerializable, + Level, + Renamer, +) from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -67,12 +77,12 @@ from pandas.core.dtypes.missing 
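# Illustrative sketch, not part of the diff: DataFrame.isin (annotated above as
# returning a DataFrame) accepts a list, dict or Series of values to test against.
import pandas as pd

df = pd.DataFrame({"num_legs": [2, 4], "num_wings": [2, 0]}, index=["falcon", "dog"])
df.isin([0, 2])                 # elementwise membership test
df.isin({"num_wings": [0, 3]})  # dict restricts the test to the named columns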
import isna, notna import pandas as pd -from pandas._typing import Dtype, FilePathOrBuffer, JSONSerializable from pandas.core import missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.index import ( +from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.indexes.api import ( Index, InvalidIndexError, MultiIndex, @@ -83,6 +93,7 @@ from pandas.core.indexes.period import Period, PeriodIndex import pandas.core.indexing as indexing from pandas.core.internals import BlockManager +from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME from pandas.io.formats import format as fmt @@ -92,7 +103,7 @@ # goal is to be able to define the docs close to function, while still being # able to share -_shared_docs = dict() # type: Dict[str, str] +_shared_docs: Dict[str, str] = dict() _shared_doc_kwargs = dict( axes="keywords for axes", klass="Series/DataFrame", @@ -103,10 +114,6 @@ Name or list of names to sort by""", ) -# sentinel value to use as kwarg in place of None when None has special meaning -# and needs to be distinguished from a user explicitly passing None. -sentinel = object() - def _single_replace(self, to_replace, method, inplace, limit): """ @@ -115,9 +122,8 @@ def _single_replace(self, to_replace, method, inplace, limit): """ if self.ndim != 1: raise TypeError( - "cannot replace {0} with method {1} on a {2}".format( - to_replace, method, type(self).__name__ - ) + f"cannot replace {to_replace} with method {method} on a " + f"{type(self).__name__}" ) orig_dtype = self.dtype @@ -142,7 +148,7 @@ def _single_replace(self, to_replace, method, inplace, limit): bool_t = bool # Need alias because NDFrame has def bool: -class NDFrame(PandasObject, SelectionMixin): +class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a size-mutable, labeled data structure @@ -154,7 +160,7 @@ class NDFrame(PandasObject, SelectionMixin): copy : bool, default False """ - _internal_names = [ + _internal_names: List[str] = [ "_data", "_cacher", "_item_cache", @@ -168,33 +174,15 @@ class NDFrame(PandasObject, SelectionMixin): "_metadata", "__array_struct__", "__array_interface__", - ] # type: List[str] - _internal_names_set = set(_internal_names) # type: Set[str] - _accessors = set() # type: Set[str] - _deprecations = frozenset( - [ - "as_blocks", - "as_matrix", - "blocks", - "clip_lower", - "clip_upper", - "get_dtype_counts", - "get_ftype_counts", - "get_values", - "is_copy", - "ftypes", - "ix", - ] - ) # type: FrozenSet[str] - _metadata = [] # type: List[str] + ] + _internal_names_set: Set[str] = set(_internal_names) + _accessors: Set[str] = set() + _deprecations: FrozenSet[str] = frozenset(["get_values", "ix"]) + _metadata: List[str] = [] _is_copy = None - _data = None # type: BlockManager - - if TYPE_CHECKING: - # TODO(PY36): replace with _attrs : Dict[Hashable, Any] - # We need the TYPE_CHECKING, because _attrs is not a class attribute - # and Py35 doesn't support the new syntax. 
- _attrs = {} # type: Dict[Optional[Hashable], Any] + _data: BlockManager + _attrs: Dict[Optional[Hashable], Any] + _typ: str # ---------------------------------------------------------------------- # Constructors @@ -251,6 +239,10 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): def attrs(self) -> Dict[Optional[Hashable], Any]: """ Dictionary of global attributes on this object. + + .. warning:: + + attrs is experimental and may change without warning. """ if self._attrs is None: self._attrs = {} @@ -260,29 +252,6 @@ def attrs(self) -> Dict[Optional[Hashable], Any]: def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) - @property - def is_copy(self): - """ - Return the copy. - """ - warnings.warn( - "Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", - FutureWarning, - stacklevel=2, - ) - return self._is_copy - - @is_copy.setter - def is_copy(self, msg): - warnings.warn( - "Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", - FutureWarning, - stacklevel=2, - ) - self._is_copy = msg - def _validate_dtype(self, dtype): """ validate the passed dtype """ @@ -293,7 +262,7 @@ def _validate_dtype(self, dtype): if dtype.kind == "V": raise NotImplementedError( "compound dtypes are not implemented" - " in the {0} constructor".format(self.__class__.__name__) + f" in the {type(self).__name__} constructor" ) return dtype @@ -302,7 +271,7 @@ def _validate_dtype(self, dtype): # Construction @property - def _constructor(self): + def _constructor(self: FrameOrSeries) -> Type[FrameOrSeries]: """Used when a manipulation result has the same dimensions as the original. """ @@ -324,71 +293,52 @@ def _constructor_expanddim(self): # ---------------------------------------------------------------------- # Axis + _AXIS_ALIASES = {"rows": 0} + _AXIS_IALIASES = {0: "rows"} + _stat_axis_number = 0 + _stat_axis_name = "index" + _ix = None + _AXIS_ORDERS: List[str] + _AXIS_NUMBERS: Dict[str, int] + _AXIS_NAMES: Dict[int, str] + _AXIS_REVERSED: bool + _info_axis_number: int + _info_axis_name: str + _AXIS_LEN: int @classmethod - def _setup_axes( - cls, - axes, - info_axis=None, - stat_axis=None, - aliases=None, - axes_are_reversed=False, - build_axes=True, - ns=None, - docs=None, - ): + def _setup_axes(cls, axes: List[str], docs: Dict[str, str]) -> None: """ Provide axes setup for the major PandasObjects. Parameters ---------- axes : the names of the axes in order (lowest to highest) - info_axis_num : the axis of the selector dimension (int) - stat_axis_num : the number of axis for the default stats (int) - aliases : other names for a single axis (dict) - axes_are_reversed : bool - Whether to treat passed axes as reversed (DataFrame). 
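# Illustrative sketch, not part of the diff: the docstring above marks `attrs` as
# experimental; it is a plain dict of global metadata carried on the object.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
df.attrs["source"] = "sensor-3"   # store arbitrary metadata (key/value are made up here)
df.attrs                          # {'source': 'sensor-3'}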
- build_axes : setup the axis properties (default True) + docs : docstrings for the axis properties """ + info_axis = len(axes) - 1 + axes_are_reversed = len(axes) > 1 cls._AXIS_ORDERS = axes cls._AXIS_NUMBERS = {a: i for i, a in enumerate(axes)} cls._AXIS_LEN = len(axes) - cls._AXIS_ALIASES = aliases or dict() - cls._AXIS_IALIASES = {v: k for k, v in cls._AXIS_ALIASES.items()} cls._AXIS_NAMES = dict(enumerate(axes)) cls._AXIS_REVERSED = axes_are_reversed - # typ - setattr(cls, "_typ", cls.__name__.lower()) - - # indexing support - cls._ix = None - - if info_axis is not None: - cls._info_axis_number = info_axis - cls._info_axis_name = axes[info_axis] - - if stat_axis is not None: - cls._stat_axis_number = stat_axis - cls._stat_axis_name = axes[stat_axis] + cls._info_axis_number = info_axis + cls._info_axis_name = axes[info_axis] # setup the actual axis - if build_axes: - - def set_axis(a, i): - setattr(cls, a, properties.AxisProperty(i, docs.get(a, a))) - cls._internal_names_set.add(a) + def set_axis(a, i): + setattr(cls, a, properties.AxisProperty(i, docs.get(a, a))) + cls._internal_names_set.add(a) - if axes_are_reversed: - m = cls._AXIS_LEN - 1 - for i, a in cls._AXIS_NAMES.items(): - set_axis(a, m - i) - else: - for i, a in cls._AXIS_NAMES.items(): - set_axis(a, i) - - assert not isinstance(ns, dict) + if axes_are_reversed: + for i, a in cls._AXIS_NAMES.items(): + set_axis(a, 1 - i) + else: + for i, a in cls._AXIS_NAMES.items(): + set_axis(a, i) def _construct_axes_dict(self, axes=None, **kwargs): """Return an axes dictionary for myself.""" @@ -404,7 +354,7 @@ def _construct_axes_dict_from(self, axes, **kwargs): return d def _construct_axes_from_arguments( - self, args, kwargs, require_all=False, sentinel=None + self, args, kwargs, require_all: bool = False, sentinel=None ): """Construct and returns axes if supplied in args/kwargs. 
@@ -420,20 +370,6 @@ def _construct_axes_from_arguments( args = list(args) for a in self._AXIS_ORDERS: - # if we have an alias for this axis - alias = self._AXIS_IALIASES.get(a) - if alias is not None: - if a in kwargs: - if alias in kwargs: - raise TypeError( - "arguments are mutually exclusive " - "for [%s,%s]" % (a, alias) - ) - continue - if alias in kwargs: - kwargs[a] = kwargs.pop(alias) - continue - # look for a argument by position if a not in kwargs: try: @@ -446,7 +382,7 @@ def _construct_axes_from_arguments( return axes, kwargs @classmethod - def _from_axes(cls, data, axes, **kwargs): + def _from_axes(cls: Type[FrameOrSeries], data, axes, **kwargs) -> FrameOrSeries: # for construction from BlockManager if isinstance(data, BlockManager): return cls(data, **kwargs) @@ -468,7 +404,7 @@ def _get_axis_number(cls, axis): return cls._AXIS_NUMBERS[axis] except KeyError: pass - raise ValueError("No axis named {0} for object type {1}".format(axis, cls)) + raise ValueError(f"No axis named {axis} for object type {cls}") @classmethod def _get_axis_name(cls, axis): @@ -481,7 +417,7 @@ def _get_axis_name(cls, axis): return cls._AXIS_NAMES[axis] except KeyError: pass - raise ValueError("No axis named {0} for object type {1}".format(axis, cls)) + raise ValueError(f"No axis named {axis} for object type {cls}") def _get_axis(self, axis): name = self._get_axis_name(axis) @@ -496,7 +432,7 @@ def _get_block_manager_axis(cls, axis): return m - axis return axis - def _get_axis_resolvers(self, axis): + def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: # index or columns axis_index = getattr(self, axis) d = dict() @@ -509,7 +445,7 @@ def _get_axis_resolvers(self, axis): # prefix with 'i' or 'c' depending on the input axis # e.g., you must do ilevel_0 for the 0th level of an unnamed # multiiindex - key = "{prefix}level_{i}".format(prefix=prefix, i=i) + key = f"{prefix}level_{i}" level = i level_values = axis_index.get_level_values(level) @@ -526,22 +462,31 @@ def _get_axis_resolvers(self, axis): d[axis] = dindex return d - def _get_index_resolvers(self): - d = {} + def _get_index_resolvers(self) -> Dict[str, ABCSeries]: + from pandas.core.computation.parsing import clean_column_name + + d: Dict[str, ABCSeries] = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) - return d - def _get_space_character_free_column_resolvers(self): - """Return the space character free column resolvers of a dataframe. + return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)} - Column names with spaces are 'cleaned up' so that they can be referred - to by backtick quoting. + def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: + """ + Return the special character free column resolvers of a dataframe. + + Column names with special characters are 'cleaned up' so that they can + be referred to by backtick quoting. Used in :meth:`DataFrame.eval`. 
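# Illustrative sketch, not part of the diff: _get_cleaned_column_resolvers above
# backs the backtick quoting used by DataFrame.query/eval for column names that are
# not valid Python identifiers.
import pandas as pd

df = pd.DataFrame({"max speed": [380, 25], "animal": ["falcon", "parrot"]})
df.query("`max speed` > 100")   # backtick-quoted name resolves to the column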
""" - from pandas.core.computation.common import _remove_spaces_column_name + from pandas.core.computation.parsing import clean_column_name - return {_remove_spaces_column_name(k): v for k, v in self.items()} + if isinstance(self, ABCSeries): + return {clean_column_name(self.name): self} + + return { + clean_column_name(k): v for k, v in self.items() if not isinstance(k, int) + } @property def _info_axis(self): @@ -552,14 +497,14 @@ def _stat_axis(self): return getattr(self, self._stat_axis_name) @property - def shape(self): + def shape(self) -> Tuple[int, ...]: """ Return a tuple of axis dimensions """ return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS) @property - def axes(self): + def axes(self) -> List[Index]: """ Return index label(s) of the internal NDFrame """ @@ -568,7 +513,7 @@ def axes(self): return [self._get_axis(a) for a in self._AXIS_ORDERS] @property - def ndim(self): + def ndim(self) -> int: """ Return an int representing the number of axes / array dimensions. @@ -615,12 +560,12 @@ def size(self): return np.prod(self.shape) @property - def _selected_obj(self): + def _selected_obj(self: FrameOrSeries) -> FrameOrSeries: """ internal compat with SelectionMixin """ return self @property - def _obj_with_exclusions(self): + def _obj_with_exclusions(self: FrameOrSeries) -> FrameOrSeries: """ internal compat with SelectionMixin """ return self @@ -705,17 +650,6 @@ def set_axis(self, labels, axis=0, inplace=False): 1 2 5 2 3 6 """ - if is_scalar(labels): - warnings.warn( - 'set_axis now takes "labels" as first argument, and ' - '"axis" as named parameter. The old form, with "axis" as ' - 'first parameter and "labels" as second, is still supported ' - "but will be deprecated in a future version of pandas.", - FutureWarning, - stacklevel=2, - ) - labels, axis = axis, labels - if inplace: setattr(self, self._get_axis_name(axis), labels) else: @@ -723,55 +657,11 @@ def set_axis(self, labels, axis=0, inplace=False): obj.set_axis(labels, axis=axis, inplace=True) return obj - def _set_axis(self, axis, labels): + def _set_axis(self, axis, labels) -> None: self._data.set_axis(axis, labels) self._clear_item_cache() - def transpose(self, *args, **kwargs): - """ - Permute the dimensions of the %(klass)s - - Parameters - ---------- - args : %(args_transpose)s - copy : bool, default False - Make a copy of the underlying data. Mixed-dtype data will - always result in a copy - **kwargs - Additional keyword arguments will be passed to the function. 
- - Returns - ------- - y : same as input - - Examples - -------- - >>> p.transpose(2, 0, 1) - >>> p.transpose(2, 0, 1, copy=True) - """ - - # construct the args - axes, kwargs = self._construct_axes_from_arguments( - args, kwargs, require_all=True - ) - axes_names = tuple(self._get_axis_name(axes[a]) for a in self._AXIS_ORDERS) - axes_numbers = tuple(self._get_axis_number(axes[a]) for a in self._AXIS_ORDERS) - - # we must have unique axes - if len(axes) != len(set(axes)): - raise ValueError("Must specify %s unique axes" % self._AXIS_LEN) - - new_axes = self._construct_axes_dict_from( - self, [self._get_axis(x) for x in axes_names] - ) - new_values = self.values.transpose(axes_numbers) - if kwargs.pop("copy", None) or (len(args) and args[-1]): - new_values = new_values.copy() - - nv.validate_transpose(tuple(), kwargs) - return self._constructor(new_values, **new_axes).__finalize__(self) - - def swapaxes(self, axis1, axis2, copy=True): + def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: """ Interchange axes and swap values axes appropriately. @@ -796,7 +686,7 @@ def swapaxes(self, axis1, axis2, copy=True): return self._constructor(new_values, *new_axes).__finalize__(self) - def droplevel(self, level, axis=0): + def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: """ Return DataFrame with requested index / column level(s) removed. @@ -813,7 +703,8 @@ def droplevel(self, level, axis=0): Returns ------- - DataFrame.droplevel() + DataFrame + DataFrame with requested index / column level(s) removed. Examples -------- @@ -855,7 +746,7 @@ def droplevel(self, level, axis=0): result = self.set_axis(new_labels, axis=axis, inplace=False) return result - def pop(self, item): + def pop(self: FrameOrSeries, item) -> FrameOrSeries: """ Return item and drop from frame. Raise KeyError if not found. @@ -873,7 +764,7 @@ def pop(self, item): >>> df = pd.DataFrame([('falcon', 'bird', 389.0), ... ('parrot', 'bird', 24.0), ... ('lion', 'mammal', 80.5), - ... ('monkey','mammal', np.nan)], + ... ('monkey', 'mammal', np.nan)], ... columns=('name', 'class', 'max_speed')) >>> df name class max_speed @@ -1016,7 +907,7 @@ def squeeze(self, axis=None): ) ] - def swaplevel(self, i=-2, j=-1, axis=0): + def swaplevel(self: FrameOrSeries, i=-2, j=-1, axis=0) -> FrameOrSeries: """ Swap levels i and j in a MultiIndex on a particular axis @@ -1038,7 +929,18 @@ def swaplevel(self, i=-2, j=-1, axis=0): # ---------------------------------------------------------------------- # Rename - def rename(self, *args, **kwargs): + def rename( + self: FrameOrSeries, + mapper: Optional[Renamer] = None, + *, + index: Optional[Renamer] = None, + columns: Optional[Renamer] = None, + axis: Optional[Axis] = None, + copy: bool = True, + inplace: bool = False, + level: Optional[Level] = None, + errors: str = "ignore", + ) -> Optional[FrameOrSeries]: """ Alter axes input function or functions. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left @@ -1151,46 +1053,48 @@ def rename(self, *args, **kwargs): See the :ref:`user guide ` for more. 
""" - axes, kwargs = self._construct_axes_from_arguments(args, kwargs) - copy = kwargs.pop("copy", True) - inplace = kwargs.pop("inplace", False) - level = kwargs.pop("level", None) - axis = kwargs.pop("axis", None) - errors = kwargs.pop("errors", "ignore") - if axis is not None: - # Validate the axis - self._get_axis_number(axis) - - if kwargs: - raise TypeError( - "rename() got an unexpected keyword " - 'argument "{0}"'.format(list(kwargs.keys())[0]) - ) - - if com.count_not_none(*axes.values()) == 0: + if mapper is None and index is None and columns is None: raise TypeError("must pass an index to rename") - self._consolidate_inplace() + if index is not None or columns is not None: + if axis is not None: + raise TypeError( + "Cannot specify both 'axis' and any of 'index' or 'columns'" + ) + elif mapper is not None: + raise TypeError( + "Cannot specify both 'mapper' and any of 'index' or 'columns'" + ) + else: + # use the mapper argument + if axis and self._get_axis_number(axis) == 1: + columns = mapper + else: + index = mapper + result = self if inplace else self.copy(deep=copy) - # start in the axis order to eliminate too many copies - for axis in range(self._AXIS_LEN): - v = axes.get(self._AXIS_NAMES[axis]) - if v is None: + for axis_no, replacements in enumerate((index, columns)): + if replacements is None: continue - f = com.get_rename_function(v) - baxis = self._get_block_manager_axis(axis) + + ax = self._get_axis(axis_no) + baxis = self._get_block_manager_axis(axis_no) + f = com.get_rename_function(replacements) + if level is not None: - level = self.axes[axis]._get_level_number(level) + level = ax._get_level_number(level) # GH 13473 - if not callable(v): - indexer = self.axes[axis].get_indexer_for(v) + if not callable(replacements): + indexer = ax.get_indexer_for(replacements) if errors == "raise" and len(indexer[indexer == -1]): missing_labels = [ - label for index, label in enumerate(v) if indexer[index] == -1 + label + for index, label in enumerate(replacements) + if indexer[index] == -1 ] - raise KeyError("{} not found in axis".format(missing_labels)) + raise KeyError(f"{missing_labels} not found in axis") result._data = result._data.rename_axis( f, axis=baxis, copy=copy, level=level @@ -1199,11 +1103,12 @@ def rename(self, *args, **kwargs): if inplace: self._update_inplace(result._data) + return None else: return result.__finalize__(self) @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)]) - def rename_axis(self, mapper=sentinel, **kwargs): + def rename_axis(self, mapper=lib.no_default, **kwargs): """ Set the name of the axis for the index or columns. 
@@ -1328,7 +1233,7 @@ class name monkey 2 2 """ axes, kwargs = self._construct_axes_from_arguments( - (), kwargs, sentinel=sentinel + (), kwargs, sentinel=lib.no_default ) copy = kwargs.pop("copy", True) inplace = kwargs.pop("inplace", False) @@ -1339,12 +1244,12 @@ class name if kwargs: raise TypeError( "rename_axis() got an unexpected keyword " - 'argument "{0}"'.format(list(kwargs.keys())[0]) + f'argument "{list(kwargs.keys())[0]}"' ) inplace = validate_bool_kwarg(inplace, "inplace") - if mapper is not sentinel: + if mapper is not lib.no_default: # Use v0.23 behavior if a scalar or list non_mapper = is_scalar(mapper) or ( is_list_like(mapper) and not is_dict_like(mapper) @@ -1360,7 +1265,7 @@ class name for axis in range(self._AXIS_LEN): v = axes.get(self._AXIS_NAMES[axis]) - if v is sentinel: + if v is lib.no_default: continue non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) if non_mapper: @@ -1437,7 +1342,7 @@ def _set_axis_name(self, name, axis=0, inplace=False): # ---------------------------------------------------------------------- # Comparison Methods - def _indexed_same(self, other): + def _indexed_same(self, other) -> bool: return all( self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS ) @@ -1471,10 +1376,11 @@ def equals(self, other): DataFrame.eq : Compare two DataFrame objects of the same shape and return a DataFrame where each element is True if the respective element in each DataFrame is equal, False otherwise. - assert_series_equal : Return True if left and right Series are equal, - False otherwise. - assert_frame_equal : Return True if left and right DataFrames are - equal, False otherwise. + testing.assert_series_equal : Raises an AssertionError if left and + right are not equal. Provides an easy interface to ignore + inequality in dtypes, indexes and precision among others. + testing.assert_frame_equal : Like assert_series_equal, but targets + DataFrames. numpy.array_equal : Return True if two arrays have the same shape and elements, False otherwise. @@ -1542,9 +1448,7 @@ def __neg__(self): ): arr = operator.neg(values) else: - raise TypeError( - "Unary negative expects numeric dtype, not {}".format(values.dtype) - ) + raise TypeError(f"Unary negative expects numeric dtype, not {values.dtype}") return self.__array_wrap__(arr) def __pos__(self): @@ -1558,9 +1462,7 @@ def __pos__(self): ): arr = operator.pos(values) else: - raise TypeError( - "Unary plus expects numeric dtype, not {}".format(values.dtype) - ) + raise TypeError(f"Unary plus expects numeric dtype, not {values.dtype}") return self.__array_wrap__(arr) def __invert__(self): @@ -1573,10 +1475,8 @@ def __invert__(self): def __nonzero__(self): raise ValueError( - "The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format( - self.__class__.__name__ - ) + f"The truth value of a {type(self).__name__} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." 
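# Illustrative sketch, not part of the diff: rename_axis behaves as before; only the
# internal sentinel changes from a module-level object to lib.no_default.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
df.rename_axis("rows")            # name the index axis
df.rename_axis(columns="cols")    # name the columns axis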
) __bool__ = __nonzero__ @@ -1600,15 +1500,15 @@ def bool(self): elif is_scalar(v): raise ValueError( "bool cannot act on a non-boolean single element " - "{0}".format(self.__class__.__name__) + f"{type(self).__name__}" ) self.__nonzero__() - def __abs__(self): + def __abs__(self: FrameOrSeries) -> FrameOrSeries: return self.abs() - def __round__(self, decimals=0): + def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries: return self.round(decimals) # ------------------------------------------------------------------------- @@ -1649,7 +1549,7 @@ def _is_level_reference(self, key, axis=0): and not self._is_label_reference(key, axis=axis) ) - def _is_label_reference(self, key, axis=0): + def _is_label_reference(self, key, axis=0) -> bool_t: """ Test whether a key is a label reference for a given axis. @@ -1678,7 +1578,7 @@ def _is_label_reference(self, key, axis=0): and any(key in self.axes[ax] for ax in other_axes) ) - def _is_label_or_level_reference(self, key, axis=0): + def _is_label_or_level_reference(self, key: str, axis: int = 0) -> bool_t: """ Test whether a key is a label or level reference for a given axis. @@ -1702,7 +1602,7 @@ def _is_label_or_level_reference(self, key, axis=0): key, axis=axis ) - def _check_label_or_level_ambiguity(self, key, axis=0): + def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None: """ Check whether `key` is ambiguous. @@ -1740,18 +1640,12 @@ def _check_label_or_level_ambiguity(self, key, axis=0): ) msg = ( - "'{key}' is both {level_article} {level_type} level and " - "{label_article} {label_type} label, which is ambiguous." - ).format( - key=key, - level_article=level_article, - level_type=level_type, - label_article=label_article, - label_type=label_type, + f"'{key}' is both {level_article} {level_type} level and " + f"{label_article} {label_type} label, which is ambiguous." ) raise ValueError(msg) - def _get_label_or_level_values(self, key, axis=0): + def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: """ Return a 1-D array of values associated with `key`, a label or level from the given `axis`. @@ -1812,18 +1706,14 @@ def _get_label_or_level_values(self, key, axis=0): label_axis_name = "column" if axis == 0 else "index" raise ValueError( ( - "The {label_axis_name} label '{key}' " - "is not unique.{multi_message}" - ).format( - key=key, - label_axis_name=label_axis_name, - multi_message=multi_message, + f"The {label_axis_name} label '{key}' " + f"is not unique.{multi_message}" ) ) return values - def _drop_labels_or_levels(self, keys, axis=0): + def _drop_labels_or_levels(self, keys, axis: int = 0): """ Drop labels and/or levels for the given `axis`. 
@@ -1861,8 +1751,8 @@ def _drop_labels_or_levels(self, keys, axis=0): raise ValueError( ( "The following keys are not valid labels or " - "levels for axis {axis}: {invalid_keys}" - ).format(axis=axis, invalid_keys=invalid_keys) + f"levels for axis {axis}: {invalid_keys}" + ) ) # Compute levels and labels to drop @@ -1905,8 +1795,8 @@ def _drop_labels_or_levels(self, keys, axis=0): def __hash__(self): raise TypeError( - "{0!r} objects are mutable, thus they cannot be" - " hashed".format(self.__class__.__name__) + f"{repr(type(self).__name__)} objects are mutable, " + f"thus they cannot be hashed" ) def __iter__(self): @@ -1950,16 +1840,16 @@ def items(self): def iteritems(self): return self.items() - def __len__(self): + def __len__(self) -> int: """Returns length of info axis""" return len(self._info_axis) - def __contains__(self, key): + def __contains__(self, key) -> bool_t: """True if the key is in the info axis""" return key in self._info_axis @property - def empty(self): + def empty(self) -> bool_t: """ Indicator whether DataFrame is empty. @@ -2014,10 +1904,15 @@ def empty(self): # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented __array_priority__ = 1000 - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: return com.values_from_object(self) def __array_wrap__(self, result, context=None): + result = lib.item_from_zerodim(result) + if is_scalar(result): + # e.g. we get here with np.ptp(series) + # ptp also requires the item_from_zerodim + return result d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) return self._constructor(result, **d).__finalize__(self) @@ -2029,37 +1924,17 @@ def __array_wrap__(self, result, context=None): # values = self.values # return dict(typestr=values.dtype.str,shape=values.shape,data=values) - def to_dense(self): - """ - Return dense representation of Series/DataFrame (as opposed to sparse). - - .. deprecated:: 0.25.0 - - Returns - ------- - %(klass)s - Dense %(klass)s. 
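# Illustrative sketch, not part of the diff: per the comment added to __array_wrap__
# above, zero-dimensional ufunc results such as np.ptp(series) now come back as
# plain scalars instead of being re-wrapped.
import numpy as np
import pandas as pd

s = pd.Series([1, 5, 3])
np.ptp(s)   # 4, a scalar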
- """ - warnings.warn( - "DataFrame/Series.to_dense is deprecated " - "and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - # compat - return self - # ---------------------------------------------------------------------- # Picklability - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: meta = {k: getattr(self, k, None) for k in self._metadata} return dict( _data=self._data, _typ=self._typ, _metadata=self._metadata, attrs=self.attrs, - **meta + **meta, ) def __setstate__(self, state): @@ -2088,29 +1963,19 @@ def __setstate__(self, state): else: self._unpickle_series_compat(state) - elif isinstance(state[0], dict): - if len(state) == 5: - self._unpickle_sparse_frame_compat(state) - else: - self._unpickle_frame_compat(state) - elif len(state) == 4: - self._unpickle_panel_compat(state) elif len(state) == 2: self._unpickle_series_compat(state) - else: # pragma: no cover - # old pickling format, for compatibility - self._unpickle_matrix_compat(state) self._item_cache = {} # ---------------------------------------------------------------------- # Rendering Methods - def __repr__(self): + def __repr__(self) -> str: # string representation based upon iterating over self # (since, by definition, `PandasContainers` are iterable) - prepr = "[%s]" % ",".join(map(pprint_thing, self)) - return "%s(%s)" % (self.__class__.__name__, prepr) + prepr = f"[{','.join(map(pprint_thing, self))}]" + return f"{type(self).__name__}({prepr})" def _repr_latex_(self): """ @@ -2137,6 +2002,30 @@ def _repr_data_resource_(self): # ---------------------------------------------------------------------- # I/O Methods + _shared_docs[ + "to_markdown" + ] = """ + Print %(klass)s in Markdown-friendly format. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + mode : str, optional + Mode in which file is opened. + **kwargs + These parameters will be passed to `tabulate`. + + Returns + ------- + str + %(klass)s in Markdown-friendly format. + """ + _shared_docs[ "to_excel" ] = """ @@ -2266,7 +2155,7 @@ def to_excel( inf_rep="inf", verbose=True, freeze_panes=None, - ): + ) -> None: df = self if isinstance(self, ABCDataFrame) else self.to_frame() from pandas.io.formats.excel import ExcelFormatter @@ -2474,7 +2363,23 @@ def to_json( indent=indent, ) - def to_hdf(self, path_or_buf, key, **kwargs): + def to_hdf( + self, + path_or_buf, + key: str, + mode: str = "a", + complevel: Optional[int] = None, + complib: Optional[str] = None, + append: bool_t = False, + format: Optional[str] = None, + index: bool_t = True, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + nan_rep=None, + dropna: Optional[bool_t] = None, + data_columns: Optional[List[str]] = None, + errors: str = "strict", + encoding: str = "UTF-8", + ) -> None: """ Write the contained data to an HDF5 file using HDFStore. @@ -2502,21 +2407,6 @@ def to_hdf(self, path_or_buf, key, **kwargs): - 'a': append, an existing file is opened for reading and writing, and if the file does not exist it is created. - 'r+': similar to 'a', but the file must already exist. - format : {'fixed', 'table'}, default 'fixed' - Possible values: - - - 'fixed': Fixed format. Fast writing/reading. Not-appendable, - nor searchable. - - 'table': Table format. 
Write as a PyTables Table structure - which may perform worse but allow more flexible operations - like searching / selecting subsets of the data. - append : bool, default False - For Table formats, append the input data to the existing. - data_columns : list of columns or True, optional - List of columns to create as indexed data columns for on-disk - queries, or True to use all columns. By default only the axes - of the object are indexed. See :ref:`io.hdf5-query-data-columns`. - Applicable only to format='table'. complevel : {0-9}, optional Specifies a compression level for data. A value of 0 disables compression. @@ -2528,14 +2418,33 @@ def to_hdf(self, path_or_buf, key, **kwargs): 'blosc:zlib', 'blosc:zstd'}. Specifying a compression library which is not available issues a ValueError. - fletcher32 : bool, default False - If applying compression use the fletcher32 checksum. - dropna : bool, default False - If true, ALL nan rows will not be written to store. + append : bool, default False + For Table formats, append the input data to the existing. + format : {'fixed', 'table', None}, default 'fixed' + Possible values: + + - 'fixed': Fixed format. Fast writing/reading. Not-appendable, + nor searchable. + - 'table': Table format. Write as a PyTables Table structure + which may perform worse but allow more flexible operations + like searching / selecting subsets of the data. + - If None, pd.get_option('io.hdf.default_format') is checked, + followed by fallback to "fixed" errors : str, default 'strict' Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. + encoding : str, default "UTF-8" + min_itemsize : dict or int, optional + Map column names to minimum string sizes for columns. + nan_rep : Any, optional + How to represent null values as str. + Not allowed with append=True. + data_columns : list of columns or True, optional + List of columns to create as indexed data columns for on-disk + queries, or True to use all columns. By default only the axes + of the object are indexed. See :ref:`io.hdf5-query-data-columns`. + Applicable only to format='table'. See Also -------- @@ -2577,62 +2486,36 @@ def to_hdf(self, path_or_buf, key, **kwargs): """ from pandas.io import pytables - pytables.to_hdf(path_or_buf, key, self, **kwargs) - - def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): - """ - Serialize object to input file path using msgpack format. - - .. deprecated:: 0.25.0 - - to_msgpack is deprecated and will be removed in a future version. - It is recommended to use pyarrow for on-the-wire transmission of - pandas objects. - - Example pyarrow usage: - - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame({'A': [1, 2, 3]}) - >>> context = pa.default_serialization_context() - >>> df_bytestring = context.serialize(df).to_buffer().to_pybytes() - - For documentation on pyarrow, see `here - `__. - - Parameters - ---------- - path : str, buffer-like, or None - Destination for the serialized object. - If None, return generated bytes. - append : bool, default False - Whether to append to an existing msgpack. - compress : str, default None - Type of compressor (zlib, blosc or None). - - Returns - ------- - None or bytes - If path_or_buf is None, returns the resulting msgpack format as a - byte string. Otherwise returns None. 
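# Illustrative sketch, not part of the diff: to_hdf now spells out its keyword
# arguments instead of taking **kwargs; typical table-format usage looks like this
# (requires PyTables; the file name is made up for the example).
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_hdf("store.h5", key="df", mode="w", format="table", data_columns=["a"])
pd.read_hdf("store.h5", "df", where="a > 1")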
- """ - - from pandas.io import packers - - return packers.to_msgpack(path_or_buf, self, encoding=encoding, **kwargs) + pytables.to_hdf( + path_or_buf, + key, + self, + mode=mode, + complevel=complevel, + complib=complib, + append=append, + format=format, + index=index, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + dropna=dropna, + data_columns=data_columns, + errors=errors, + encoding=encoding, + ) def to_sql( self, name: str, con, schema=None, - if_exists="fail", - index=True, + if_exists: str = "fail", + index: bool_t = True, index_label=None, chunksize=None, dtype=None, method=None, - ): + ) -> None: """ Write records stored in a DataFrame to a SQL database. @@ -2645,7 +2528,11 @@ def to_sql( Name of SQL table. con : sqlalchemy.engine.Engine or sqlite3.Connection Using SQLAlchemy makes it possible to use any DB supported by that - library. Legacy support is provided for sqlite3.Connection objects. + library. Legacy support is provided for sqlite3.Connection objects. The user + is responsible for engine disposal and connection closure for the SQLAlchemy + connectable See `here \ + `_ + schema : str, optional Specify the schema (if database flavor supports this). If None, use default schema. @@ -2775,7 +2662,12 @@ def to_sql( method=method, ) - def to_pickle(self, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): + def to_pickle( + self, + path, + compression: Optional[str] = "infer", + protocol: int = pickle.HIGHEST_PROTOCOL, + ) -> None: """ Pickle (serialize) object to file. @@ -2831,7 +2723,9 @@ def to_pickle(self, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL) to_pickle(self, path, compression=compression, protocol=protocol) - def to_clipboard(self, excel=True, sep=None, **kwargs): + def to_clipboard( + self, excel: bool_t = True, sep: Optional[str] = None, **kwargs + ) -> None: r""" Copy object to the system clipboard. @@ -2910,12 +2804,12 @@ def to_xarray(self): Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2), + >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2), ... ('parrot', 'bird', 24.0, 2), - ... ('lion', 'mammal', 80.5, 4), + ... ('lion', 'mammal', 80.5, 4), ... ('monkey', 'mammal', np.nan, 4)], - ... columns=['name', 'class', 'max_speed', - ... 'num_legs']) + ... columns=['name', 'class', 'max_speed', + ... 'num_legs']) >>> df name class max_speed num_legs 0 falcon bird 389.0 2 @@ -2943,10 +2837,11 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01', ... '2018-01-02', '2018-01-02']) >>> df_multiindex = pd.DataFrame({'date': dates, - ... 'animal': ['falcon', 'parrot', 'falcon', - ... 'parrot'], - ... 'speed': [350, 18, 361, 15]}).set_index(['date', - ... 'animal']) + ... 'animal': ['falcon', 'parrot', + ... 'falcon', 'parrot'], + ... 'speed': [350, 18, 361, 15]}) + >>> df_multiindex = df_multiindex.set_index(['date', 'animal']) + >>> df_multiindex speed date animal @@ -3221,7 +3116,7 @@ def to_csv( and mode is 'zip' or inferred as 'zip', other entries passed as additional compression options. - .. versionchanged:: 0.25.0 + .. versionchanged:: 1.0.0 May now be a dict with key 'method' as compression mode and other entries as additional compression options if @@ -3316,8 +3211,11 @@ def to_csv( # Fancy Indexing @classmethod - def _create_indexer(cls, name, indexer): - """Create an indexer like _name in the class.""" + def _create_indexer(cls, name: str, indexer) -> None: + """Create an indexer like _name in the class. + + Kept for compatibility with geopandas. 
To be removed in the future. See GH27258 + """ if getattr(cls, name, None) is None: _indexer = functools.partial(indexer, name) setattr(cls, name, property(_indexer, doc=indexer.__doc__)) @@ -3325,24 +3223,24 @@ def _create_indexer(cls, name, indexer): # ---------------------------------------------------------------------- # Lookup Caching - def _set_as_cached(self, item, cacher): + def _set_as_cached(self, item, cacher) -> None: """Set the _cacher attribute on the calling object with a weakref to cacher. """ self._cacher = (item, weakref.ref(cacher)) - def _reset_cacher(self): + def _reset_cacher(self) -> None: """Reset the cacher.""" if hasattr(self, "_cacher"): del self._cacher - def _maybe_cache_changed(self, item, value): + def _maybe_cache_changed(self, item, value) -> None: """The object has called back to us saying maybe it has changed. """ self._data.set(item, value) @property - def _is_cached(self): + def _is_cached(self) -> bool_t: """Return boolean indicating if self is cached or not.""" return getattr(self, "_cacher", None) is not None @@ -3353,7 +3251,9 @@ def _get_cacher(self): cacher = cacher[1]() return cacher - def _maybe_update_cacher(self, clear=False, verify_is_copy=True): + def _maybe_update_cacher( + self, clear: bool_t = False, verify_is_copy: bool_t = True + ) -> None: """ See if we need to update our parent cacher if clear, then clear our cache. @@ -3390,13 +3290,15 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): if clear: self._clear_item_cache() - def _clear_item_cache(self): + def _clear_item_cache(self) -> None: self._item_cache.clear() # ---------------------------------------------------------------------- # Indexing Methods - def take(self, indices, axis=0, is_copy=True, **kwargs): + def take( + self: FrameOrSeries, indices, axis=0, is_copy: Optional[bool_t] = None, **kwargs + ) -> FrameOrSeries: """ Return the elements in the given *positional* indices along an axis. @@ -3413,6 +3315,8 @@ def take(self, indices, axis=0, is_copy=True, **kwargs): selecting rows, ``1`` means that we are selecting columns. is_copy : bool, default True Whether to return a copy of the original object or not. + + .. deprecated:: 1.0.0 **kwargs For compatibility with :meth:`numpy.take`. Has no effect on the output. @@ -3430,12 +3334,12 @@ def take(self, indices, axis=0, is_copy=True, **kwargs): Examples -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), ... ('monkey', 'mammal', np.nan)], - ... columns=['name', 'class', 'max_speed'], - ... index=[0, 2, 3, 1]) + ... columns=['name', 'class', 'max_speed'], + ... index=[0, 2, 3, 1]) >>> df name class max_speed 0 falcon bird 389.0 @@ -3471,6 +3375,16 @@ class max_speed 1 monkey mammal NaN 3 lion mammal 80.5 """ + if is_copy is not None: + warnings.warn( + "is_copy is deprecated and will be removed in a future version. " + "take will always return a copy in the future.", + FutureWarning, + stacklevel=2, + ) + else: + is_copy = True + nv.validate_take(tuple(), kwargs) self._consolidate_inplace() @@ -3487,7 +3401,7 @@ class max_speed return result - def xs(self, key, axis=0, level=None, drop_level=True): + def xs(self, key, axis=0, level=None, drop_level: bool_t = True): """ Return cross-section from the Series/DataFrame. 
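# Illustrative sketch, not part of the diff: take deprecates is_copy above; the
# positional selection itself is unchanged.
import pandas as pd

df = pd.DataFrame({"a": [10, 20, 30]})
df.take([0, 2])                  # rows at positions 0 and 2
# df.take([0, 2], is_copy=True)  # now emits a FutureWarning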
@@ -3592,9 +3506,9 @@ class animal locomotion loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level) # create the tuple of the indexer - indexer = [slice(None)] * self.ndim - indexer[axis] = loc - indexer = tuple(indexer) + _indexer = [slice(None)] * self.ndim + _indexer[axis] = loc + indexer = tuple(_indexer) result = self.iloc[indexer] setattr(result, result._get_axis_name(axis), new_ax) @@ -3613,7 +3527,7 @@ class animal locomotion if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: - inds, = loc.nonzero() + (inds,) = loc.nonzero() return self.take(inds, axis=axis) else: return self.take(loc, axis=axis) @@ -3648,7 +3562,7 @@ class animal locomotion result._set_is_copy(self, copy=not result._is_view) return result - _xs = xs # type: Callable + _xs: Callable = xs def __getitem__(self, item): raise AbstractMethodError(self) @@ -3679,7 +3593,7 @@ def _iget_item_cache(self, item): def _box_item_values(self, key, values): raise AbstractMethodError(self) - def _slice(self, slobj: slice, axis=0, kind=None): + def _slice(self: FrameOrSeries, slobj: slice, axis=0, kind=None) -> FrameOrSeries: """ Construct a slice of this container. @@ -3695,11 +3609,11 @@ def _slice(self, slobj: slice, axis=0, kind=None): result._set_is_copy(self, copy=is_copy) return result - def _set_item(self, key, value): + def _set_item(self, key, value) -> None: self._data.set(key, value) self._clear_item_cache() - def _set_is_copy(self, ref=None, copy=True): + def _set_is_copy(self, ref=None, copy: bool_t = True) -> None: if not copy: self._is_copy = None else: @@ -3708,7 +3622,7 @@ def _set_is_copy(self, ref=None, copy=True): else: self._is_copy = None - def _check_is_chained_assignment_possible(self): + def _check_is_chained_assignment_possible(self) -> bool_t: """ Check if we are a view, have a cacher, and are of mixed type. If so, then force a setitem_copy check. @@ -3785,7 +3699,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): "A value is trying to be set on a copy of a slice from a " "DataFrame\n\n" "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" "indexing.html#returning-a-view-versus-a-copy" ) @@ -3796,7 +3710,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): "DataFrame.\n" "Try using .loc[row_indexer,col_indexer] = value " "instead\n\nSee the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" "indexing.html#returning-a-view-versus-a-copy" ) @@ -3805,7 +3719,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): elif value == "warn": warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel) - def __delitem__(self, key): + def __delitem__(self, key) -> None: """ Delete item """ @@ -3866,7 +3780,14 @@ def _is_view(self): """Return boolean indicating if self is view of another array """ return self._data.is_view - def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): + def reindex_like( + self: FrameOrSeries, + other, + method: Optional[str] = None, + copy: bool_t = True, + limit=None, + tolerance=None, + ) -> FrameOrSeries: """ Return an object with matching indices as other object. @@ -3930,9 +3851,10 @@ def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None ... [31, 87.8, 'high'], ... [22, 71.6, 'medium'], ... [35, 95, 'medium']], - ... 
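# Illustrative sketch, not part of the diff: xs (touched above) still selects a
# cross-section, optionally on a named MultiIndex level.
import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["k1", "k2"])
df = pd.DataFrame({"v": [1, 2, 3, 4]}, index=idx)
df.xs("a")              # all rows under the first-level key "a"
df.xs(1, level="k2")    # rows where the "k2" level equals 1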
columns=['temp_celsius', 'temp_fahrenheit', 'windspeed'], - ... index=pd.date_range(start='2014-02-12', - ... end='2014-02-15', freq='D')) + ... columns=['temp_celsius', 'temp_fahrenheit', + ... 'windspeed'], + ... index=pd.date_range(start='2014-02-12', + ... end='2014-02-15', freq='D')) >>> df1 temp_celsius temp_fahrenheit windspeed @@ -3944,9 +3866,9 @@ def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None >>> df2 = pd.DataFrame([[28, 'low'], ... [30, 'low'], ... [35.1, 'medium']], - ... columns=['temp_celsius', 'windspeed'], - ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', - ... '2014-02-15'])) + ... columns=['temp_celsius', 'windspeed'], + ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', + ... '2014-02-15'])) >>> df2 temp_celsius windspeed @@ -3978,8 +3900,8 @@ def drop( index=None, columns=None, level=None, - inplace=False, - errors="raise", + inplace: bool_t = False, + errors: str = "raise", ): inplace = validate_bool_kwarg(inplace, "inplace") @@ -4007,7 +3929,9 @@ def drop( else: return obj - def _drop_axis(self, labels, axis, level=None, errors="raise"): + def _drop_axis( + self: FrameOrSeries, labels, axis, level=None, errors: str = "raise" + ) -> FrameOrSeries: """ Drop labels from specified axis. Used in the ``drop`` method internally. @@ -4045,13 +3969,13 @@ def _drop_axis(self, labels, axis, level=None, errors="raise"): # GH 18561 MultiIndex.drop should raise if label is absent if errors == "raise" and indexer.all(): - raise KeyError("{} not found in axis".format(labels)) + raise KeyError(f"{labels} not found in axis") else: indexer = ~axis.isin(labels) # Check if label doesn't exist along axis labels_missing = (axis.get_indexer_for(labels) == -1).any() if errors == "raise" and labels_missing: - raise KeyError("{} not found in axis".format(labels)) + raise KeyError(f"{labels} not found in axis") slicer = [slice(None)] * self.ndim slicer[self._get_axis_number(axis_name)] = indexer @@ -4060,7 +3984,7 @@ def _drop_axis(self, labels, axis, level=None, errors="raise"): return result - def _update_inplace(self, result, verify_is_copy=True): + def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: """ Replace self internals with result. @@ -4077,7 +4001,7 @@ def _update_inplace(self, result, verify_is_copy=True): self._data = getattr(result, "_data", result) self._maybe_update_cacher(verify_is_copy=verify_is_copy) - def add_prefix(self, prefix): + def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: """ Prefix labels with string `prefix`. @@ -4116,7 +4040,7 @@ def add_prefix(self, prefix): item_3 4 dtype: int64 - >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) >>> df A B 0 1 3 @@ -4134,9 +4058,9 @@ def add_prefix(self, prefix): f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) + return self.rename(**mapper) # type: ignore - def add_suffix(self, suffix): + def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: """ Suffix labels with string `suffix`. 
@@ -4175,7 +4099,7 @@ def add_suffix(self, suffix): 3_item 4 dtype: int64 - >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) + >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) >>> df A B 0 1 3 @@ -4193,16 +4117,17 @@ def add_suffix(self, suffix): f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) + return self.rename(**mapper) # type: ignore def sort_values( self, by=None, axis=0, ascending=True, - inplace=False, - kind="quicksort", - na_position="last", + inplace: bool_t = False, + kind: str = "quicksort", + na_position: str = "last", + ignore_index: bool_t = False, ): """ Sort by the values along either axis. @@ -4225,6 +4150,10 @@ def sort_values( na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 Returns ------- @@ -4297,11 +4226,12 @@ def sort_index( self, axis=0, level=None, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - sort_remaining=True, + ascending: bool_t = True, + inplace: bool_t = False, + kind: str = "quicksort", + na_position: str = "last", + sort_remaining: bool_t = True, + ignore_index: bool_t = False, ): """ Sort object by labels (along an axis). @@ -4328,6 +4258,10 @@ def sort_index( sort_remaining : bool, default True If True and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 Returns ------- @@ -4351,7 +4285,7 @@ def sort_index( new_axis = labels.take(sort_index) return self.reindex(**{axis_name: new_axis}) - def reindex(self, *args, **kwargs): + def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: """ Conform %(klass)s to new index with optional filling logic. @@ -4424,10 +4358,9 @@ def reindex(self, *args, **kwargs): Create a dataframe with some fictional data. >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] - >>> df = pd.DataFrame({ - ... 'http_status': [200,200,404,404,301], - ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, - ... index=index) + >>> df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, + ... index=index) >>> df http_status response_time Firefox 200 0.04 @@ -4440,8 +4373,8 @@ def reindex(self, *args, **kwargs): values in the new index that do not have corresponding records in the dataframe are assigned ``NaN``. - >>> new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', - ... 'Chrome'] + >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', + ... 
'Chrome'] >>> df.reindex(new_index) http_status response_time Safari 404.0 0.07 @@ -4575,7 +4508,7 @@ def reindex(self, *args, **kwargs): if kwargs: raise TypeError( "reindex() got an unexpected keyword " - 'argument "{0}"'.format(list(kwargs.keys())[0]) + f'argument "{list(kwargs.keys())[0]}"' ) self._consolidate_inplace() @@ -4600,7 +4533,9 @@ def reindex(self, *args, **kwargs): axes, level, limit, tolerance, method, fill_value, copy ).__finalize__(self) - def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): + def _reindex_axes( + self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy + ) -> FrameOrSeries: """Perform the reindex for all the axes.""" obj = self for a in self._AXIS_ORDERS: @@ -4623,7 +4558,7 @@ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy) return obj - def _needs_reindex_multi(self, axes, method, level): + def _needs_reindex_multi(self, axes, method, level) -> bool_t: """Check if we do need a multi reindex.""" return ( (com.count_not_none(*axes.values()) == self._AXIS_LEN) @@ -4636,8 +4571,12 @@ def _reindex_multi(self, axes, copy, fill_value): raise AbstractMethodError(self) def _reindex_with_indexers( - self, reindexers, fill_value=None, copy=False, allow_dups=False - ): + self: FrameOrSeries, + reindexers, + fill_value=None, + copy: bool_t = False, + allow_dups: bool_t = False, + ) -> FrameOrSeries: """allow_dups indicates an internal call here """ # reindex doing multiple operations on different axes if indicated @@ -4668,7 +4607,13 @@ def _reindex_with_indexers( return self._constructor(new_data).__finalize__(self) - def filter(self, items=None, like=None, regex=None, axis=None): + def filter( + self: FrameOrSeries, + items=None, + like: Optional[str] = None, + regex: Optional[str] = None, + axis=None, + ) -> FrameOrSeries: """ Subset the dataframe rows or columns according to the specified index labels. @@ -4759,7 +4704,7 @@ def f(x): else: raise TypeError("Must pass either `items`, `like`, or `regex`") - def head(self, n=5): + def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: """ Return the first `n` rows. @@ -4767,6 +4712,9 @@ def head(self, n=5): on position. It is useful for quickly testing if your object has the right type of data in it. + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + Parameters ---------- n : int, default 5 @@ -4774,7 +4722,7 @@ def head(self, n=5): Returns ------- - obj_head : same type as caller + same type as caller The first `n` rows of the caller object. See Also @@ -4783,7 +4731,7 @@ def head(self, n=5): Examples -------- - >>> df = pd.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion', + >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) >>> df animal @@ -4814,11 +4762,22 @@ def head(self, n=5): 0 alligator 1 bee 2 falcon + + For negative values of `n` + + >>> df.head(-3) + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot """ return self.iloc[:n] - def tail(self, n=5): + def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: """ Return the last `n` rows. @@ -4826,6 +4785,9 @@ def tail(self, n=5): position. It is useful for quickly verifying data, for example, after sorting or appending rows. + For negative values of `n`, this function returns all rows except + the first `n` rows, equivalent to ``df[n:]``. 
+ Parameters ---------- n : int, default 5 @@ -4842,7 +4804,7 @@ def tail(self, n=5): Examples -------- - >>> df = pd.DataFrame({'animal':['alligator', 'bee', 'falcon', 'lion', + >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion', ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']}) >>> df animal @@ -4873,6 +4835,17 @@ def tail(self, n=5): 6 shark 7 whale 8 zebra + + For negative values of `n` + + >>> df.tail(-3) + animal + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra """ if n == 0: @@ -4880,14 +4853,14 @@ def tail(self, n=5): return self.iloc[-n:] def sample( - self, + self: FrameOrSeries, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None, - ): + ) -> FrameOrSeries: """ Return a random sample of items from an axis of object. @@ -4933,6 +4906,10 @@ def sample( numpy.random.choice: Generates a random sample from a given 1-D numpy array. + Notes + ----- + If `frac` > 1, `replace` should be set to `True`. + Examples -------- >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0], @@ -4963,6 +4940,20 @@ def sample( dog 4 0 2 fish 0 0 8 + An upsampled sample of the ``DataFrame`` with replacement: + Note that the `replace` parameter has to be `True` when `frac` > 1. + + >>> df.sample(frac=2, replace=True, random_state=1) + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + falcon 2 2 10 + falcon 2 2 10 + fish 0 0 8 + dog 4 0 2 + fish 0 0 8 + dog 4 0 2 + Using a DataFrame column as weights. Rows with larger value in the `num_specimen_seen` column are more likely to be sampled. @@ -5038,6 +5029,11 @@ def sample( # If no frac or n, default to n=1. if n is None and frac is None: n = 1 + elif frac is not None and frac > 1 and not replace: + raise ValueError( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." + ) elif n is not None and frac is None and n % 1 != 0: raise ValueError("Only integers accepted as `n` values") elif n is None and frac is not None: @@ -5052,7 +5048,7 @@ def sample( ) locs = rs.choice(axis_length, size=n, replace=replace, p=weights) - return self.take(locs, axis=axis, is_copy=False) + return self.take(locs, axis=axis) _shared_docs[ "pipe" ] = """ @@ -5225,7 +5221,9 @@ def pipe(self, func, *args, **kwargs): # ---------------------------------------------------------------------- # Attribute access - def __finalize__(self, other, method=None, **kwargs): + def __finalize__( + self: FrameOrSeries, other, method=None, **kwargs + ) -> FrameOrSeries: """ Propagate metadata from other to self. @@ -5245,7 +5243,7 @@ def __finalize__(self, other, method=None, **kwargs): object.__setattr__(self, name, getattr(other, name, None)) return self - def __getattr__(self, name): + def __getattr__(self, name: str): """After regular attribute access, try looking up the name This allows simpler access to columns for interactive use. """ @@ -5264,7 +5262,7 @@ def __getattr__(self, name): return self[name] return object.__getattribute__(self, name) - def __setattr__(self, name, value): + def __setattr__(self, name: str, value) -> None: """After regular attribute access, try setting the name This allows simpler access to columns for interactive use.
""" @@ -5329,7 +5327,7 @@ def _protect_consolidate(self, f): self._clear_item_cache() return result - def _consolidate_inplace(self): + def _consolidate_inplace(self) -> None: """Consolidate data in place and return None""" def f(): @@ -5337,7 +5335,7 @@ def f(): self._protect_consolidate(f) - def _consolidate(self, inplace=False): + def _consolidate(self, inplace: bool_t = False): """ Compute NDFrame with "consolidated" internals (data of each dtype grouped together in a single ndarray). @@ -5374,7 +5372,7 @@ def _is_datelike_mixed_type(self): f = lambda: self._data.is_datelike_mixed_type return self._protect_consolidate(f) - def _check_inplace_setting(self, value): + def _check_inplace_setting(self, value) -> bool_t: """ check whether we allow in-place setting with this type of value """ if self._is_mixed_type: @@ -5400,56 +5398,8 @@ def _get_bool_data(self): # ---------------------------------------------------------------------- # Internal Interface Methods - def as_matrix(self, columns=None): - """ - Convert the frame to its Numpy-array representation. - - .. deprecated:: 0.23.0 - Use :meth:`DataFrame.values` instead. - - Parameters - ---------- - columns : list, optional, default:None - If None, return all columns, otherwise, returns specified columns. - - Returns - ------- - values : ndarray - If the caller is heterogeneous and contains booleans or objects, - the result will be of dtype=object. See Notes. - - See Also - -------- - DataFrame.values - - Notes - ----- - Return is NOT a Numpy-matrix, rather, a Numpy-array. - - The dtype will be a lower-common-denominator dtype (implicit - upcasting); that is to say if the dtypes (even of numeric types) - are mixed, the one that accommodates all will be chosen. Use this - with care if you are not dealing with the blocks. - - e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcase to - int32. By numpy.find_common_type convention, mixing int64 and uint64 - will result in a float64 dtype. - - This method is provided for backwards compatibility. Generally, - it is recommended to use '.values'. - """ - warnings.warn( - "Method .as_matrix will be removed in a future version. " - "Use .values instead.", - FutureWarning, - stacklevel=2, - ) - self._consolidate_inplace() - return self._data.as_array(transpose=self._AXIS_REVERSED, items=columns) - @property - def values(self): + def values(self) -> np.ndarray: """ Return a Numpy representation of the DataFrame. @@ -5526,280 +5476,68 @@ def values(self): return self._data.as_array(transpose=self._AXIS_REVERSED) @property - def _values(self): + def _values(self) -> np.ndarray: """internal implementation""" return self.values @property - def _get_values(self): + def _get_values(self) -> np.ndarray: # compat return self.values - def get_values(self): + def _internal_get_values(self) -> np.ndarray: """ Return an ndarray after converting sparse values to dense. - .. deprecated:: 0.25.0 - Use ``np.asarray(..)`` or :meth:`DataFrame.values` instead. - This is the same as ``.values`` for non-sparse data. For sparse data contained in a `SparseArray`, the data are first converted to a dense representation. Returns ------- - numpy.ndarray - Numpy representation of DataFrame. - - See Also - -------- - values : Numpy representation of DataFrame. - SparseArray : Container for sparse data. - - Examples - -------- - >>> df = pd.DataFrame({'a': [1, 2], 'b': [True, False], - ... 
'c': [1.0, 2.0]}) - >>> df - a b c - 0 1 True 1.0 - 1 2 False 2.0 - - >>> df.get_values() - array([[1, True, 1.0], [2, False, 2.0]], dtype=object) - - >>> df = pd.DataFrame({"a": pd.SparseArray([1, None, None]), - ... "c": [1.0, 2.0, 3.0]}) - >>> df - a c - 0 1.0 1.0 - 1 NaN 2.0 - 2 NaN 3.0 - - >>> df.get_values() - array([[ 1., 1.], - [nan, 2.], - [nan, 3.]]) - """ - warnings.warn( - "The 'get_values' method is deprecated and will be removed in a " - "future version. Use '.values' or 'np.asarray(..)' instead.", - FutureWarning, - stacklevel=2, - ) - return self._internal_get_values() - - def _internal_get_values(self): - return self.values - - def get_dtype_counts(self): - """ - Return counts of unique dtypes in this object. - - .. deprecated:: 0.25.0 - - Use `.dtypes.value_counts()` instead. - - Returns - ------- - dtype : Series - Series with the count of columns with each dtype. - - See Also - -------- - dtypes : Return the dtypes in this object. - - Examples - -------- - >>> a = [['a', 1, 1.0], ['b', 2, 2.0], ['c', 3, 3.0]] - >>> df = pd.DataFrame(a, columns=['str', 'int', 'float']) - >>> df - str int float - 0 a 1 1.0 - 1 b 2 2.0 - 2 c 3 3.0 - - >>> df.get_dtype_counts() - float64 1 - int64 1 - object 1 - dtype: int64 - """ - warnings.warn( - "`get_dtype_counts` has been deprecated and will be " - "removed in a future version. For DataFrames use " - "`.dtypes.value_counts()", - FutureWarning, - stacklevel=2, - ) - - from pandas import Series - - return Series(self._data.get_dtype_counts()) - - def get_ftype_counts(self): - """ - Return counts of unique ftypes in this object. - - .. deprecated:: 0.23.0 - - Returns - ------- - dtype : Series - Series with the count of columns with each type and - sparsity (dense/sparse). - - See Also - -------- - ftypes : Return ftypes (indication of sparse/dense and dtype) in - this object. - - Examples - -------- - >>> a = [['a', 1, 1.0], ['b', 2, 2.0], ['c', 3, 3.0]] - >>> df = pd.DataFrame(a, columns=['str', 'int', 'float']) - >>> df - str int float - 0 a 1 1.0 - 1 b 2 2.0 - 2 c 3 3.0 - - >>> df.get_ftype_counts() # doctest: +SKIP - float64:dense 1 - int64:dense 1 - object:dense 1 - dtype: int64 - """ - warnings.warn( - "get_ftype_counts is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - - from pandas import Series - - return Series(self._data.get_ftype_counts()) - - @property - def dtypes(self): - """ - Return the dtypes in the DataFrame. - - This returns a Series with the data type of each column. - The result's index is the original DataFrame's columns. Columns - with mixed types are stored with the ``object`` dtype. See - :ref:`the User Guide ` for more. - - Returns - ------- - pandas.Series - The data type of each column. - - See Also - -------- - DataFrame.ftypes : Dtype and sparsity information. - - Examples - -------- - >>> df = pd.DataFrame({'float': [1.0], - ... 'int': [1], - ... 'datetime': [pd.Timestamp('20180310')], - ... 'string': ['foo']}) - >>> df.dtypes - float float64 - int int64 - datetime datetime64[ns] - string object - dtype: object - """ - from pandas import Series - - return Series(self._data.get_dtypes(), index=self._info_axis, dtype=np.object_) - - @property - def ftypes(self): - """ - Return the ftypes (indication of sparse/dense and dtype) in DataFrame. - - .. deprecated:: 0.25.0 - Use :func:`dtypes` instead. - - This returns a Series with the data type of each column. - The result's index is the original DataFrame's columns. 
Columns - with mixed types are stored with the ``object`` dtype. See - :ref:`the User Guide ` for more. - - Returns - ------- - pandas.Series - The data type and indication of sparse/dense of each column. - - See Also - -------- - DataFrame.dtypes: Series with just dtype information. - - Notes - ----- - Sparse data should have the same dtypes as its dense representation. - - Examples - -------- - >>> arr = np.random.RandomState(0).randn(100, 4) - >>> arr[arr < .8] = np.nan - >>> pd.DataFrame(arr).ftypes - 0 float64:dense - 1 float64:dense - 2 float64:dense - 3 float64:dense - dtype: object - """ - warnings.warn( - "DataFrame.ftypes is deprecated and will " - "be removed in a future version. " - "Use DataFrame.dtypes instead.", - FutureWarning, - stacklevel=2, - ) - - from pandas import Series - - return Series(self._data.get_ftypes(), index=self._info_axis, dtype=np.object_) - - def as_blocks(self, copy=True): - """ - Convert the frame to a dict of dtype -> Constructor Types. - - .. deprecated:: 0.21.0 - - NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in - as_matrix) - - Parameters - ---------- - copy : bool, default True - - Returns - ------- - dict - Mapping dtype -> Constructor Types. + numpy.ndarray + Numpy representation of DataFrame. + + See Also + -------- + values : Numpy representation of DataFrame. + SparseArray : Container for sparse data. """ - warnings.warn( - "as_blocks is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - return self._to_dict_of_blocks(copy=copy) + return self.values @property - def blocks(self): + def dtypes(self): """ - Internal property, property synonym for as_blocks(). + Return the dtypes in the DataFrame. + + This returns a Series with the data type of each column. + The result's index is the original DataFrame's columns. Columns + with mixed types are stored with the ``object`` dtype. See + :ref:`the User Guide ` for more. - .. deprecated:: 0.21.0 + Returns + ------- + pandas.Series + The data type of each column. + + Examples + -------- + >>> df = pd.DataFrame({'float': [1.0], + ... 'int': [1], + ... 'datetime': [pd.Timestamp('20180310')], + ... 'string': ['foo']}) + >>> df.dtypes + float float64 + int int64 + datetime datetime64[ns] + string object + dtype: object """ - return self.as_blocks() + from pandas import Series + + return Series(self._data.get_dtypes(), index=self._info_axis, dtype=np.object_) - def _to_dict_of_blocks(self, copy=True): + def _to_dict_of_blocks(self, copy: bool_t = True): """ Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. @@ -5811,7 +5549,9 @@ def _to_dict_of_blocks(self, copy=True): for k, v, in self._data.to_dict(copy=copy).items() } - def astype(self, dtype, copy=True, errors="raise"): + def astype( + self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise" + ) -> FrameOrSeries: """ Cast a pandas object to a specified dtype ``dtype``. 
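Each of the deprecated accessors removed above pointed at a spelling that survives this changeset: ``.values`` or ``np.asarray(..)`` instead of ``as_matrix``/``get_values``, and ``.dtypes.value_counts()`` instead of ``get_dtype_counts``. A minimal sketch of those replacement calls (the frame and its column names are invented for illustration):

import numpy as np
import pandas as pd

df = pd.DataFrame({"str": ["a", "b"], "int": [1, 2], "float": [1.0, 2.0]})

arr = np.asarray(df)               # replaces df.as_matrix() / df.get_values()
arr_alt = df.values                # same ndarray, via the property kept above
counts = df.dtypes.value_counts()  # replaces df.get_dtype_counts()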
@@ -5937,10 +5677,10 @@ def astype(self, dtype, copy=True, errors="raise"): elif is_extension_array_dtype(dtype) and self.ndim > 1: # GH 18099/22869: columnwise conversion to extension dtype # GH 24704: use iloc to handle duplicate column names - results = ( + results = [ self.iloc[:, i].astype(dtype, copy=copy) for i in range(len(self.columns)) - ) + ] else: # else, only a single dtype is given @@ -5952,7 +5692,7 @@ def astype(self, dtype, copy=True, errors="raise"): result.columns = self.columns return result - def copy(self, deep=True): + def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: """ Make a copy of this object's indices and data. @@ -6060,23 +5800,26 @@ def copy(self, deep=True): data = self._data.copy(deep=deep) return self._constructor(data).__finalize__(self) - def __copy__(self, deep=True): + def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: return self.copy(deep=deep) - def __deepcopy__(self, memo=None): + def __deepcopy__(self: FrameOrSeries, memo=None) -> FrameOrSeries: """ Parameters ---------- memo, default None Standard signature. Unused """ - if memo is None: - memo = {} return self.copy(deep=True) def _convert( - self, datetime=False, numeric=False, timedelta=False, coerce=False, copy=True - ): + self: FrameOrSeries, + datetime: bool_t = False, + numeric: bool_t = False, + timedelta: bool_t = False, + coerce: bool_t = False, + copy: bool_t = True, + ) -> FrameOrSeries: """ Attempt to infer better dtype for object columns @@ -6116,7 +5859,7 @@ def _convert( ) ).__finalize__(self) - def infer_objects(self): + def infer_objects(self: FrameOrSeries) -> FrameOrSeries: """ Attempt to infer better dtypes for object columns. @@ -6168,14 +5911,14 @@ def infer_objects(self): # Filling NA's def fillna( - self, + self: FrameOrSeries, value=None, method=None, axis=None, - inplace=False, + inplace: bool_t = False, limit=None, downcast=None, - ): + ) -> Optional[FrameOrSeries]: """ Fill NA/NaN values using the specified method. @@ -6211,8 +5954,8 @@ def fillna( Returns ------- - %(klass)s - Object with missing values filled. + %(klass)s or None + Object with missing values filled or None if ``inplace=True``. See Also -------- @@ -6309,16 +6052,16 @@ def fillna( if self.ndim == 1: if isinstance(value, (dict, ABCSeries)): - from pandas import Series - - value = Series(value) + value = create_series_with_explicit_dtype( + value, dtype_if_empty=object + ) elif not is_list_like(value): pass else: raise TypeError( '"value" parameter must be a scalar, dict ' "or Series, but you passed a " - '"{0}"'.format(type(value).__name__) + f'"{type(value).__name__}"' ) new_data = self._data.fillna( @@ -6348,34 +6091,47 @@ def fillna( elif isinstance(value, ABCDataFrame) and self.ndim == 2: new_data = self.where(self.notna(), value) else: - raise ValueError("invalid fill value with a %s" % type(value)) + raise ValueError(f"invalid fill value with a {type(value)}") if inplace: self._update_inplace(new_data) + return None else: return self._constructor(new_data).__finalize__(self) - def ffill(self, axis=None, inplace=False, limit=None, downcast=None): + def ffill( + self: FrameOrSeries, + axis=None, + inplace: bool_t = False, + limit=None, + downcast=None, + ) -> Optional[FrameOrSeries]: """ Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. Returns ------- - %(klass)s - Object with missing values filled. + %(klass)s or None + Object with missing values filled or None if ``inplace=True``. 
""" return self.fillna( method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast ) - def bfill(self, axis=None, inplace=False, limit=None, downcast=None): + def bfill( + self: FrameOrSeries, + axis=None, + inplace: bool_t = False, + limit=None, + downcast=None, + ) -> Optional[FrameOrSeries]: """ Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. Returns ------- - %(klass)s - Object with missing values filled. + %(klass)s or None + Object with missing values filled or None if ``inplace=True``. """ return self.fillna( method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast @@ -6706,8 +6462,8 @@ def replace( if not is_dict_like(to_replace): if not is_dict_like(regex): raise TypeError( - 'If "to_replace" and "value" are both None' - ' and "to_replace" is not a list, then ' + 'If "to_replace" and "value" are both None ' + 'and "to_replace" is not a list, then ' "regex must be a mapping" ) to_replace = regex @@ -6721,9 +6477,8 @@ def replace( if any(are_mappings): if not all(are_mappings): raise TypeError( - "If a nested mapping is passed, all values" - " of the top level mapping must be " - "mappings" + "If a nested mapping is passed, all values " + "of the top level mapping must be mappings" ) # passed a nested dict/Series to_rep_dict = {} @@ -6785,9 +6540,8 @@ def replace( if is_list_like(value): if len(to_replace) != len(value): raise ValueError( - "Replacement lists must match " - "in length. Expecting %d got %d " - % (len(to_replace), len(value)) + f"Replacement lists must match in length. " + f"Expecting {len(to_replace)} got {len(value)} " ) new_data = self._data.replace_list( @@ -6808,11 +6562,9 @@ def replace( or is_dict_like(regex) ): raise TypeError( - "'regex' must be a string or a compiled " - "regular expression or a list or dict of " - "strings or regular expressions, you " - "passed a" - " {0!r}".format(type(regex).__name__) + f"'regex' must be a string or a compiled regular expression " + f"or a list or dict of strings or regular expressions, " + f"you passed a {repr(type(regex).__name__)}" ) return self.replace( regex, value, inplace=inplace, limit=limit, regex=True @@ -6838,10 +6590,9 @@ def replace( to_replace=to_replace, value=value, inplace=inplace, regex=regex ) else: - msg = ('Invalid "to_replace" type: ' "{0!r}").format( - type(to_replace).__name__ + raise TypeError( + f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}' ) - raise TypeError(msg) # pragma: no cover if inplace: self._update_inplace(new_data) @@ -7041,7 +6792,7 @@ def interpolate( limit_direction="forward", limit_area=None, downcast=None, - **kwargs + **kwargs, ): """ Interpolate values according to different methods. @@ -7093,9 +6844,9 @@ def interpolate( if method not in methods and not is_numeric_or_datetime: raise ValueError( "Index column must be numeric or datetime type when " - "using {method} method other than linear. " + f"using {method} method other than linear. " "Try setting a numeric or datetime index column before " - "interpolating.".format(method=method) + "interpolating." 
) if isna(index).any(): @@ -7115,7 +6866,7 @@ def interpolate( limit_area=limit_area, inplace=inplace, downcast=downcast, - **kwargs + **kwargs, ) if inplace: @@ -7247,14 +6998,13 @@ def asof(self, where, subset=None): if not is_list: start = self.index[0] if isinstance(self.index, PeriodIndex): - where = Period(where, freq=self.index.freq).ordinal - start = start.ordinal + where = Period(where, freq=self.index.freq) if where < start: if not is_series: from pandas import Series - return Series(index=self.columns, name=where) + return Series(index=self.columns, name=where, dtype=np.float64) return np.nan # It's always much faster to use a *while* loop here for @@ -7293,7 +7043,8 @@ def asof(self, where, subset=None): # mask the missing missing = locs == -1 - data = self.take(locs, is_copy=False) + d = self.take(locs) + data = d.copy() data.index = where data.loc[missing] = np.nan return data if is_list else data.iloc[-1] @@ -7364,11 +7115,11 @@ def asof(self, where, subset=None): """ @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): + def isna(self: FrameOrSeries) -> FrameOrSeries: return isna(self).__finalize__(self) @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isnull(self): + def isnull(self: FrameOrSeries) -> FrameOrSeries: return isna(self).__finalize__(self) _shared_docs[ @@ -7434,14 +7185,14 @@ def isnull(self): """ @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): + def notna(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self) @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notnull(self): + def notnull(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self) - def _clip_with_scalar(self, lower, upper, inplace=False): + def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): if (lower is not None and np.any(isna(lower))) or ( upper is not None and np.any(isna(upper)) ): @@ -7489,7 +7240,15 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): threshold = _align_method_FRAME(self, threshold, axis) return self.where(subset, threshold, axis=axis, inplace=inplace) - def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs): + def clip( + self: FrameOrSeries, + lower=None, + upper=None, + axis=None, + inplace: bool_t = False, + *args, + **kwargs, + ) -> FrameOrSeries: """ Trim values at input threshold(s). @@ -7603,222 +7362,10 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs return result - def clip_upper(self, threshold, axis=None, inplace=False): - """ - Trim values above a given threshold. - - .. deprecated:: 0.24.0 - Use clip(upper=threshold) instead. - - Elements above the `threshold` will be changed to match the - `threshold` value(s). Threshold can be a single value or an array, - in the latter case it performs the truncation element-wise. - - Parameters - ---------- - threshold : numeric or array-like - Maximum value allowed. All values above threshold will be set to - this value. - - * float : every value is compared to `threshold`. - * array-like : The shape of `threshold` should match the object - it's compared to. When `self` is a Series, `threshold` should be - the length. When `self` is a DataFrame, `threshold` should 2-D - and the same shape as `self` for ``axis=None``, or 1-D and the - same length as the axis being compared. - - axis : {0 or 'index', 1 or 'columns'}, default 0 - Align object with `threshold` along the given axis. 
- inplace : bool, default False - Whether to perform the operation in place on the data. - - .. versionadded:: 0.21.0 - - Returns - ------- - Series or DataFrame - Original data with values trimmed. - - See Also - -------- - Series.clip : General purpose method to trim Series values to given - threshold(s). - DataFrame.clip : General purpose method to trim DataFrame values to - given threshold(s). - - Examples - -------- - >>> s = pd.Series([1, 2, 3, 4, 5]) - >>> s - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - dtype: int64 - - >>> s.clip(upper=3) - 0 1 - 1 2 - 2 3 - 3 3 - 4 3 - dtype: int64 - - >>> elemwise_thresholds = [5, 4, 3, 2, 1] - >>> elemwise_thresholds - [5, 4, 3, 2, 1] - - >>> s.clip(upper=elemwise_thresholds) - 0 1 - 1 2 - 2 3 - 3 2 - 4 1 - dtype: int64 - """ - warnings.warn( - "clip_upper(threshold) is deprecated, use clip(upper=threshold) instead", - FutureWarning, - stacklevel=2, - ) - return self._clip_with_one_bound( - threshold, method=self.le, axis=axis, inplace=inplace - ) - - def clip_lower(self, threshold, axis=None, inplace=False): - """ - Trim values below a given threshold. - - .. deprecated:: 0.24.0 - Use clip(lower=threshold) instead. - - Elements below the `threshold` will be changed to match the - `threshold` value(s). Threshold can be a single value or an array, - in the latter case it performs the truncation element-wise. - - Parameters - ---------- - threshold : numeric or array-like - Minimum value allowed. All values below threshold will be set to - this value. - - * float : every value is compared to `threshold`. - * array-like : The shape of `threshold` should match the object - it's compared to. When `self` is a Series, `threshold` should be - the length. When `self` is a DataFrame, `threshold` should 2-D - and the same shape as `self` for ``axis=None``, or 1-D and the - same length as the axis being compared. - - axis : {0 or 'index', 1 or 'columns'}, default 0 - Align `self` with `threshold` along the given axis. - - inplace : bool, default False - Whether to perform the operation in place on the data. - - .. versionadded:: 0.21.0 - - Returns - ------- - Series or DataFrame - Original data with values trimmed. - - See Also - -------- - Series.clip : General purpose method to trim Series values to given - threshold(s). - DataFrame.clip : General purpose method to trim DataFrame values to - given threshold(s). - - Examples - -------- - - Series single threshold clipping: - - >>> s = pd.Series([5, 6, 7, 8, 9]) - >>> s.clip(lower=8) - 0 8 - 1 8 - 2 8 - 3 8 - 4 9 - dtype: int64 - - Series clipping element-wise using an array of thresholds. `threshold` - should be the same length as the Series. - - >>> elemwise_thresholds = [4, 8, 7, 2, 5] - >>> s.clip(lower=elemwise_thresholds) - 0 5 - 1 8 - 2 7 - 3 8 - 4 9 - dtype: int64 - - DataFrames can be compared to a scalar. - - >>> df = pd.DataFrame({"A": [1, 3, 5], "B": [2, 4, 6]}) - >>> df - A B - 0 1 2 - 1 3 4 - 2 5 6 - - >>> df.clip(lower=3) - A B - 0 3 3 - 1 3 4 - 2 5 6 - - Or to an array of values. By default, `threshold` should be the same - shape as the DataFrame. - - >>> df.clip(lower=np.array([[3, 4], [2, 2], [6, 2]])) - A B - 0 3 4 - 1 3 4 - 2 6 6 - - Control how `threshold` is broadcast with `axis`. In this case - `threshold` should be the same length as the axis specified by - `axis`. 
- - >>> df.clip(lower=[3, 3, 5], axis='index') - A B - 0 3 3 - 1 3 4 - 2 5 6 - - >>> df.clip(lower=[4, 5], axis='columns') - A B - 0 4 5 - 1 4 5 - 2 5 6 - """ - warnings.warn( - "clip_lower(threshold) is deprecated, use clip(lower=threshold) instead", - FutureWarning, - stacklevel=2, - ) - return self._clip_with_one_bound( - threshold, method=self.ge, axis=axis, inplace=inplace - ) - - def groupby( - self, - by=None, - axis=0, - level=None, - as_index=True, - sort=True, - group_keys=True, - squeeze=False, - observed=False, - **kwargs - ): - """ - Group DataFrame or Series using a mapper or by a Series of columns. + _shared_docs[ + "groupby" + ] = """ + Group %(klass)s using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be @@ -7861,15 +7408,10 @@ def groupby( .. versionadded:: 0.23.0 - **kwargs - Optional, only accepts keyword argument 'mutated' and is passed - to groupby. - Returns ------- - DataFrameGroupBy or SeriesGroupBy - Depends on the calling object and returns groupby object that - contains information about the groups. + %(klass)sGroupBy + Returns a groupby object that contains information about the groups. See Also -------- @@ -7879,72 +7421,17 @@ def groupby( Notes ----- See the `user guide - `_ for more. - - Examples - -------- - >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - **Hierarchical Indexes** - - We can groupby different levels of a hierarchical index - using the `level` parameter: - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - ... index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level=1).mean() - Max Speed - Type - Captive 210.0 - Wild 185.0 - """ - from pandas.core.groupby.groupby import groupby - - if level is None and by is None: - raise TypeError("You have to supply one of 'by' and 'level'") - axis = self._get_axis_number(axis) - return groupby( - self, - by=by, - axis=axis, - level=level, - as_index=as_index, - sort=sort, - group_keys=group_keys, - squeeze=squeeze, - observed=observed, - **kwargs - ) + `_ for more. + """ - def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): + def asfreq( + self: FrameOrSeries, + freq, + method=None, + how: Optional[str] = None, + normalize: bool_t = False, + fill_value=None, + ) -> FrameOrSeries: """ Convert TimeSeries to specified frequency. @@ -7983,7 +7470,7 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -8047,7 +7534,9 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): fill_value=fill_value, ) - def at_time(self, time, asof=False, axis=None): + def at_time( + self: FrameOrSeries, time, asof: bool_t = False, axis=None + ) -> FrameOrSeries: """ Select values at particular time of day (e.g. 
9:30AM). @@ -8104,8 +7593,13 @@ def at_time(self, time, asof=False, axis=None): return self.take(indexer, axis=axis) def between_time( - self, start_time, end_time, include_start=True, include_end=True, axis=None - ): + self: FrameOrSeries, + start_time, + end_time, + include_start: bool_t = True, + include_end: bool_t = True, + axis=None, + ) -> FrameOrSeries: """ Select values between particular times of the day (e.g., 9:00-9:30 AM). @@ -8183,16 +7677,13 @@ def between_time( def resample( self, rule, - how=None, axis=0, - fill_method=None, - closed=None, - label=None, - convention="start", - kind=None, + closed: Optional[str] = None, + label: Optional[str] = None, + convention: str = "start", + kind: Optional[str] = None, loffset=None, - limit=None, - base=0, + base: int = 0, on=None, level=None, ): @@ -8208,22 +7699,10 @@ def resample( ---------- rule : DateOffset, Timedelta or str The offset string or object representing target conversion. - how : str - Method for down/re-sampling, default to 'mean' for downsampling. - - .. deprecated:: 0.18.0 - The new syntax is ``.resample(...).mean()``, or - ``.resample(...).apply()`` axis : {0 or 'index', 1 or 'columns'}, default 0 Which axis to use for up- or down-sampling. For `Series` this will default to 0, i.e. along the rows. Must be `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`. - fill_method : str, default None - Filling method for upsampling. - - .. deprecated:: 0.18.0 - The new syntax is ``.resample(...).()``, - e.g. ``.resample(...).pad()`` closed : {'right', 'left'}, default None Which side of bin interval is closed. The default is 'left' for all frequency offsets except for 'M', 'A', 'Q', 'BM', @@ -8241,10 +7720,6 @@ def resample( By default the input representation is retained. loffset : timedelta, default None Adjust the resampled time labels. - limit : int, default None - Maximum size gap when reindexing with `fill_method`. - - .. deprecated:: 0.18.0 base : int, default 0 For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could @@ -8274,7 +7749,7 @@ def resample( for more. To learn more about the offset strings, please see `this link - `__. + `__. Examples -------- @@ -8476,10 +7951,10 @@ def resample( 2000-01-04 36 90 """ - from pandas.core.resample import resample, _maybe_process_deprecations + from pandas.core.resample import resample axis = self._get_axis_number(axis) - r = resample( + return resample( self, freq=rule, label=label, @@ -8492,11 +7967,8 @@ def resample( key=on, level=level, ) - return _maybe_process_deprecations( - r, how=how, fill_method=fill_method, limit=limit - ) - def first(self, offset): + def first(self: FrameOrSeries, offset) -> FrameOrSeries: """ Method to subset initial periods of time series data based on a date offset. @@ -8551,14 +8023,14 @@ def first(self, offset): end_date = end = self.index[0] + offset # Tick-like, e.g. 3 weeks - if not offset.isAnchored() and hasattr(offset, "_inc"): + if not offset.is_anchored() and hasattr(offset, "_inc"): if end_date in self.index: end = self.index.searchsorted(end_date, side="left") return self.iloc[:end] return self.loc[:end] - def last(self, offset): + def last(self: FrameOrSeries, offset) -> FrameOrSeries: """ Method to subset final periods of time series data based on a date offset. 
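With the long-deprecated ``how``, ``fill_method`` and ``limit`` keywords dropped from ``resample`` above, only the chained spelling that the old deprecation messages recommended remains. A rough sketch of that spelling (the series and frequencies are invented):

import pandas as pd

index = pd.date_range("2000-01-01", periods=9, freq="T")
ts = pd.Series(range(9), index=index)

downsampled = ts.resample("3T").mean()  # previously: ts.resample("3T", how="mean")
upsampled = ts.resample("30S").pad()    # previously: ts.resample("30S", fill_method="pad")

Because the keywords are removed rather than merely deprecated, passing ``how=`` or ``fill_method=`` now raises a ``TypeError`` instead of a ``FutureWarning``.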
@@ -8584,7 +8056,7 @@ def last(self, offset): Examples -------- >>> i = pd.date_range('2018-04-09', periods=4, freq='2D') - >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i) + >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i) >>> ts A 2018-04-09 1 @@ -8616,14 +8088,14 @@ def last(self, offset): return self.iloc[start:] def rank( - self, + self: FrameOrSeries, axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, - ): + method: str = "average", + numeric_only: Optional[bool_t] = None, + na_option: str = "keep", + ascending: bool_t = True, + pct: bool_t = False, + ) -> FrameOrSeries: """ Compute numerical data ranks (1 through n) along axis. @@ -8867,7 +8339,7 @@ def align( fill_axis=fill_axis, ) else: # pragma: no cover - raise TypeError("unsupported type: %s" % type(other)) + raise TypeError(f"unsupported type: {type(other)}") def _align_frame( self, @@ -8875,7 +8347,7 @@ def _align_frame( join="outer", axis=None, level=None, - copy=True, + copy: bool_t = True, fill_value=None, method=None, limit=None, @@ -8917,8 +8389,12 @@ def _align_frame( ) if method is not None: - left = left.fillna(axis=fill_axis, method=method, limit=limit) - right = right.fillna(axis=fill_axis, method=method, limit=limit) + left = self._ensure_type( + left.fillna(method=method, axis=fill_axis, limit=limit) + ) + right = self._ensure_type( + right.fillna(method=method, axis=fill_axis, limit=limit) + ) # if DatetimeIndex have different tz, convert to UTC if is_datetime64tz_dtype(left.index): @@ -8935,7 +8411,7 @@ def _align_series( join="outer", axis=None, level=None, - copy=True, + copy: bool_t = True, fill_value=None, method=None, limit=None, @@ -9411,7 +8887,9 @@ def mask( """ @Appender(_shared_docs["shift"] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift( + self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None + ) -> FrameOrSeries: if periods == 0: return self.copy() @@ -9425,7 +8903,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): return self._constructor(new_data).__finalize__(self) - def slice_shift(self, periods=1, axis=0): + def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: """ Equivalent to `shift` without copying data. @@ -9462,7 +8940,9 @@ def slice_shift(self, periods=1, axis=0): return new_obj.__finalize__(self) - def tshift(self, periods=1, freq=None, axis=0): + def tshift( + self: FrameOrSeries, periods: int = 1, freq=None, axis=0 + ) -> FrameOrSeries: """ Shift the time index, using the index's frequency if available. @@ -9510,10 +8990,10 @@ def tshift(self, periods=1, freq=None, axis=0): if freq == orig_freq: new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods) - else: - msg = "Given freq %s does not match PeriodIndex freq %s" % ( - freq.rule_code, - orig_freq.rule_code, + elif orig_freq is not None: + msg = ( + f"Given freq {freq.rule_code} does not match" + f" PeriodIndex freq {orig_freq.rule_code}" ) raise ValueError(msg) else: @@ -9522,7 +9002,9 @@ def tshift(self, periods=1, freq=None, axis=0): return self._constructor(new_data).__finalize__(self) - def truncate(self, before=None, after=None, axis=None, copy=True): + def truncate( + self: FrameOrSeries, before=None, after=None, axis=None, copy: bool_t = True + ) -> FrameOrSeries: """ Truncate a Series or DataFrame before and after some index value. 
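The ``_align_frame`` hunk above appears to change only typing: the post-alignment ``fillna`` calls are wrapped in ``self._ensure_type`` for mypy, while the user-facing behaviour of ``align`` with ``method=`` stays the same. A small illustrative sketch of that code path (the data is invented):

import numpy as np
import pandas as pd

left = pd.DataFrame({"a": [1.0, np.nan, 3.0]}, index=[0, 1, 2])
right = pd.DataFrame({"a": [10.0, 20.0]}, index=[1, 3])

# Outer alignment introduces NaN for missing labels; method="ffill" then fills
# along fill_axis, which is the branch guarded by `if method is not None`.
left2, right2 = left.align(right, join="outer", method="ffill")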
@@ -9561,7 +9043,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'], ... 'B': ['f', 'g', 'h', 'i', 'j'], ... 'C': ['k', 'l', 'm', 'n', 'o']}, - ... index=[1, 2, 3, 4, 5]) + ... index=[1, 2, 3, 4, 5]) >>> df A B C 1 a f k @@ -9640,7 +9122,6 @@ def truncate(self, before=None, after=None, axis=None, copy=True): 2016-01-10 23:59:58 1 2016-01-10 23:59:59 1 """ - if axis is None: axis = self._stat_axis_number axis = self._get_axis_number(axis) @@ -9661,7 +9142,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): if before is not None and after is not None: if before > after: - raise ValueError("Truncate: %s must be after %s" % (after, before)) + raise ValueError(f"Truncate: {after} must be after {before}") slicer = [slice(None, None)] * self._AXIS_LEN slicer[axis] = slice(before, after) @@ -9675,7 +9156,9 @@ def truncate(self, before=None, after=None, axis=None, copy=True): return result - def tz_convert(self, tz, axis=0, level=None, copy=True): + def tz_convert( + self: FrameOrSeries, tz, axis=0, level=None, copy: bool_t = True + ) -> FrameOrSeries: """ Convert tz-aware axis to target time zone. @@ -9707,7 +9190,7 @@ def _tz_convert(ax, tz): if len(ax) > 0: ax_name = self._get_axis_name(axis) raise TypeError( - "%s is not a valid DatetimeIndex or PeriodIndex" % ax_name + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" ) else: ax = DatetimeIndex([], tz=tz) @@ -9723,7 +9206,7 @@ def _tz_convert(ax, tz): ax = ax.set_levels(new_level, level=level) else: if level not in (None, 0, ax.name): - raise ValueError("The level {0} is not valid".format(level)) + raise ValueError(f"The level {level} is not valid") ax = _tz_convert(ax, tz) result = self._constructor(self._data, copy=copy) @@ -9731,8 +9214,14 @@ def _tz_convert(ax, tz): return result.__finalize__(self) def tz_localize( - self, tz, axis=0, level=None, copy=True, ambiguous="raise", nonexistent="raise" - ): + self: FrameOrSeries, + tz, + axis=0, + level=None, + copy: bool_t = True, + ambiguous="raise", + nonexistent: str = "raise", + ) -> FrameOrSeries: """ Localize tz-naive index of a Series or DataFrame to target time zone. @@ -9795,7 +9284,7 @@ def tz_localize( Localize local times: >>> s = pd.Series([1], - ... index=pd.DatetimeIndex(['2018-09-15 01:30:00'])) + ... index=pd.DatetimeIndex(['2018-09-15 01:30:00'])) >>> s.tz_localize('CET') 2018-09-15 01:30:00+02:00 1 dtype: int64 @@ -9803,14 +9292,14 @@ def tz_localize( Be careful with DST changes. When there is sequential data, pandas can infer the DST time: - >>> s = pd.Series(range(7), index=pd.DatetimeIndex([ - ... '2018-10-28 01:30:00', - ... '2018-10-28 02:00:00', - ... '2018-10-28 02:30:00', - ... '2018-10-28 02:00:00', - ... '2018-10-28 02:30:00', - ... '2018-10-28 03:00:00', - ... '2018-10-28 03:30:00'])) + >>> s = pd.Series(range(7), + ... index=pd.DatetimeIndex(['2018-10-28 01:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 02:00:00', + ... '2018-10-28 02:30:00', + ... '2018-10-28 03:00:00', + ... '2018-10-28 03:30:00'])) >>> s.tz_localize('CET', ambiguous='infer') 2018-10-28 01:30:00+02:00 0 2018-10-28 02:00:00+02:00 1 @@ -9824,10 +9313,10 @@ def tz_localize( In some cases, inferring the DST is impossible. In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly - >>> s = pd.Series(range(3), index=pd.DatetimeIndex([ - ... '2018-10-28 01:20:00', - ... '2018-10-28 02:36:00', - ... 
'2018-10-28 03:46:00'])) + >>> s = pd.Series(range(3), + ... index=pd.DatetimeIndex(['2018-10-28 01:20:00', + ... '2018-10-28 02:36:00', + ... '2018-10-28 03:46:00'])) >>> s.tz_localize('CET', ambiguous=np.array([True, True, False])) 2018-10-28 01:20:00+02:00 0 2018-10-28 02:36:00+02:00 1 @@ -9837,9 +9326,9 @@ def tz_localize( If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. - >>> s = pd.Series(range(2), index=pd.DatetimeIndex([ - ... '2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) + >>> s = pd.Series(range(2), + ... index=pd.DatetimeIndex(['2015-03-29 02:30:00', + ... '2015-03-29 03:30:00'])) >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward') 2015-03-29 03:00:00+02:00 0 2015-03-29 03:30:00+02:00 1 @@ -9858,9 +9347,9 @@ def tz_localize( nonexistent, timedelta ): raise ValueError( - "The nonexistent argument must be one of 'raise'," - " 'NaT', 'shift_forward', 'shift_backward' or" - " a timedelta object" + "The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or " + "a timedelta object" ) axis = self._get_axis_number(axis) @@ -9871,7 +9360,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): if len(ax) > 0: ax_name = self._get_axis_name(axis) raise TypeError( - "%s is not a valid DatetimeIndex or PeriodIndex" % ax_name + f"{ax_name} is not a valid DatetimeIndex or PeriodIndex" ) else: ax = DatetimeIndex([], tz=tz) @@ -9887,7 +9376,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): ax = ax.set_levels(new_level, level=level) else: if level not in (None, 0, ax.name): - raise ValueError("The level {0} is not valid".format(level)) + raise ValueError(f"The level {level} is not valid") ax = _tz_localize(ax, tz, ambiguous, nonexistent) result = self._constructor(self._data, copy=copy) @@ -9896,7 +9385,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): # ---------------------------------------------------------------------- # Numeric Methods - def abs(self): + def abs(self: FrameOrSeries) -> FrameOrSeries: """ Return a Series/DataFrame with absolute numeric value of each element. @@ -9965,7 +9454,9 @@ def abs(self): """ return np.abs(self) - def describe(self, percentiles=None, include=None, exclude=None): + def describe( + self: FrameOrSeries, percentiles=None, include=None, exclude=None + ) -> FrameOrSeries: """ Generate descriptive statistics. @@ -10301,7 +9792,7 @@ def describe_1d(data): ldesc = [describe_1d(s) for _, s in data.items()] # set a convenient order for rows - names = [] + names: List[Optional[Hashable]] = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) for idxnames in ldesc_indexes: for name in idxnames: @@ -10430,19 +9921,29 @@ def describe_1d(data): """ @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs) - def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): + def pct_change( + self: FrameOrSeries, + periods=1, + fill_method="pad", + limit=None, + freq=None, + **kwargs, + ) -> FrameOrSeries: # TODO: Not sure if above is correct - need someone to confirm. 
axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name)) if fill_method is None: data = self else: - data = self.fillna(method=fill_method, limit=limit, axis=axis) + data = self._ensure_type( + self.fillna(method=fill_method, axis=axis, limit=limit) + ) rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 - rs = rs.reindex_like(data) - if freq is None: - mask = isna(com.values_from_object(data)) - np.putmask(rs.values, mask, np.nan) + if freq is not None: + # Shift method is implemented differently when freq is not None + # We want to restore the original index + rs = rs.loc[~rs.index.duplicated()] + rs = rs.reindex_like(data) return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): @@ -10550,29 +10051,6 @@ def mad(self, axis=None, skipna=None, level=None): nanops.nanstd, ) - @Substitution( - desc="Return the compound percentage of the values for " - "the requested axis.\n\n.. deprecated:: 0.25.0", - name1=name, - name2=name2, - axis_descr=axis_descr, - min_count="", - see_also="", - examples="", - ) - @Appender(_num_doc) - def compound(self, axis=None, skipna=None, level=None): - msg = ( - "The 'compound' method is deprecated and will be" - "removed in a future version." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - if skipna is None: - skipna = True - return (1 + self).prod(axis=axis, skipna=skipna, level=level) - 1 - - cls.compound = compound - cls.cummin = _make_cum_function( cls, "cummin", @@ -10580,7 +10058,7 @@ def compound(self, axis=None, skipna=None, level=None): name2, axis_descr, "minimum", - lambda y, axis: np.minimum.accumulate(y, axis), + np.minimum.accumulate, "min", np.inf, np.nan, @@ -10593,7 +10071,7 @@ def compound(self, axis=None, skipna=None, level=None): name2, axis_descr, "sum", - lambda y, axis: y.cumsum(axis), + np.cumsum, "sum", 0.0, np.nan, @@ -10606,7 +10084,7 @@ def compound(self, axis=None, skipna=None, level=None): name2, axis_descr, "product", - lambda y, axis: y.cumprod(axis), + np.cumprod, "prod", 1.0, np.nan, @@ -10619,7 +10097,7 @@ def compound(self, axis=None, skipna=None, level=None): name2, axis_descr, "maximum", - lambda y, axis: np.maximum.accumulate(y, axis), + np.maximum.accumulate, "max", -np.inf, np.nan, @@ -10716,40 +10194,6 @@ def compound(self, axis=None, skipna=None, level=None): _min_examples, ) - @classmethod - def _add_series_only_operations(cls): - """ - Add the series only operations to the cls; evaluate the doc - strings again. - """ - - axis_descr, name, name2 = _doc_parms(cls) - - def nanptp(values, axis=0, skipna=True): - nmax = nanops.nanmax(values, axis, skipna) - nmin = nanops.nanmin(values, axis, skipna) - warnings.warn( - "Method .ptp is deprecated and will be removed " - "in a future version. Use numpy.ptp instead.", - FutureWarning, - stacklevel=4, - ) - return nmax - nmin - - cls.ptp = _make_stat_function( - cls, - "ptp", - name, - name2, - axis_descr, - """Return the difference between the min and max value. - \n.. deprecated:: 0.24.0 Use numpy.ptp instead - \nReturn the difference between the maximum value and the - minimum value in the object. 
This is the equivalent of the - ``numpy.ndarray`` method ``ptp``.""", - nanptp, - ) - @classmethod def _add_series_or_dataframe_operations(cls): """ @@ -10870,27 +10314,11 @@ def _find_valid_index(self, how: str): ------- idx_first_valid : type of index """ - assert how in ["first", "last"] - if len(self) == 0: # early stop + idxpos = find_valid_index(self._values, how) + if idxpos is None: return None - is_valid = ~self.isna() - - if self.ndim == 2: - is_valid = is_valid.any(1) # reduce axis 1 - - if how == "first": - idxpos = is_valid.values[::].argmax() - - if how == "last": - idxpos = len(self) - 1 - is_valid.values[::-1].argmax() - - chk_notna = is_valid.iat[idxpos] - idx = self.index[idxpos] - - if not chk_notna: - return None - return idx + return self.index[idxpos] @Appender( _shared_docs["valid_index"] % {"position": "first", "klass": "Series/DataFrame"} @@ -10907,8 +10335,8 @@ def last_valid_index(self): def _doc_parms(cls): """Return a tuple of the doc parms.""" - axis_descr = "{%s}" % ", ".join( - "{0} ({1})".format(a, i) for i, a in enumerate(cls._AXIS_ORDERS) + axis_descr = ( + f"{{{', '.join(f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS))}}}" ) name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar" name2 = cls.__name__ @@ -11565,7 +10993,7 @@ def _doc_parms(cls): def _make_min_count_stat_function( - cls, name, name1, name2, axis_descr, desc, f, see_also="", examples="" + cls, name, name1, name2, axis_descr, desc, f, see_also: str = "", examples: str = "" ): @Substitution( desc=desc, @@ -11584,7 +11012,7 @@ def stat_func( level=None, numeric_only=None, min_count=0, - **kwargs + **kwargs, ): if name == "sum": nv.validate_sum(tuple(), kwargs) @@ -11613,7 +11041,7 @@ def stat_func( def _make_stat_function( - cls, name, name1, name2, axis_descr, desc, f, see_also="", examples="" + cls, name, name1, name2, axis_descr, desc, f, see_also: str = "", examples: str = "" ): @Substitution( desc=desc, @@ -11696,19 +11124,64 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): else: axis = self._get_axis_number(axis) - y = com.values_from_object(self).copy() - - if skipna and issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): - result = accum_func(y, axis) - mask = isna(self) - np.putmask(result, mask, iNaT) - elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): - mask = isna(self) - np.putmask(y, mask, mask_a) - result = accum_func(y, axis) - np.putmask(result, mask, mask_b) - else: - result = accum_func(y, axis) + if axis == 1: + return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T + + def na_accum_func(blk_values): + # We will be applying this function to block values + if blk_values.dtype.kind in ["m", "M"]: + # GH#30460, GH#29058 + # numpy 1.18 started sorting NaTs at the end instead of beginning, + # so we need to work around to maintain backwards-consistency. 
+ orig_dtype = blk_values.dtype + + # We need to define mask before masking NaTs + mask = isna(blk_values) + + if accum_func == np.minimum.accumulate: + # Note: the accum_func comparison fails as an "is" comparison + y = blk_values.view("i8") + y[mask] = np.iinfo(np.int64).max + changed = True + else: + y = blk_values + changed = False + + result = accum_func(y.view("i8"), axis) + if skipna: + np.putmask(result, mask, iNaT) + elif accum_func == np.minimum.accumulate: + # Restore NaTs that we masked previously + nz = (~np.asarray(mask)).nonzero()[0] + if len(nz): + # everything up to the first non-na entry stays NaT + result[: nz[0]] = iNaT + + if changed: + # restore NaT elements + y[mask] = iNaT # TODO: could try/finally for this? + + if isinstance(blk_values, np.ndarray): + result = result.view(orig_dtype) + else: + # DatetimeArray + result = type(blk_values)._from_sequence(result, dtype=orig_dtype) + + elif skipna and not issubclass( + blk_values.dtype.type, (np.integer, np.bool_) + ): + vals = blk_values.copy().T + mask = isna(vals) + np.putmask(vals, mask, mask_a) + result = accum_func(vals, axis) + np.putmask(result, mask, mask_b) + else: + result = accum_func(blk_values.T, axis) + + # transpose back for ndarray, not for EA + return result.T if hasattr(result, "T") else result + + result = self._data.apply(na_accum_func) d = self._construct_axes_dict() d["copy"] = False @@ -11748,8 +11221,3 @@ def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs ) return set_function_name(logical_func, name, cls) - - -# install the indexes -for _name, _indexer in indexing.get_indexers_list(): - NDFrame._create_indexer(_name, _indexer) diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 252f20ed40068..0c5d2658978b4 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,7 +1,11 @@ -from pandas.core.groupby.generic import ( # noqa: F401 - DataFrameGroupBy, - NamedAgg, - SeriesGroupBy, -) -from pandas.core.groupby.groupby import GroupBy # noqa: F401 -from pandas.core.groupby.grouper import Grouper # noqa: F401 +from pandas.core.groupby.generic import DataFrameGroupBy, NamedAgg, SeriesGroupBy +from pandas.core.groupby.groupby import GroupBy +from pandas.core.groupby.grouper import Grouper + +__all__ = [ + "DataFrameGroupBy", + "NamedAgg", + "SeriesGroupBy", + "GroupBy", + "Grouper", +] diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index fc3bb69afd0cb..700d8d503d086 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -1,32 +1,20 @@ """ -Provide basic components for groupby. These defintiions +Provide basic components for groupby. These definitions hold the whitelist of methods that are exposed on the SeriesGroupBy and the DataFrameGroupBy objects. """ +import collections + from pandas.core.dtypes.common import is_list_like, is_scalar +OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) + class GroupByMixin: """ Provide the groupby facilities to the mixed object. """ - @staticmethod - def _dispatch(name, *args, **kwargs): - """ - Dispatch to apply. - """ - - def outer(self, *args, **kwargs): - def f(x): - x = self._shallow_copy(x, groupby=self._groupby) - return getattr(x, name)(*args, **kwargs) - - return self._groupby.apply(f) - - outer.__name__ = name - return outer - def _gotitem(self, key, ndim, subset=None): """ Sub-classes to define. Return a sliced object. 
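The NaT handling added in the ``na_accum_func`` hunk above works by hiding NaT behind the largest int64 before ``np.minimum.accumulate`` and writing the ``iNaT`` sentinel back afterwards. A standalone numpy sketch of that trick, not the pandas implementation itself (the datetime values are invented):

import numpy as np

values = np.array(["2020-01-02", "NaT", "2020-01-01"], dtype="datetime64[ns]")
mask = np.isnat(values)

ints = values.view("i8").copy()
ints[mask] = np.iinfo(np.int64).max    # keep NaT from winning the running minimum
result = np.minimum.accumulate(ints)
result[mask] = np.iinfo(np.int64).min  # iNaT sentinel: restore NaT (skipna behaviour)
result = result.view("M8[ns]")
# result is ['2020-01-02', 'NaT', '2020-01-01']: the running minimum with NaT skipped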
@@ -53,7 +41,7 @@ def _gotitem(self, key, ndim, subset=None): except IndexError: groupby = self._groupby - self = self.__class__(subset, groupby=groupby, parent=self, **kwargs) + self = type(self)(subset, groupby=groupby, parent=self, **kwargs) self._reset_cache() if subset.ndim == 2: if is_scalar(key) and key in subset or is_list_like(key): diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index fcf52ecfcbbcd..399ed9ddc9ba1 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -8,7 +8,7 @@ ) -def recode_for_groupby(c, sort, observed): +def recode_for_groupby(c: Categorical, sort: bool, observed: bool): """ Code the categories to ensure we can groupby for categoricals. @@ -74,7 +74,7 @@ def recode_for_groupby(c, sort, observed): return c.reorder_categories(cat.categories), None -def recode_from_groupby(c, sort, ci): +def recode_from_groupby(c: Categorical, sort: bool, ci): """ Reverse the codes_to_groupby to account for sort / observed. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f4c3ac970a3ca..c49677fa27a31 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -5,30 +5,32 @@ These are user facing as the result of the ``df.groupby(...)`` operations, which here returns a DataFrameGroupBy object. """ -from collections import OrderedDict, abc, namedtuple +from collections import abc, defaultdict, namedtuple import copy -import functools from functools import partial from textwrap import dedent import typing from typing import ( + TYPE_CHECKING, Any, Callable, + Dict, FrozenSet, - Hashable, Iterable, - Optional, + List, + Mapping, Sequence, Tuple, Type, Union, + cast, ) import warnings import numpy as np from pandas._libs import Timestamp, lib -from pandas.compat import PY36 +from pandas._typing import FrameOrSeries from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import ( @@ -40,7 +42,6 @@ ensure_int64, ensure_platform_int, is_bool, - is_datetimelike, is_dict_like, is_integer_dtype, is_interval_dtype, @@ -48,13 +49,14 @@ is_numeric_dtype, is_object_dtype, is_scalar, + needs_i8_conversion, ) from pandas.core.dtypes.missing import _isna_ndarraylike, isna, notna -from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError import pandas.core.common as com +from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs from pandas.core.groupby import base @@ -62,15 +64,19 @@ GroupBy, _apply_docs, _transform_template, - groupby, + get_groupby, ) -from pandas.core.index import Index, MultiIndex, _all_indexes_same +from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.plotting import boxplot_frame_groupby +if TYPE_CHECKING: + from pandas.core.internals import Block + + NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) # TODO(typing) the return value on this callable should be any *scalar*. 
AggScalar = Union[str, Callable[..., Any]] @@ -143,8 +149,8 @@ def pinner(cls): class SeriesGroupBy(GroupBy): _apply_whitelist = base.series_apply_whitelist - def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: - yield self._selection_name, self._selected_obj + def _iterate_slices(self) -> Iterable[Series]: + yield self._selected_obj @property def _selection_name(self): @@ -226,17 +232,12 @@ def apply(self, func, *args, **kwargs): ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - _level = kwargs.pop("_level", None) relabeling = func is None columns = None no_arg_message = "Must provide 'func' or named aggregation **kwargs." if relabeling: columns = list(kwargs) - if not PY36: - # sort for 3.5 and earlier - columns = list(sorted(columns)) - func = [kwargs[col] for col in columns] kwargs = {} if not columns: @@ -245,11 +246,11 @@ def aggregate(self, func=None, *args, **kwargs): if isinstance(func, str): return getattr(self, func)(*args, **kwargs) - if isinstance(func, abc.Iterable): + elif isinstance(func, abc.Iterable): # Catch instances of lists / tuples # but not the class list / tuple itself. func = _maybe_mangle_lambdas(func) - ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) + ret = self._aggregate_multiple_funcs(func) if relabeling: ret.columns = columns else: @@ -262,24 +263,20 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) - except (AssertionError, TypeError): - raise - except (ValueError, KeyError, AttributeError, IndexError): - # TODO: IndexError can be removed here following GH#29106 - # TODO: AttributeError is caused by _index_data hijinx in - # libreduction, can be removed after GH#29160 + except (ValueError, KeyError): # TODO: KeyError is raised in _python_agg_general, # see see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) - ret = Series(result, index=index) + ret = create_series_with_explicit_dtype( + result, index=index, dtype_if_empty=object + ) if not self.as_index: # pragma: no cover print("Warning, ignoring as_index=True") - # _level handled at higher - if not _level and isinstance(ret, dict): + if isinstance(ret, dict): from pandas import concat ret = concat(ret, axis=1) @@ -287,23 +284,14 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - def _aggregate_multiple_funcs(self, arg, _level): + def _aggregate_multiple_funcs(self, arg): if isinstance(arg, dict): # show the deprecation, but only if we # have not shown a higher level one # GH 15931 - if isinstance(self._selected_obj, Series) and _level <= 1: - msg = dedent( - """\ - using a dict on a Series for aggregation - is deprecated and will be removed in a future version. Use \ - named aggregation instead. 
- - >>> grouper.agg(name_1=func_1, name_2=func_2) - """ - ) - warnings.warn(msg, FutureWarning, stacklevel=3) + if isinstance(self._selected_obj, Series): + raise SpecificationError("nested renamer is not supported") columns = list(arg.keys()) arg = arg.items() @@ -320,14 +308,9 @@ def _aggregate_multiple_funcs(self, arg, _level): arg = zip(columns, arg) - results = OrderedDict() + results = {} for name, func in arg: obj = self - if name in results: - raise SpecificationError( - "Function names must be unique, found multiple named " - "{}".format(name) - ) # reset the cache so that we # only include the named selection @@ -339,40 +322,102 @@ def _aggregate_multiple_funcs(self, arg, _level): if any(isinstance(x, DataFrame) for x in results.values()): # let higher level handle - if _level: - return results + return results return DataFrame(results, columns=columns) - def _wrap_series_output(self, output, index, names=None): - """ common agg/transform wrapping logic """ - output = output[self._selection_name] + def _wrap_series_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index + ) -> Union[Series, DataFrame]: + """ + Wraps the output of a SeriesGroupBy operation into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + index : pd.Index + Index to apply to the output. - if names is not None: - return DataFrame(output, index=index, columns=names) + Returns + ------- + Series or DataFrame + + Notes + ----- + In the vast majority of cases output and columns will only contain one + element. The exception is operations that expand dimensions, like ohlc. + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result: Union[Series, DataFrame] + if len(output) > 1: + result = DataFrame(indexed_output, index=index) + result.columns = columns else: - name = self._selection_name - if name is None: - name = self._selected_obj.name - return Series(output, index=index, name=name) + result = Series(indexed_output[0], index=index, name=columns[0]) + + return result + + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> Union[Series, DataFrame]: + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. - def _wrap_aggregated_output(self, output, names=None): + Returns + ------- + Series or DataFrame + + Notes + ----- + In the vast majority of cases output will only contain one element. + The exception is operations that expand dimensions, like ohlc. + """ result = self._wrap_series_output( - output=output, index=self.grouper.result_index, names=names + output=output, index=self.grouper.result_index ) return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output, names=None): - return self._wrap_series_output( - output=output, index=self.obj.index, names=names - ) + def _wrap_transformed_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> Series: + """ + Wraps the output of a SeriesGroupBy aggregation into the expected result. + + Parameters + ---------- + output : dict[base.OutputKey, Union[Series, np.ndarray]] + Dict with a sole key of 0 and a value of the result values. + + Returns + ------- + Series + + Notes + ----- + output should always contain one element. 
It is specified as a dict + for consistency with DataFrame methods and _wrap_aggregated_output. + """ + assert len(output) == 1 + result = self._wrap_series_output(output=output, index=self.obj.index) + + # No transformations increase the ndim of the result + assert isinstance(result, Series) + return result def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 - return Series([], name=self._selection_name, index=keys) + return Series([], name=self._selection_name, index=keys, dtype=np.float64) - def _get_index(): + def _get_index() -> Index: if self.grouper.nkeys > 1: index = MultiIndex.from_tuples(keys, names=self.grouper.names) else: @@ -400,14 +445,14 @@ def _get_index(): return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): - result = OrderedDict() + result = {} for name, group in self: group.name = name output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): raise ValueError("Must produce aggregated value") - result[name] = self._try_cast(output, group) + result[name] = output return result @@ -416,35 +461,39 @@ def _aggregate_named(self, func, *args, **kwargs): def transform(self, func, *args, **kwargs): func = self._get_cython_func(func) or func - if isinstance(func, str): - if not (func in base.transform_kernel_whitelist): - msg = "'{func}' is not a valid function name for transform(name)" - raise ValueError(msg.format(func=func)) - if func in base.cythonized_kernels: - # cythonized transform or canned "agg+broadcast" - return getattr(self, func)(*args, **kwargs) - else: - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. - return self._transform_fast( - lambda: getattr(self, func)(*args, **kwargs), func - ) + if not isinstance(func, str): + return self._transform_general(func, *args, **kwargs) + + elif func not in base.transform_kernel_whitelist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels: + # cythonized transform or canned "agg+broadcast" + return getattr(self, func)(*args, **kwargs) + + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. + result = getattr(self, func)(*args, **kwargs) + return self._transform_fast(result, func) + + def _transform_general(self, func, *args, **kwargs): + """ + Transform with a non-str `func`. 
+ """ + klass = type(self._selected_obj) - # reg transform - klass = self._selected_obj.__class__ results = [] - wrapper = lambda x: func(x, *args, **kwargs) for name, group in self: object.__setattr__(group, "name", name) - res = wrapper(group) + res = func(group, *args, **kwargs) if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values indexer = self._get_index(name) - s = klass(res, indexer) - results.append(s) + ser = klass(res, indexer) + results.append(ser) # check for empty "results" to avoid concat ValueError if results: @@ -452,10 +501,10 @@ def transform(self, func, *args, **kwargs): result = concat(results).sort_index() else: - result = Series() + result = Series(dtype=np.float64) # we will only try to coerce the result type if - # we have a numeric dtype, as these are *always* udfs + # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) dtype = self._selected_obj.dtype if is_numeric_dtype(dtype): @@ -465,17 +514,14 @@ def transform(self, func, *args, **kwargs): result.index = self._selected_obj.index return result - def _transform_fast(self, func, func_nm): + def _transform_fast(self, result, func_nm: str) -> Series: """ fast version of transform, only applicable to builtin/cythonizable functions """ - if isinstance(func, str): - func = getattr(self, func) - ids, _, ngroup = self.grouper.group_info cast = self._transform_should_cast(func_nm) - out = algorithms.take_1d(func()._values, ids) + out = algorithms.take_1d(result._values, ids) if cast: out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) @@ -515,7 +561,7 @@ def filter(self, func, dropna=True, *args, **kwargs): wrapper = lambda x: func(x, *args, **kwargs) # Interpret np.nan as False. - def true_and_notna(x, *args, **kwargs): + def true_and_notna(x, *args, **kwargs) -> bool: b = wrapper(x, *args, **kwargs) return b and notna(b) @@ -529,7 +575,7 @@ def true_and_notna(x, *args, **kwargs): filtered = self._apply_filter(indices, dropna) return filtered - def nunique(self, dropna=True): + def nunique(self, dropna: bool = True) -> Series: """ Return number of unique elements in the group. 
@@ -549,7 +595,7 @@ def nunique(self, dropna=True): try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes - msg = "val.dtype must be object, got {}".format(val.dtype) + msg = f"val.dtype must be object, got {val.dtype}" assert val.dtype == object, msg val, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((val, ids)) @@ -591,7 +637,8 @@ def nunique(self, dropna=True): res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - return Series(res, index=ri, name=self._selection_name) + result = Series(res, index=ri, name=self._selection_name) + return self._reindex_output(result, fill_value=0) @Appender(Series.describe.__doc__) def describe(self, **kwargs): @@ -657,16 +704,17 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] + codes = self.grouper.reconstructed_codes + codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] if dropna: - mask = labels[-1] != -1 + mask = codes[-1] != -1 if mask.all(): dropna = False else: - out, labels = out[mask], [label[mask] for label in labels] + out, codes = out[mask], [level_codes[mask] for level_codes in codes] if normalize: out = out.astype("float") @@ -682,11 +730,11 @@ def value_counts( if sort and bins is None: cat = ids[inc][mask] if dropna else ids[inc] sorter = np.lexsort((out if ascending else -out, cat)) - out, labels[-1] = out[sorter], labels[-1][sorter] + out, codes[-1] = out[sorter], codes[-1][sorter] if bins is None: mi = MultiIndex( - levels=levels, codes=labels, names=names, verify_integrity=False + levels=levels, codes=codes, names=names, verify_integrity=False ) if is_integer_dtype(out): @@ -696,14 +744,14 @@ def value_counts( # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros diff = np.zeros(len(out), dtype="bool") - for lab in labels[:-1]: - diff |= np.r_[True, lab[1:] != lab[:-1]] + for level_codes in codes[:-1]: + diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] ncat, nbin = diff.sum(), len(levels[-1]) left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] - right = [diff.cumsum() - 1, labels[-1]] + right = [diff.cumsum() - 1, codes[-1]] _, idx = _get_join_indexers(left, right, sort=False, how="left") out = np.where(idx != -1, out[idx], 0) @@ -713,7 +761,10 @@ def value_counts( out, left[-1] = out[sorter], left[-1][sorter] # build the multi-index w/ full levels - codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) + def build_codes(lev_codes: np.ndarray) -> np.ndarray: + return np.repeat(lev_codes[diff], nbin) + + codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] codes.append(left[-1]) mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) @@ -722,7 +773,7 @@ def value_counts( out = ensure_int64(out) return Series(out, index=mi, name=self._selection_name) - def count(self): + def count(self) -> Series: """ Compute count of group, excluding missing values. 
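The `self._reindex_output(result, fill_value=0)` added to `nunique` above (and to `count` in the next hunk) zero-fills groups that exist only in the grouper, which matters mainly for unobserved categorical groups. A hedged sketch of the intended behaviour, with invented data:

import pandas as pd

key = pd.Categorical(["a", "a"], categories=["a", "b"])
df = pd.DataFrame({"key": key, "val": [1, 2]})

# The unobserved category "b" should now come back as 0 rather than
# being dropped from the result.
df.groupby("key")["val"].nunique()   # a -> 2, b -> 0
df.groupby("key")["val"].count()     # a -> 2, b -> 0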
@@ -739,12 +790,13 @@ def count(self): minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - return Series( + result = Series( out, index=self.grouper.result_index, name=self._selection_name, dtype="int64", ) + return self._reindex_output(result, fill_value=0) def _apply_to_column_groupbys(self, func): """ return a pass thru """ @@ -759,8 +811,11 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): periods=periods, fill_method=fill_method, limit=limit, freq=freq ) ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 filled = getattr(self, fill_method)(limit=limit) - fill_grp = filled.groupby(self.grouper.labels) + fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 @@ -771,8 +826,6 @@ class DataFrameGroupBy(GroupBy): _apply_whitelist = base.dataframe_apply_whitelist - _block_agg_axis = 1 - _agg_see_also_doc = dedent( """ See Also @@ -864,20 +917,27 @@ class DataFrameGroupBy(GroupBy): ) @Appender(_shared_docs["aggregate"]) def aggregate(self, func=None, *args, **kwargs): - _level = kwargs.pop("_level", None) relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: func, columns, order = _normalize_keyword_aggregation(kwargs) kwargs = {} + elif isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column " + "names assigned" + ) elif func is None: # nicer error message raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") func = _maybe_mangle_lambdas(func) - result, how = self._aggregate(func, _level=_level, *args, **kwargs) + result, how = self._aggregate(func, *args, **kwargs) if how is None: return result @@ -888,31 +948,21 @@ def aggregate(self, func=None, *args, **kwargs): return self._python_agg_general(func, *args, **kwargs) elif args or kwargs: result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + result = self._aggregate_frame(func) + else: # try to treat as if we are passing a list try: - result = self._aggregate_multiple_funcs( - [func], _level=_level, _axis=self.axis - ) + result = self._aggregate_multiple_funcs([func], _axis=self.axis) except ValueError as err: if "no results" not in str(err): # raised directly by _aggregate_multiple_funcs raise result = self._aggregate_frame(func) - except NotImplementedError as err: - if "axis other than 0 is not supported" in str(err): - # raised directly by _aggregate_multiple_funcs - pass - elif "decimal does not support skipna=True" in str(err): - # FIXME: kludge for DecimalArray tests - pass - else: - raise - # FIXME: this is raised in a bunch of - # test_whitelist.test_regression_whitelist_methods tests, - # can be avoided - result = self._aggregate_frame(func) else: result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name @@ -932,41 +982,43 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def _iterate_slices(self) -> Iterable[Series]: obj = self._selected_obj if self.axis == 1: obj = obj.T if isinstance(obj, Series) and obj.name not in self.exclusions: # Occurs when doing DataFrameGroupBy(...)["X"] - yield obj.name, obj + yield obj else: for label, values in 
obj.items(): if label in self.exclusions: continue - yield label, values + yield values - def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): - new_items, new_blocks = self._cython_agg_blocks( + def _cython_agg_general( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ) -> DataFrame: + agg_blocks, agg_items = self._cython_agg_blocks( how, alt=alt, numeric_only=numeric_only, min_count=min_count ) - return self._wrap_agged_blocks(new_items, new_blocks) + return self._wrap_agged_blocks(agg_blocks, items=agg_items) - _block_agg_axis = 0 - - def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): + def _cython_agg_blocks( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ) -> "Tuple[List[Block], Index]": # TODO: the actual managing of mgr_locs is a PITA # here, it should happen via BlockManager.combine - data, agg_axis = self._get_data_to_aggregate() + data: BlockManager = self._get_data_to_aggregate() if numeric_only: data = data.get_numeric_data(copy=False) - new_blocks = [] - new_items = [] - deleted_items = [] + agg_blocks: List[Block] = [] + new_items: List[np.ndarray] = [] + deleted_items: List[np.ndarray] = [] no_result = object() for block in data.blocks: # Avoid inheriting result from earlier in the loop @@ -974,7 +1026,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): locs = block.mgr_locs.as_array try: result, _ = self.grouper.aggregate( - block.values, how, axis=agg_axis, min_count=min_count + block.values, how, axis=1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False @@ -995,7 +1047,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # reductions; see GH#28949 obj = obj.iloc[:, 0] - s = groupby(obj, self.grouper) + s = get_groupby(obj, self.grouper) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) except TypeError: @@ -1003,12 +1055,13 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # continue and exclude the block deleted_items.append(locs) continue - - # unwrap DataFrame to get array - assert len(result._data.blocks) == 1 - result = result._data.blocks[0].values - if result.ndim == 1 and isinstance(result, np.ndarray): - result = result.reshape(1, -1) + else: + result = cast(DataFrame, result) + # unwrap DataFrame to get array + assert len(result._data.blocks) == 1 + result = result._data.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) finally: assert not isinstance(result, DataFrame) @@ -1031,20 +1084,20 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - newb = block.make_block(result) + agg_block: Block = block.make_block(result) new_items.append(locs) - new_blocks.append(newb) + agg_blocks.append(agg_block) - if len(new_blocks) == 0: + if not agg_blocks: raise DataError("No numeric types to aggregate") # reset the locs in the blocks to correspond to our # current ordering indexer = np.concatenate(new_items) - new_items = data.items.take(np.sort(indexer)) + agg_items = data.items.take(np.sort(indexer)) - if len(deleted_items): + if deleted_items: # we need to adjust the indexer to account for the # items we have removed @@ -1057,40 +1110,39 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): indexer = (ai - mask.cumsum())[indexer] 
offset = 0 - for b in new_blocks: - loc = len(b.mgr_locs) - b.mgr_locs = indexer[offset : (offset + loc)] + for blk in agg_blocks: + loc = len(blk.mgr_locs) + blk.mgr_locs = indexer[offset : (offset + loc)] offset += loc - return new_items, new_blocks + return agg_blocks, agg_items - def _aggregate_frame(self, func, *args, **kwargs): + def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: raise AssertionError("Number of keys must be 1") axis = self.axis obj = self._obj_with_exclusions - result = OrderedDict() + result: Dict[Union[int, str], Union[NDFrame, np.ndarray]] = {} if axis != obj._info_axis_number: for name, data in self: fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) + result[name] = fres else: for name in self.indices: data = self.get_group(name, obj=obj) fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) + result[name] = fres return self._wrap_frame_output(result, obj) - def _aggregate_item_by_item(self, func, *args, **kwargs): + def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: # only for axis==0 obj = self._obj_with_exclusions - result = OrderedDict() + result: Dict[Union[int, str], NDFrame] = {} cannot_agg = [] - errors = None for item in obj: data = obj[item] colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) @@ -1116,27 +1168,8 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): if cannot_agg: result_columns = result_columns.drop(cannot_agg) - # GH6337 - if not len(result_columns) and errors is not None: - raise errors - return DataFrame(result, columns=result_columns) - def _decide_output_index(self, output, labels): - if len(output) == len(labels): - output_keys = labels - else: - output_keys = sorted(output) - try: - output_keys.sort() - except TypeError: - pass - - if isinstance(labels, MultiIndex): - output_keys = MultiIndex.from_tuples(output_keys, names=labels.names) - - return output_keys - def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return DataFrame(index=keys) @@ -1186,17 +1219,25 @@ def first_not_none(values): if v is None: return DataFrame() elif isinstance(v, NDFrame): - values = [ - x if x is not None else v._constructor(**v._construct_axes_dict()) - for x in values - ] + + # this is to silence a DeprecationWarning + # TODO: Remove when default dtype of empty Series is object + kwargs = v._construct_axes_dict() + if v._constructor is Series: + backup = create_series_with_explicit_dtype( + **kwargs, dtype_if_empty=object + ) + else: + backup = v._constructor(**kwargs) + + values = [x if (x is not None) else backup for x in values] v = values[0] if isinstance(v, (np.ndarray, Index, Series)): if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = _all_indexes_same([x.index for x in values]) + all_indexed_same = all_indexes_same([x.index for x in values]) singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 @@ -1228,67 +1269,58 @@ def first_not_none(values): # GH 8467 return self._concat_objects(keys, values, not_indexed_same=True) - try: - if self.axis == 0 and isinstance(v, ABCSeries): - # GH6124 if the list of Series have a consistent name, - # then propagate that name to the result. - index = v.index.copy() - if index.name is None: - # Only propagate the series name to the result - # if all series have a consistent name. If the - # series do not have a consistent name, do - # nothing. 
- names = {v.name for v in values} - if len(names) == 1: - index.name = list(names)[0] - - # normally use vstack as its faster than concat - # and if we have mi-columns - if ( - isinstance(v.index, MultiIndex) - or key_index is None - or isinstance(key_index, MultiIndex) - ): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = DataFrame( - stacked_values, index=key_index, columns=index - ) - else: - # GH5788 instead of stacking; concat gets the - # dtypes correct - from pandas.core.reshape.concat import concat - - result = concat( - values, - keys=key_index, - names=key_index.names, - axis=self.axis, - ).unstack() - result.columns = index - elif isinstance(v, ABCSeries): + if self.axis == 0 and isinstance(v, ABCSeries): + # GH6124 if the list of Series have a consistent name, + # then propagate that name to the result. + index = v.index.copy() + if index.name is None: + # Only propagate the series name to the result + # if all series have a consistent name. If the + # series do not have a consistent name, do + # nothing. + names = {v.name for v in values} + if len(names) == 1: + index.name = list(names)[0] + + # normally use vstack as its faster than concat + # and if we have mi-columns + if ( + isinstance(v.index, MultiIndex) + or key_index is None + or isinstance(key_index, MultiIndex) + ): stacked_values = np.vstack([np.asarray(v) for v in values]) result = DataFrame( - stacked_values.T, index=v.index, columns=key_index + stacked_values, index=key_index, columns=index ) else: - # GH#1738: values is list of arrays of unequal lengths - # fall through to the outer else clause - # TODO: sure this is right? we used to do this - # after raising AttributeError above - return Series( - values, index=key_index, name=self._selection_name - ) - - except ValueError: - # TODO: not reached in tests; is this still needed? - # GH1738: values is list of arrays of unequal lengths fall - # through to the outer else clause + # GH5788 instead of stacking; concat gets the + # dtypes correct + from pandas.core.reshape.concat import concat + + result = concat( + values, + keys=key_index, + names=key_index.names, + axis=self.axis, + ).unstack() + result.columns = index + elif isinstance(v, ABCSeries): + stacked_values = np.vstack([np.asarray(v) for v in values]) + result = DataFrame( + stacked_values.T, index=v.index, columns=key_index + ) + else: + # GH#1738: values is list of arrays of unequal lengths + # fall through to the outer else clause + # TODO: sure this is right? 
we used to do this + # after raising AttributeError above return Series(values, index=key_index, name=self._selection_name) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here so = self._selected_obj - if so.ndim == 2 and so.dtypes.apply(is_datetimelike).any(): + if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): result = _recast_datetimelike_result(result) else: result = result._convert(datetime=True) @@ -1298,12 +1330,12 @@ def first_not_none(values): # values are not series or array-like but scalars else: # only coerce dates if we find at least 1 datetime - coerce = any(isinstance(x, Timestamp) for x in values) + should_coerce = any(isinstance(x, Timestamp) for x in values) # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns return Series(values, index=key_index)._convert( - datetime=True, coerce=coerce + datetime=True, coerce=should_coerce ) else: @@ -1371,21 +1403,21 @@ def transform(self, func, *args, **kwargs): # optimized transforms func = self._get_cython_func(func) or func - if isinstance(func, str): - if not (func in base.transform_kernel_whitelist): - msg = "'{func}' is not a valid function name for transform(name)" - raise ValueError(msg.format(func=func)) - if func in base.cythonized_kernels: - # cythonized transformation or canned "reduction+broadcast" - return getattr(self, func)(*args, **kwargs) - else: - # If func is a reduction, we need to broadcast the - # result to the whole group. Compute func result - # and deal with possible broadcasting below. - result = getattr(self, func)(*args, **kwargs) - else: + if not isinstance(func, str): return self._transform_general(func, *args, **kwargs) + elif func not in base.transform_kernel_whitelist: + msg = f"'{func}' is not a valid function name for transform(name)" + raise ValueError(msg) + elif func in base.cythonized_kernels: + # cythonized transformation or canned "reduction+broadcast" + return getattr(self, func)(*args, **kwargs) + + # If func is a reduction, we need to broadcast the + # result to the whole group. Compute func result + # and deal with possible broadcasting below. 
+ result = getattr(self, func)(*args, **kwargs) + # a reduction transform if not isinstance(result, DataFrame): return self._transform_general(func, *args, **kwargs) @@ -1396,9 +1428,9 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) - return self._transform_fast(result, obj, func) + return self._transform_fast(result, func) - def _transform_fast(self, result, obj, func_nm): + def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: """ Fast transform path for aggregations """ @@ -1406,12 +1438,16 @@ def _transform_fast(self, result, obj, func_nm): # try casting data to original dtype cast = self._transform_should_cast(func_nm) + obj = self._obj_with_exclusions + # for each col, reshape to to size of original frame # by take operation ids, _, ngroup = self.grouper.group_info output = [] for i, _ in enumerate(result.columns): res = algorithms.take_1d(result.iloc[:, i].values, ids) + # TODO: we have no test cases that get here with EA dtypes; + # try_cast may not be needed if EAs never get here if cast: res = self._try_cast(res, obj.iloc[:, i]) output.append(res) @@ -1431,7 +1467,7 @@ def _define_paths(self, func, *args, **kwargs): ) return fast_path, slow_path - def _choose_path(self, fast_path, slow_path, group): + def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame): path = slow_path res = slow_path(group) @@ -1441,8 +1477,8 @@ def _choose_path(self, fast_path, slow_path, group): except AssertionError: raise except Exception: - # Hard to know ex-ante what exceptions `fast_path` might raise - # TODO: no test cases get here + # GH#29631 For user-defined function, we cant predict what may be + # raised; see test_transform.test_transform_fastpath_raises return path, res # verify fast path does not change columns (and names), otherwise @@ -1458,15 +1494,13 @@ def _choose_path(self, fast_path, slow_path, group): return path, res - def _transform_item_by_item(self, obj, wrapper): + def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: # iterate through columns output = {} inds = [] for i, col in enumerate(obj): try: output[col] = self[col].transform(wrapper) - except AssertionError: - raise except TypeError: # e.g. 
trying to call nanmean with string values pass @@ -1539,13 +1573,26 @@ def filter(self, func, dropna=True, *args, **kwargs): else: # non scalars aren't allowed raise TypeError( - "filter function returned a %s, " - "but expected a scalar bool" % type(res).__name__ + f"filter function returned a {type(res).__name__}, " + "but expected a scalar bool" ) return self._apply_filter(indices, dropna) - def _gotitem(self, key, ndim, subset=None): + def __getitem__(self, key): + # per GH 23566 + if isinstance(key, tuple) and len(key) > 1: + # if len == 1, then it becomes a SeriesGroupBy and this is actually + # valid syntax, so don't raise warning + warnings.warn( + "Indexing with multiple keys (implicitly converted to a tuple " + "of keys) will be deprecated, use a list instead.", + FutureWarning, + stacklevel=2, + ) + return super().__getitem__(key) + + def _gotitem(self, key, ndim: int, subset=None): """ sub-classes to define return a sliced object @@ -1580,7 +1627,7 @@ def _gotitem(self, key, ndim, subset=None): raise AssertionError("invalid ndim for _gotitem") - def _wrap_frame_output(self, result, obj): + def _wrap_frame_output(self, result, obj) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: @@ -1588,12 +1635,12 @@ def _wrap_frame_output(self, result, obj): else: return DataFrame(result, index=obj.index, columns=result_index) - def _get_data_to_aggregate(self): + def _get_data_to_aggregate(self) -> BlockManager: obj = self._obj_with_exclusions if self.axis == 1: - return obj.T._data, 1 + return obj.T._data else: - return obj._data, 1 + return obj._data def _insert_inaxis_grouper_inplace(self, result): # zip in reverse so we can always insert at loc 0 @@ -1612,39 +1659,74 @@ def _insert_inaxis_grouper_inplace(self, result): if in_axis: result.insert(0, name, lev) - def _wrap_aggregated_output(self, output, names=None): - agg_axis = 0 if self.axis == 1 else 1 - agg_labels = self._obj_with_exclusions._get_axis(agg_axis) + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy aggregations into the expected result. - output_keys = self._decide_output_index(output, agg_labels) + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. + + Returns + ------- + DataFrame + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result = DataFrame(indexed_output) + result.columns = columns if not self.as_index: - result = DataFrame(output, columns=output_keys) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index - result = DataFrame(output, index=index, columns=output_keys) + result.index = index if self.axis == 1: result = result.T return self._reindex_output(result)._convert(datetime=True) - def _wrap_transformed_output(self, output, names=None): - return DataFrame(output, index=self.obj.index) + def _wrap_transformed_output( + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + ) -> DataFrame: + """ + Wraps the output of DataFrameGroupBy transformations into the expected result. + + Parameters + ---------- + output : Mapping[base.OutputKey, Union[Series, np.ndarray]] + Data to wrap. 
+ + Returns + ------- + DataFrame + """ + indexed_output = {key.position: val for key, val in output.items()} + columns = Index(key.label for key in output) + + result = DataFrame(indexed_output) + result.columns = columns + result.index = self.obj.index + + return result - def _wrap_agged_blocks(self, items, blocks): + def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) - mgr = BlockManager(blocks, [items, index]) + mgr = BlockManager(blocks, axes=[items, index]) result = DataFrame(mgr) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index - mgr = BlockManager(blocks, [items, index]) + mgr = BlockManager(blocks, axes=[items, index]) result = DataFrame(mgr) if self.axis == 1: @@ -1679,22 +1761,24 @@ def count(self): DataFrame Count of values within each group. """ - data, _ = self._get_data_to_aggregate() + data = self._get_data_to_aggregate() ids, _, ngroups = self.grouper.group_info mask = ids != -1 - val = ( + vals = ( (mask & ~_isna_ndarraylike(np.atleast_2d(blk.get_values()))) for blk in data.blocks ) - loc = (blk.mgr_locs for blk in data.blocks) + locs = (blk.mgr_locs for blk in data.blocks) - counter = partial(lib.count_level_2d, labels=ids, max_bin=ngroups, axis=1) - blk = map(make_block, map(counter, val), loc) + counted = ( + lib.count_level_2d(x, labels=ids, max_bin=ngroups, axis=1) for x in vals + ) + blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] - return self._wrap_agged_blocks(data.items, list(blk)) + return self._wrap_agged_blocks(blocks, items=data.items) - def nunique(self, dropna=True): + def nunique(self, dropna: bool = True): """ Return DataFrame with number of distinct observations per group for each column. @@ -1750,11 +1834,24 @@ def groupby_series(obj, col=None): if isinstance(obj, Series): results = groupby_series(obj) else: + # TODO: this is duplicative of how GroupBy naturally works + # Try to consolidate with normal wrapping functions from pandas.core.reshape.concat import concat - results = [groupby_series(obj[col], col) for col in obj.columns] + axis_number = obj._get_axis_number(self.axis) + other_axis = int(not axis_number) + if axis_number == 0: + iter_func = obj.items + else: + iter_func = obj.iterrows + + results = [groupby_series(content, label) for label, content in iter_func()] results = concat(results, axis=1) - results.columns.names = obj.columns.names + + if axis_number == 1: + results = results.T + + results._get_axis(other_axis).names = obj._get_axis(other_axis).names if not self.as_index: results.index = ibase.default_index(len(results)) @@ -1763,7 +1860,7 @@ def groupby_series(obj, col=None): boxplot = boxplot_frame_groupby -def _is_multi_agg_with_relabel(**kwargs): +def _is_multi_agg_with_relabel(**kwargs) -> bool: """ Check whether kwargs passed to .agg look like multi-agg with relabeling. @@ -1785,15 +1882,17 @@ def _is_multi_agg_with_relabel(**kwargs): >>> _is_multi_agg_with_relabel() False """ - return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and kwargs + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and ( + len(kwargs) > 0 + ) def _normalize_keyword_aggregation(kwargs): """ Normalize user-provided "named aggregation" kwargs. - Transforms from the new ``Dict[str, NamedAgg]`` style kwargs - to the old OrderedDict[str, List[scalar]]]. 
+    Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs
+    to the old Dict[str, List[scalar]]].

     Parameters
     ----------
@@ -1811,27 +1910,19 @@
     Examples
     --------
     >>> _normalize_keyword_aggregation({'output': ('input', 'sum')})
-    (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')])
+    ({'input': ['sum']}, ('output',), [('input', 'sum')])
     """
-    if not PY36:
-        kwargs = OrderedDict(sorted(kwargs.items()))
-
-    # Normalize the aggregation functions as Dict[column, List[func]],
+    # Normalize the aggregation functions as Mapping[column, List[func]],
     # process normally, then fixup the names.
-    # TODO(Py35): When we drop python 3.5, change this to
-    # defaultdict(list)
-    # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]]
+    # TODO: aggspec type: typing.Dict[str, List[AggScalar]]
     # May be hitting https://github.com/python/mypy/issues/5958
     # saying it doesn't have an attribute __name__
-    aggspec = OrderedDict()
+    aggspec = defaultdict(list)
     order = []
     columns, pairs = list(zip(*kwargs.items()))

     for name, (column, aggfunc) in zip(columns, pairs):
-        if column in aggspec:
-            aggspec[column].append(aggfunc)
-        else:
-            aggspec[column] = [aggfunc]
+        aggspec[column].append(aggfunc)
         order.append((column, com.get_callable_name(aggfunc) or aggfunc))

     # uniquify aggfunc name if duplicated in order list
@@ -1900,8 +1991,8 @@ def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
     mangled_aggfuncs = []
     for aggfunc in aggfuncs:
         if com.get_callable_name(aggfunc) == "<lambda>":
-            aggfunc = functools.partial(aggfunc)
-            aggfunc.__name__ = "<lambda_{}>".format(i)
+            aggfunc = partial(aggfunc)
+            aggfunc.__name__ = f"<lambda_{i}>"
             i += 1
         mangled_aggfuncs.append(aggfunc)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index e53e7ffdbf72f..233bdd11b372b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -4,17 +4,28 @@ class providing the base-class of operations.
 The SeriesGroupBy and DataFrameGroupBy sub-class
 (defined in pandas.core.groupby.generic)
-expose these user-facing objects to provide specific functionailty.
+expose these user-facing objects to provide specific functionality.
 """
-import collections
 from contextlib import contextmanager
 import datetime
 from functools import partial, wraps
 import inspect
 import re
 import types
-from typing import FrozenSet, Hashable, Iterable, List, Optional, Tuple, Type, Union
+from typing import (
+    Callable,
+    Dict,
+    FrozenSet,
+    Hashable,
+    Iterable,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+)

 import numpy as np

@@ -22,17 +33,16 @@ class providing the base-class of operations.
 from pandas._libs import Timestamp
 import pandas._libs.groupby as libgroupby
+from pandas._typing import FrameOrSeries, Scalar
 from pandas.compat import set_function_name
 from pandas.compat.numpy import function as nv
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import Appender, Substitution, cache_readonly
-from pandas.util._validators import validate_kwargs

 from pandas.core.dtypes.cast import maybe_downcast_to_dtype
 from pandas.core.dtypes.common import (
     ensure_float,
     is_datetime64_dtype,
-    is_datetime64tz_dtype,
     is_extension_array_dtype,
     is_integer_dtype,
     is_numeric_dtype,
@@ -43,14 +53,13 @@ class providing the base-class of operations.
from pandas.core import nanops import pandas.core.algorithms as algorithms -from pandas.core.arrays import Categorical +from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.construction import extract_array from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base -from pandas.core.index import CategoricalIndex, Index, MultiIndex +from pandas.core.groupby import base, ops +from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter @@ -227,7 +236,7 @@ class providing the base-class of operations. Notes ----- See more `here -`_ +`_ Examples -------- @@ -316,7 +325,7 @@ def f(self): f.__name__ = "plot" return self._groupby.apply(f) - def __getattr__(self, name): + def __getattr__(self, name: str): def attr(*args, **kwargs): def f(self): return getattr(self.plot, name)(*args, **kwargs) @@ -336,25 +345,34 @@ def _group_selection_context(groupby): groupby._reset_group_selection() +_KeysArgType = Union[ + Hashable, + List[Hashable], + Callable[[Hashable], Hashable], + List[Callable[[Hashable], Hashable]], + Mapping[Hashable, Hashable], +] + + class _GroupBy(PandasObject, SelectionMixin): _group_selection = None - _apply_whitelist = frozenset() # type: FrozenSet[str] + _apply_whitelist: FrozenSet[str] = frozenset() def __init__( self, obj: NDFrame, - keys=None, - axis=0, + keys: Optional[_KeysArgType] = None, + axis: int = 0, level=None, - grouper=None, + grouper: "Optional[ops.BaseGrouper]" = None, exclusions=None, selection=None, - as_index=True, - sort=True, - group_keys=True, - squeeze=False, - observed=False, - **kwargs + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + mutated: bool = False, ): self._selection = selection @@ -376,12 +394,12 @@ def __init__( self.group_keys = group_keys self.squeeze = squeeze self.observed = observed - self.mutated = kwargs.pop("mutated", False) + self.mutated = mutated if grouper is None: - from pandas.core.groupby.grouper import _get_grouper + from pandas.core.groupby.grouper import get_grouper - grouper, exclusions, obj = _get_grouper( + grouper, exclusions, obj = get_grouper( obj, keys, axis=axis, @@ -396,13 +414,10 @@ def __init__( self.grouper = grouper self.exclusions = set(exclusions) if exclusions else set() - # we accept no other args - validate_kwargs("group", kwargs, {}) - - def __len__(self): + def __len__(self) -> int: return len(self.groups) - def __repr__(self): + def __repr__(self) -> str: # TODO: Better repr for GroupBy object return object.__repr__(self) @@ -443,7 +458,7 @@ def _get_indices(self, names): def get_converter(s): # possibly convert to the actual key types # in the indices, could be a Timestamp or a np.datetime64 - if isinstance(s, (Timestamp, datetime.datetime)): + if isinstance(s, datetime.datetime): return lambda key: Timestamp(key) elif isinstance(s, np.datetime64): return lambda key: Timestamp(key).asm8 @@ -470,8 +485,8 @@ def get_converter(s): except KeyError: # turns out it wasn't a tuple msg = ( - "must supply a same-length tuple to get_group" - " with multiple grouping keys" + "must supply a same-length tuple to get_group " + "with multiple grouping keys" ) raise ValueError(msg) @@ -492,6 +507,7 @@ def _get_index(self, name): @cache_readonly def 
_selected_obj(self): + # Note: _selected_obj is always just `self.obj` for SeriesGroupBy if self._selection is None or isinstance(self.obj, Series): if self._group_selection is not None: @@ -554,14 +570,14 @@ def _set_result_index_ordered(self, result): def _dir_additions(self): return self.obj._dir_additions() | self._apply_whitelist - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self.obj: return self[attr] raise AttributeError( - "%r object has no attribute %r" % (type(self).__name__, attr) + f"'{type(self).__name__}' object has no attribute '{attr}'" ) @Substitution( @@ -634,24 +650,14 @@ def curried(x): # TODO: is the above comment accurate? raise - # related to : GH3688 - # try item-by-item - # this can be called recursively, so need to raise - # ValueError - # if we don't have this method to indicated to aggregate to - # mark this column as an error - try: - result = self._aggregate_item_by_item(name, *args, **kwargs) - assert self.obj.ndim == 2 - return result - except AttributeError: - # e.g. SparseArray has no flags attr - # FIXME: 'SeriesGroupBy' has no attribute '_aggregate_item_by_item' - # occurs in idxmax() case - # in tests.groupby.test_function.test_non_cython_api - assert self.obj.ndim == 1 + if self.obj.ndim == 1: + # this can be called recursively, so need to raise ValueError raise ValueError + # GH#3688 try to operate item-by-item + result = self._aggregate_item_by_item(name, *args, **kwargs) + return result + wrapper.__name__ = name return wrapper @@ -748,13 +754,13 @@ def _python_apply_general(self, f): keys, values, not_indexed_same=mutated or self.mutated ) - def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: + def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): raise AbstractMethodError(self) - def _cumcount_array(self, ascending=True): + def _cumcount_array(self, ascending: bool = True): """ Parameters ---------- @@ -786,7 +792,7 @@ def _cumcount_array(self, ascending=True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _try_cast(self, result, obj, numeric_only=False): + def _try_cast(self, result, obj, numeric_only: bool = False): """ Try to cast the result to our obj original type, we may have roundtripped through object in the mean-time. @@ -801,38 +807,21 @@ def _try_cast(self, result, obj, numeric_only=False): dtype = obj.dtype if not is_scalar(result): - if is_datetime64tz_dtype(dtype): - # GH 23683 - # Prior results _may_ have been generated in UTC. - # Ensure we localize to UTC first before converting - # to the target timezone - arr = extract_array(obj) - try: - result = arr._from_sequence(result, dtype="datetime64[ns, UTC]") - result = result.astype(dtype) - except TypeError: - # _try_cast was called at a point where the result - # was already tz-aware - pass - elif is_extension_array_dtype(dtype): + if is_extension_array_dtype(dtype) and dtype.kind != "M": # The function can return something of any type, so check - # if the type is compatible with the calling EA. + # if the type is compatible with the calling EA. + # datetime64tz is handled correctly in agg_series, + # so is excluded here. 
# return the same type (Series) as our caller - try: - result = obj._values._from_sequence(result, dtype=dtype) - except Exception: - # https://github.com/pandas-dev/pandas/issues/22850 - # pandas has no control over what 3rd-party ExtensionArrays - # do in _values_from_sequence. We still want ops to work - # though, so we catch any regular Exception. - pass + cls = dtype.construct_array_type() + result = try_cast_to_ea(cls, result, dtype=dtype) elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) return result - def _transform_should_cast(self, func_nm): + def _transform_should_cast(self, func_nm: str) -> bool: """ Parameters ---------- @@ -848,64 +837,105 @@ def _transform_should_cast(self, func_nm): func_nm not in base.cython_cast_blacklist ) - def _cython_transform(self, how, numeric_only=True, **kwargs): - output = collections.OrderedDict() - for name, obj in self._iterate_slices(): + def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): + output: Dict[base.OutputKey, np.ndarray] = {} + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue try: - result, names = self.grouper.transform(obj.values, how, **kwargs) + result, _ = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue + if self._transform_should_cast(how): - output[name] = self._try_cast(result, obj) - else: - output[name] = result + result = self._try_cast(result, obj) + + key = base.OutputKey(label=name, position=idx) + output[key] = result if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_transformed_output(output, names) + return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output, names=None): + def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_transformed_output(self, output, names=None): + def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) - def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): - output = {} - for name, obj in self._iterate_slices(): + def _cython_agg_general( + self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + ): + output: Dict[base.OutputKey, Union[np.ndarray, DatetimeArray]] = {} + # Ideally we would be able to enumerate self._iterate_slices and use + # the index from enumeration as the key of output, but ohlc in particular + # returns a (n x 4) array. Output requires 1D ndarrays as values, so we + # need to slice that up into 1D arrays + idx = 0 + for obj in self._iterate_slices(): + name = obj.name is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue - result, names = self.grouper.aggregate(obj.values, how, min_count=min_count) - output[name] = self._try_cast(result, obj) + result, agg_names = self.grouper.aggregate( + obj._values, how, min_count=min_count + ) + + if agg_names: + # e.g. 
ohlc + assert len(agg_names) == result.shape[1] + for result_column, result_name in zip(result.T, agg_names): + key = base.OutputKey(label=result_name, position=idx) + output[key] = self._try_cast(result_column, obj) + idx += 1 + else: + assert result.ndim == 1 + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj) + idx += 1 if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output, names) + return self._wrap_aggregated_output(output) def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict - output = {} - for name, obj in self._iterate_slices(): + output: Dict[base.OutputKey, np.ndarray] = {} + + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name + if self.grouper.ngroups == 0: + # agg_series below assumes ngroups > 0 + continue + try: - result, counts = self.grouper.agg_series(obj, f) + # if this function is invalid for this dtype, we will ignore it. + func(obj[:0]) except TypeError: continue - else: - output[name] = self._try_cast(result, obj, numeric_only=True) + except AssertionError: + raise + except Exception: + # Our function depends on having a non-empty argument + # See test_groupby_agg_err_catching + pass + + result, counts = self.grouper.agg_series(obj, f) + assert result is not None + key = base.OutputKey(label=name, position=idx) + output[key] = self._try_cast(result, obj, numeric_only=True) if len(output) == 0: return self._python_apply_general(f) @@ -913,18 +943,18 @@ def _python_agg_general(self, func, *args, **kwargs): if self.grouper._filter_empty_groups: mask = counts.ravel() > 0 - for name, result in output.items(): + for key, result in output.items(): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[name] = self._try_cast(values[mask], result) + output[key] = self._try_cast(values[mask], result) return self._wrap_aggregated_output(output) - def _concat_objects(self, keys, values, not_indexed_same=False): + def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat def reset_identity(values): @@ -984,10 +1014,7 @@ def reset_identity(values): values = reset_identity(values) result = concat(values, axis=self.axis) - if ( - isinstance(result, Series) - and getattr(self, "_selection_name", None) is not None - ): + if isinstance(result, Series) and self._selection_name is not None: result.name = self._selection_name @@ -1096,9 +1123,8 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: return self._get_cythonized_result( "group_any_all", - self.grouper, aggregate=True, - cython_dtype=np.uint8, + cython_dtype=np.dtype(np.uint8), needs_values=True, needs_mask=True, pre_processing=objs_to_bool, @@ -1109,7 +1135,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: @Substitution(name="groupby") @Appender(_common_see_also) - def any(self, skipna=True): + def any(self, skipna: bool = True): """ Return True if any value in the group is truthful, else False. @@ -1126,7 +1152,7 @@ def any(self, skipna=True): @Substitution(name="groupby") @Appender(_common_see_also) - def all(self, skipna=True): + def all(self, skipna: bool = True): """ Return True if all values in the group are truthful, else False. 
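The `agg_names` branch above exists because `ohlc` is the one cython aggregation that widens the output: a single input column yields four result columns, so each column of the `(ngroups x 4)` block needs its own `OutputKey`. A small example of that expansion, with invented values:

import pandas as pd

s = pd.Series([1, 3, 2, 10, 8], index=[0, 0, 0, 1, 1])

# One input column expands to four output columns per group.
s.groupby(level=0).ohlc()
#    open  high  low  close
# 0     1     3    1      2
# 1    10    10   8      8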
@@ -1221,12 +1247,12 @@ def median(self, **kwargs): return self._cython_agg_general( "median", alt=lambda x, axis: Series(x).median(axis=axis, **kwargs), - **kwargs + **kwargs, ) @Substitution(name="groupby") @Appender(_common_see_also) - def std(self, ddof=1, *args, **kwargs): + def std(self, ddof: int = 1, *args, **kwargs): """ Compute standard deviation of groups, excluding missing values. @@ -1249,7 +1275,7 @@ def std(self, ddof=1, *args, **kwargs): @Substitution(name="groupby") @Appender(_common_see_also) - def var(self, ddof=1, *args, **kwargs): + def var(self, ddof: int = 1, *args, **kwargs): """ Compute variance of groups, excluding missing values. @@ -1277,7 +1303,7 @@ def var(self, ddof=1, *args, **kwargs): @Substitution(name="groupby") @Appender(_common_see_also) - def sem(self, ddof=1): + def sem(self, ddof: int = 1): """ Compute standard error of the mean of groups, excluding missing values. @@ -1309,8 +1335,8 @@ def size(self): result = self.grouper.size() if isinstance(self.obj, Series): - result.name = getattr(self.obj, "name", None) - return result + result.name = self.obj.name + return self._reindex_output(result, fill_value=0) @classmethod def _add_numeric_operations(cls): @@ -1318,7 +1344,13 @@ def _add_numeric_operations(cls): Add numeric operations to the GroupBy generically. """ - def groupby_function(name, alias, npfunc, numeric_only=True, min_count=-1): + def groupby_function( + name: str, + alias: str, + npfunc, + numeric_only: bool = True, + min_count: int = -1, + ): _local_template = """ Compute %(f)s of group values. @@ -1350,22 +1382,11 @@ def f(self, **kwargs): # raised in _get_cython_function, in some cases can # be trimmed by implementing cython funcs for more dtypes pass - elif "decimal does not support skipna=True" in str(err): - # FIXME: kludge for test_decimal:test_in_numeric_groupby - pass else: raise # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - - # coerce the resulting columns if we can - if isinstance(result, DataFrame): - for col in result.columns: - result[col] = self._try_cast(result[col], self.obj[col]) - else: - result = self._try_cast(result, self.obj) - return result set_function_name(f, name, cls) @@ -1408,7 +1429,7 @@ def last(x): @Substitution(name="groupby") @Appender(_common_see_also) - def ohlc(self): + def ohlc(self) -> DataFrame: """ Compute sum of values, excluding missing values. @@ -1590,9 +1611,8 @@ def _fill(self, direction, limit=None): return self._get_cythonized_result( "group_fillna_indexer", - self.grouper, needs_mask=True, - cython_dtype=np.int64, + cython_dtype=np.dtype(np.int64), result_is_index=True, direction=direction, limit=limit, @@ -1760,6 +1780,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra if not self.observed and isinstance(result_index, CategoricalIndex): out = out.reindex(result_index) + out = self._reindex_output(out) return out.sort_index() if self.sort else out # dropna is truthy @@ -1771,7 +1792,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra raise ValueError( "For a DataFrame groupby, dropna must be " "either None, 'any' or 'all', " - "(was passed {dropna}).".format(dropna=dropna) + f"(was passed {dropna})." ) # old behaviour, but with all and any support for DataFrames. 
@@ -1792,9 +1813,9 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra # create a grouper with the original parameters, but on dropped # object - from pandas.core.groupby.grouper import _get_grouper + from pandas.core.groupby.grouper import get_grouper - grouper, _, _ = _get_grouper( + grouper, _, _ = get_grouper( dropped, key=self.keys, axis=self.axis, @@ -1821,7 +1842,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra return result - def quantile(self, q=0.5, interpolation="linear"): + def quantile(self, q=0.5, interpolation: str = "linear"): """ Return group values at the given quantile, a la numpy.percentile. @@ -1886,11 +1907,10 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: if is_scalar(q): return self._get_cythonized_result( "group_quantile", - self.grouper, aggregate=True, needs_values=True, needs_mask=True, - cython_dtype=np.float64, + cython_dtype=np.dtype(np.float64), pre_processing=pre_processor, post_processing=post_processor, q=q, @@ -1900,11 +1920,10 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: results = [ self._get_cythonized_result( "group_quantile", - self.grouper, aggregate=True, needs_values=True, needs_mask=True, - cython_dtype=np.float64, + cython_dtype=np.dtype(np.float64), pre_processing=pre_processor, post_processing=post_processor, q=qi, @@ -1918,25 +1937,26 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] # but this hits https://github.com/pandas-dev/pandas/issues/10710 # which doesn't reorder the list-like `q` on the inner level. - order = np.roll(list(range(result.index.nlevels)), -1) - result = result.reorder_levels(order) - result = result.reindex(q, level=-1) + order = list(range(1, result.index.nlevels)) + [0] - # fix order. - hi = len(q) * self.ngroups - arr = np.arange(0, hi, self.ngroups) - arrays = [] + # temporarily saves the index names + index_names = np.array(result.index.names) - for i in range(self.ngroups): - arr2 = arr + i - arrays.append(arr2) + # set index names to positions to avoid confusion + result.index.names = np.arange(len(index_names)) - indices = np.concatenate(arrays) - assert len(indices) == len(result) + # place quantiles on the inside + result = result.reorder_levels(order) + + # restore the index names in order + result.index.names = index_names[order] + + # reorder rows to keep things sorted + indices = np.arange(len(result)).reshape([len(q), self.ngroups]).T.flatten() return result.take(indices) @Substitution(name="groupby") - def ngroup(self, ascending=True): + def ngroup(self, ascending: bool = True): """ Number each group from 0 to the number of groups - 1. @@ -2005,7 +2025,7 @@ def ngroup(self, ascending=True): return result @Substitution(name="groupby") - def cumcount(self, ascending=True): + def cumcount(self, ascending: bool = True): """ Number each item in each group from 0 to the length of that group - 1. @@ -2066,7 +2086,12 @@ def cumcount(self, ascending=True): @Substitution(name="groupby") @Appender(_common_see_also) def rank( - self, method="average", ascending=True, na_option="keep", pct=False, axis=0 + self, + method: str = "average", + ascending: bool = True, + na_option: str = "keep", + pct: bool = False, + axis: int = 0, ): """ Provide the rank of values within each group. 
@@ -2074,17 +2099,17 @@ def rank( Parameters ---------- method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups + * average: average rank of group. + * min: lowest rank in group. + * max: highest rank in group. + * first: ranks assigned in order they appear in the array. + * dense: like 'min', but rank always increases by 1 between groups. ascending : bool, default True False for ranks by high (1) to low (N). na_option : {'keep', 'top', 'bottom'}, default 'keep' - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending + * keep: leave NA values where they are. + * top: smallest rank if ascending. + * bottom: smallest rank if descending. pct : bool, default False Compute percentage rank of data within each group. axis : int, default 0 @@ -2171,17 +2196,16 @@ def cummax(self, axis=0, **kwargs): def _get_cythonized_result( self, - how, - grouper, - aggregate=False, - cython_dtype=None, - needs_values=False, - needs_mask=False, - needs_ngroups=False, - result_is_index=False, + how: str, + cython_dtype: np.dtype, + aggregate: bool = False, + needs_values: bool = False, + needs_mask: bool = False, + needs_ngroups: bool = False, + result_is_index: bool = False, pre_processing=None, post_processing=None, - **kwargs + **kwargs, ): """ Get result for Cythonized functions. @@ -2189,13 +2213,11 @@ def _get_cythonized_result( Parameters ---------- how : str, Cythonized function name to be called - grouper : Grouper object containing pertinent group info + cython_dtype : np.dtype + Type of the array that will be modified by the Cython call. aggregate : bool, default False Whether the result should be aggregated to match the number of groups - cython_dtype : default None - Type of the array that will be modified by the Cython call. If - `None`, the type will be inferred from the values of each slice needs_values : bool, default False Whether the values should be a part of the Cython call signature @@ -2238,11 +2260,14 @@ def _get_cythonized_result( "Cannot use 'pre_processing' without specifying 'needs_values'!" 
) + grouper = self.grouper + labels, _, ngroups = grouper.group_info - output = collections.OrderedDict() + output: Dict[base.OutputKey, np.ndarray] = {} base_func = getattr(libgroupby, how) - for name, obj in self._iterate_slices(): + for idx, obj in enumerate(self._iterate_slices()): + name = obj.name values = obj._data._values if aggregate: @@ -2250,9 +2275,6 @@ def _get_cythonized_result( else: result_sz = len(values) - if not cython_dtype: - cython_dtype = values.dtype - result = np.zeros(result_sz, dtype=cython_dtype) func = partial(base_func, result, labels) inferences = None @@ -2278,7 +2300,8 @@ def _get_cythonized_result( if post_processing: result = post_processing(result, inferences) - output[name] = result + key = base.OutputKey(label=name, position=idx) + output[key] = result if aggregate: return self._wrap_aggregated_output(output) @@ -2312,8 +2335,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): return self._get_cythonized_result( "group_shift_indexer", - self.grouper, - cython_dtype=np.int64, + cython_dtype=np.dtype(np.int64), needs_ngroups=True, result_is_index=True, periods=periods, @@ -2340,8 +2362,11 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 axis=axis, ) ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 filled = getattr(self, fill_method)(limit=limit) - fill_grp = filled.groupby(self.grouper.labels) + fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 @@ -2355,6 +2380,8 @@ def head(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). + Does not work for negative values of `n`. + Returns ------- Series or DataFrame @@ -2368,6 +2395,10 @@ def head(self, n=5): A B 0 1 2 2 5 6 + >>> df.groupby('A').head(-1) + Empty DataFrame + Columns: [A, B] + Index: [] """ self._reset_group_selection() mask = self._cumcount_array() < n @@ -2383,6 +2414,8 @@ def tail(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). + Does not work for negative values of `n`. + Returns ------- Series or DataFrame @@ -2396,12 +2429,18 @@ def tail(self, n=5): A B 1 a 2 3 b 2 + >>> df.groupby('A').tail(-1) + Empty DataFrame + Columns: [A, B] + Index: [] """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] - def _reindex_output(self, output): + def _reindex_output( + self, output: FrameOrSeries, fill_value: Scalar = np.NaN + ) -> FrameOrSeries: """ If we have categorical groupers, then we might want to make sure that we have a fully re-indexed output to the levels. This means expanding @@ -2415,8 +2454,10 @@ def _reindex_output(self, output): Parameters ---------- - output: Series or DataFrame + output : Series or DataFrame Object resulting from grouping and applying an operation. + fill_value : scalar, default np.NaN + Value to use for unobserved categories if self.observed is False. 
Returns ------- @@ -2447,7 +2488,11 @@ def _reindex_output(self, output): ).sortlevel() if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, "copy": False} + d = { + self.obj._get_axis_name(self.axis): index, + "copy": False, + "fill_value": fill_value, + } return output.reindex(**d) # GH 13204 @@ -2469,7 +2514,9 @@ def _reindex_output(self, output): output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex(index, copy=False) + output = output.set_index(self.grouper.result_index).reindex( + index, copy=False, fill_value=fill_value + ) # Reset in-axis grouper columns # (using level numbers `g_nums` because level names may not be unique) @@ -2482,7 +2529,23 @@ def _reindex_output(self, output): @Appender(GroupBy.__doc__) -def groupby(obj, by, **kwds): +def get_groupby( + obj: NDFrame, + by: Optional[_KeysArgType] = None, + axis: int = 0, + level=None, + grouper: "Optional[ops.BaseGrouper]" = None, + exclusions=None, + selection=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + mutated: bool = False, +) -> GroupBy: + + klass: Type[GroupBy] if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy @@ -2492,6 +2555,20 @@ def groupby(obj, by, **kwds): klass = DataFrameGroupBy else: - raise TypeError("invalid type: {}".format(obj)) - - return klass(obj, by, **kwds) + raise TypeError(f"invalid type: {obj}") + + return klass( + obj=obj, + keys=by, + axis=axis, + level=level, + grouper=grouper, + exclusions=exclusions, + selection=selection, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + mutated=mutated, + ) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d7eaaca5ac83a..0b89e702c9867 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -3,18 +3,17 @@ split-apply-combine paradigm. """ -from typing import Tuple -import warnings +from typing import Dict, Hashable, List, Optional, Tuple import numpy as np +from pandas._typing import FrameOrSeries from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( ensure_categorical, is_categorical_dtype, is_datetime64_dtype, - is_hashable, is_list_like, is_scalar, is_timedelta64_dtype, @@ -25,10 +24,9 @@ from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com from pandas.core.frame import DataFrame -from pandas.core.generic import NDFrame +from pandas.core.groupby import ops from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby -from pandas.core.groupby.ops import BaseGrouper -from pandas.core.index import CategoricalIndex, Index, MultiIndex +from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.io.formats.printing import pprint_thing @@ -36,8 +34,7 @@ class Grouper: """ - A Grouper allows the user to specify a groupby instruction for a target - object. + A Grouper allows the user to specify a groupby instruction for an object. 
This specification will select a column via the key parameter, or if the level and/or axis parameters are given, a level of the index of the target @@ -49,17 +46,18 @@ class Grouper: Parameters ---------- key : str, defaults to None - groupby key, which selects the grouping column of the target + Groupby key, which selects the grouping column of the target. level : name/number, defaults to None - the level for the target index + The level for the target index. freq : str / frequency object, defaults to None This will groupby the specified frequency if the target selection (via key or level) is a datetime-like object. For full specification of available frequencies, please see `here - `_. - axis : number/name of the axis, defaults to 0 + `_. + axis : str, int, defaults to 0 + Number/name of the axis. sort : bool, default to False - whether to sort the resulting labels + Whether to sort the resulting labels. closed : {'left' or 'right'} Closed end of interval. Only when `freq` parameter is passed. label : {'left' or 'right'} @@ -93,7 +91,7 @@ class Grouper: >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) """ - _attributes = ("key", "level", "freq", "axis", "sort") # type: Tuple[str, ...] + _attributes: Tuple[str, ...] = ("key", "level", "freq", "axis", "sort") def __new__(cls, *args, **kwargs): if kwargs.get("freq") is not None: @@ -119,7 +117,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): def ax(self): return self.grouper - def _get_grouper(self, obj, validate=True): + def _get_grouper(self, obj, validate: bool = True): """ Parameters ---------- @@ -133,7 +131,7 @@ def _get_grouper(self, obj, validate=True): """ self._set_grouper(obj) - self.grouper, exclusions, self.obj = _get_grouper( + self.grouper, _, self.obj = get_grouper( self.obj, [self.key], axis=self.axis, @@ -143,17 +141,18 @@ def _get_grouper(self, obj, validate=True): ) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj, sort=False): + def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): """ given an object and the specifications, setup the internal grouper for this particular specification Parameters ---------- - obj : the subject object + obj : Series or DataFrame sort : bool, default False whether the resulting grouper should be sorted """ + assert obj is not None if self.key is not None and self.level is not None: raise ValueError("The Grouper cannot specify both a key and a level!") @@ -172,7 +171,7 @@ def _set_grouper(self, obj, sort=False): ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: - raise KeyError("The grouper name {0} is not found".format(key)) + raise KeyError(f"The grouper name {key} is not found") ax = Index(obj[key], name=key) else: @@ -188,14 +187,14 @@ def _set_grouper(self, obj, sort=False): else: if level not in (0, ax.name): - raise ValueError("The level {0} is not valid".format(level)) + raise ValueError(f"The level {level} is not valid") # possibly sort if (self.sort or sort) and not ax.is_monotonic: # use stable sort to support first, last, nth indexer = self.indexer = ax.argsort(kind="mergesort") ax = ax.take(indexer) - obj = obj.take(indexer, axis=self.axis, is_copy=False) + obj = obj.take(indexer, axis=self.axis) self.obj = obj self.grouper = ax @@ -205,15 +204,15 @@ def _set_grouper(self, obj, sort=False): def groups(self): return self.grouper.groups - def __repr__(self): + def __repr__(self) -> str: attrs_list = ( - "{}={!r}".format(attr_name, getattr(self, attr_name)) + 
f"{attr_name}={repr(getattr(self, attr_name))}" for attr_name in self._attributes if getattr(self, attr_name) is not None ) attrs = ", ".join(attrs_list) - cls_name = self.__class__.__name__ - return "{}({})".format(cls_name, attrs) + cls_name = type(self).__name__ + return f"{cls_name}({attrs})" class Grouping: @@ -224,10 +223,10 @@ class Grouping: ---------- index : Index grouper : - obj : + obj Union[DataFrame, Series]: name : level : - observed : boolean, default False + observed : bool, default False If we are a Categorical, use the observed values in_axis : if the Grouping is a column in self.obj and hence among Groupby.exclusions list @@ -236,25 +235,22 @@ class Grouping: ------- **Attributes**: * indices : dict of {group -> index_list} - * labels : ndarray, group labels - * ids : mapping of label -> group - * counts : array of group counts + * codes : ndarray, group codes * group_index : unique groups * groups : dict of {group -> label_list} """ def __init__( self, - index, + index: Index, grouper=None, - obj=None, + obj: Optional[FrameOrSeries] = None, name=None, level=None, - sort=True, - observed=False, - in_axis=False, + sort: bool = True, + observed: bool = False, + in_axis: bool = False, ): - self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) @@ -278,18 +274,20 @@ def __init__( if level is not None: if not isinstance(level, int): if level not in index.names: - raise AssertionError("Level {} not in index".format(level)) + raise AssertionError(f"Level {level} not in index") level = index.names.index(level) if self.name is None: self.name = index.names[level] - self.grouper, self._labels, self._group_index = index._get_grouper_for_level( # noqa: E501 - self.grouper, level - ) + ( + self.grouper, + self._codes, + self._group_index, + ) = index._get_grouper_for_level(self.grouper, level) # a passed Grouper like, directly get the grouper in the same way - # as single grouper groupby, use the group_info to get labels + # as single grouper groupby, use the group_info to get codes elif isinstance(self.grouper, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to @@ -302,7 +300,7 @@ def __init__( self.grouper = grouper._get_grouper() else: - if self.grouper is None and self.name is not None: + if self.grouper is None and self.name is not None and self.obj is not None: self.grouper = self.obj[self.name] elif isinstance(self.grouper, (list, tuple)): @@ -318,7 +316,7 @@ def __init__( # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes - self._labels = self.grouper.codes + self._codes = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] @@ -344,15 +342,16 @@ def __init__( ): if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) - raise ValueError("Grouper for '{}' not 1-dimensional".format(t)) + raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) if not ( hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index) ): + grper = pprint_thing(self.grouper) errmsg = ( "Grouper result violates len(labels) == " - "len(data)\nresult: %s" % pprint_thing(self.grouper) + f"len(data)\nresult: {grper}" ) self.grouper = None # Try for sanity raise AssertionError(errmsg) @@ -366,75 +365,76 @@ def __init__( self.grouper = self.grouper.astype("timedelta64[ns]") - def __repr__(self): - return 
"Grouping({0})".format(self.name) + def __repr__(self) -> str: + return f"Grouping({self.name})" def __iter__(self): return iter(self.indices) - _labels = None - _group_index = None + _codes: Optional[np.ndarray] = None + _group_index: Optional[Index] = None @property - def ngroups(self): + def ngroups(self) -> int: return len(self.group_index) @cache_readonly def indices(self): # we have a list of groupers - if isinstance(self.grouper, BaseGrouper): + if isinstance(self.grouper, ops.BaseGrouper): return self.grouper.indices values = ensure_categorical(self.grouper) return values._reverse_indexer() @property - def labels(self): - if self._labels is None: - self._make_labels() - return self._labels + def codes(self) -> np.ndarray: + if self._codes is None: + self._make_codes() + return self._codes @cache_readonly - def result_index(self): + def result_index(self) -> Index: if self.all_grouper is not None: return recode_from_groupby(self.all_grouper, self.sort, self.group_index) return self.group_index @property - def group_index(self): + def group_index(self) -> Index: if self._group_index is None: - self._make_labels() + self._make_codes() + assert self._group_index is not None return self._group_index - def _make_labels(self): - if self._labels is None or self._group_index is None: + def _make_codes(self) -> None: + if self._codes is None or self._group_index is None: # we have a list of groupers - if isinstance(self.grouper, BaseGrouper): - labels = self.grouper.label_info + if isinstance(self.grouper, ops.BaseGrouper): + codes = self.grouper.codes_info uniques = self.grouper.result_index else: - labels, uniques = algorithms.factorize(self.grouper, sort=self.sort) + codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) uniques = Index(uniques, name=self.name) - self._labels = labels + self._codes = codes self._group_index = uniques @cache_readonly - def groups(self): - return self.index.groupby(Categorical.from_codes(self.labels, self.group_index)) + def groups(self) -> Dict[Hashable, np.ndarray]: + return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) -def _get_grouper( - obj: NDFrame, +def get_grouper( + obj: FrameOrSeries, key=None, - axis=0, + axis: int = 0, level=None, - sort=True, - observed=False, - mutated=False, - validate=True, -): + sort: bool = True, + observed: bool = False, + mutated: bool = False, + validate: bool = True, +) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": """ - create and return a BaseGrouper, which is an internal + Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. This may be composed of multiple Grouping objects, indicating multiple groupers @@ -450,9 +450,9 @@ def _get_grouper( a BaseGrouper. If observed & we have a categorical grouper, only show the observed - values + values. - If validate, then check for key/level overlaps + If validate, then check for key/level overlaps. 
""" group_axis = obj._get_axis(axis) @@ -491,9 +491,10 @@ def _get_grouper( raise ValueError("multiple levels only valid with MultiIndex") if isinstance(level, str): - if obj.index.name != level: + if obj._get_axis(axis).name != level: raise ValueError( - "level name {} is not the name of the index".format(level) + f"level name {level} is not the name " + f"of the {obj._get_axis_name(axis)}" ) elif level > 0 or level < -1: raise ValueError("level > 0 or level < -1 only valid with MultiIndex") @@ -509,36 +510,12 @@ def _get_grouper( if key.key is None: return grouper, [], obj else: - return grouper, {key.key}, obj + return grouper, [key.key], obj # already have a BaseGrouper, just return it - elif isinstance(key, BaseGrouper): + elif isinstance(key, ops.BaseGrouper): return key, [], obj - # In the future, a tuple key will always mean an actual key, - # not an iterable of keys. In the meantime, we attempt to provide - # a warning. We can assume that the user wanted a list of keys when - # the key is not in the index. We just have to be careful with - # unhashable elements of `key`. Any unhashable elements implies that - # they wanted a list of keys. - # https://github.com/pandas-dev/pandas/issues/18314 - is_tuple = isinstance(key, tuple) - all_hashable = is_tuple and is_hashable(key) - - if is_tuple: - if ( - all_hashable and key not in obj and set(key).issubset(obj) - ) or not all_hashable: - # column names ('a', 'b') -> ['a', 'b'] - # arrays like (a, b) -> [a, b] - msg = ( - "Interpreting tuple 'by' as a list of keys, rather than " - "a single key. Use 'by=[...]' instead of 'by=(...)'. In " - "the future, a tuple will always mean a single key." - ) - warnings.warn(msg, FutureWarning, stacklevel=5) - key = list(key) - if not isinstance(key, list): keys = [key] match_axis_length = False @@ -565,7 +542,8 @@ def _get_grouper( all_in_columns_index = all( g in obj.columns or g in obj.index.names for g in keys ) - elif isinstance(obj, Series): + else: + assert isinstance(obj, Series) all_in_columns_index = all(g in obj.index.names for g in keys) if not all_in_columns_index: @@ -578,11 +556,11 @@ def _get_grouper( else: levels = [level] * len(keys) - groupings = [] - exclusions = [] + groupings: List[Grouping] = [] + exclusions: List[Hashable] = [] # if the actual grouper should be obj[key] - def is_in_axis(key): + def is_in_axis(key) -> bool: if not _is_label_like(key): items = obj._data.items try: @@ -594,7 +572,7 @@ def is_in_axis(key): return True # if the grouper is obj[name] - def is_in_obj(gpr): + def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False try: @@ -627,12 +605,8 @@ def is_in_obj(gpr): if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( - ( - "Length of grouper ({len_gpr}) and axis ({len_axis})" - " must be same length".format( - len_gpr=len(gpr), len_axis=obj.shape[axis] - ) - ) + f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) " + "must be same length" ) # create the Grouping @@ -660,15 +634,15 @@ def is_in_obj(gpr): groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) + grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) return grouper, exclusions, obj -def _is_label_like(val): +def _is_label_like(val) -> bool: return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) -def _convert_grouper(axis, grouper): +def _convert_grouper(axis: Index, 
grouper): if isinstance(grouper, dict): return grouper.get elif isinstance(grouper, Series): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2a7fd079679a4..37067a1897a52 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -7,12 +7,14 @@ """ import collections +from typing import List, Optional, Sequence, Tuple, Type import numpy as np from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby import pandas._libs.reduction as libreduction +from pandas._typing import FrameOrSeries from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -40,8 +42,8 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base -from pandas.core.index import Index, MultiIndex, ensure_index +from pandas.core.groupby import base, grouper +from pandas.core.indexes.api import Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -60,15 +62,14 @@ class BaseGrouper: Parameters ---------- - axis : int - the axis to group - groupings : array of grouping + axis : Index + groupings : Sequence[Grouping] all the grouping instances to handle in this grouper for example for grouper list to groupby, need to pass the list - sort : boolean, default True + sort : bool, default True whether this grouper will give sorted result or not - group_keys : boolean, default True - mutated : boolean, default False + group_keys : bool, default True + mutated : bool, default False indexer : intp array, optional the indexer created by Grouper some groupers (TimeGrouper) will sort its axis and its @@ -77,16 +78,28 @@ class BaseGrouper: """ def __init__( - self, axis, groupings, sort=True, group_keys=True, mutated=False, indexer=None + self, + axis: Index, + groupings: "Sequence[grouper.Grouping]", + sort: bool = True, + group_keys: bool = True, + mutated: bool = False, + indexer: Optional[np.ndarray] = None, ): + assert isinstance(axis, Index), axis + self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis - self.groupings = groupings + self._groupings: List[grouper.Grouping] = list(groupings) self.sort = sort self.group_keys = group_keys self.mutated = mutated self.indexer = indexer + @property + def groupings(self) -> List["grouper.Grouping"]: + return self._groupings + @property def shape(self): return tuple(ping.ngroups for ping in self.groupings) @@ -95,10 +108,10 @@ def __iter__(self): return iter(self.indices) @property - def nkeys(self): + def nkeys(self) -> int: return len(self.groupings) - def get_iterator(self, data, axis=0): + def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ Groupby iterator @@ -112,7 +125,7 @@ def get_iterator(self, data, axis=0): for key, (i, group) in zip(keys, splitter): yield key, group - def _get_splitter(self, data, axis=0): + def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -132,46 +145,44 @@ def _get_group_keys(self): comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return get_flattened_iterator(comp_ids, ngroups, self.levels, self.labels) + return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) - def apply(self, f, data, axis=0): + def apply(self, f, data: FrameOrSeries, axis: int = 0): mutated 
= self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() result_values = None - # oh boy - f_name = com.get_callable_name(f) - if ( - f_name not in base.plotting_methods - and hasattr(splitter, "fast_apply") + sdata: FrameOrSeries = splitter._get_sorted_data() + if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): + # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 + # if we pass EA instead of ndarray + # TODO: can we have a workaround for EAs backed by ndarray? + pass + + elif ( + com.get_callable_name(f) not in base.plotting_methods + and isinstance(splitter, FrameSplitter) and axis == 0 - # with MultiIndex, apply_frame_axis0 would raise InvalidApply - # TODO: can we make this check prettier? - and not splitter._get_sorted_data().index._has_complex_internals + # apply_frame_axis0 doesn't allow MultiIndex + and not isinstance(sdata.index, MultiIndex) ): try: result_values, mutated = splitter.fast_apply(f, group_keys) - # If the fast apply path could be used we can return here. - # Otherwise we need to fall back to the slow implementation. - if len(result_values) == len(group_keys): - return group_keys, result_values, mutated - except libreduction.InvalidApply as err: - # Cannot fast apply on MultiIndex (_has_complex_internals). - # This Exception is also raised if `f` triggers an exception + # This Exception is raised if `f` triggers an exception # but it is preferable to raise the exception in Python. if "Let this error raise above us" not in str(err): # TODO: can we infer anything about whether this is # worth-retrying in pure-python? raise - except TypeError as err: - if "Cannot convert" in str(err): - # via apply_frame_axis0 if we pass a non-ndarray - pass - else: - raise + + else: + # If the fast apply path could be used we can return here. + # Otherwise we need to fall back to the slow implementation. + if len(result_values) == len(group_keys): + return group_keys, result_values, mutated for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, "name", key) @@ -190,7 +201,7 @@ def apply(self, f, data, axis=0): continue # group might be modified - group_axes = _get_axes(group) + group_axes = group.axes res = f(group) if not _is_indexed_like(res, group_axes): mutated = True @@ -204,26 +215,25 @@ def indices(self): if len(self.groupings) == 1: return self.groupings[0].indices else: - label_list = [ping.labels for ping in self.groupings] + codes_list = [ping.codes for ping in self.groupings] keys = [com.values_from_object(ping.group_index) for ping in self.groupings] - return get_indexer_dict(label_list, keys) + return get_indexer_dict(codes_list, keys) @property - def labels(self): - return [ping.labels for ping in self.groupings] + def codes(self) -> List[np.ndarray]: + return [ping.codes for ping in self.groupings] @property - def levels(self): + def levels(self) -> List[Index]: return [ping.group_index for ping in self.groupings] @property def names(self): return [ping.name for ping in self.groupings] - def size(self): + def size(self) -> Series: """ - Compute group sizes - + Compute group sizes. 
""" ids, _, ngroup = self.group_info ids = ensure_platform_int(ids) @@ -244,52 +254,52 @@ def groups(self): return self.axis.groupby(to_groupby) @cache_readonly - def is_monotonic(self): + def is_monotonic(self) -> bool: # return if my group orderings are monotonic return Index(self.group_info[0]).is_monotonic @cache_readonly def group_info(self): - comp_ids, obs_group_ids = self._get_compressed_labels() + comp_ids, obs_group_ids = self._get_compressed_codes() ngroups = len(obs_group_ids) comp_ids = ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups @cache_readonly - def label_info(self): - # return the labels of items in original grouped axis - labels, _, _ = self.group_info + def codes_info(self) -> np.ndarray: + # return the codes of items in original grouped axis + codes, _, _ = self.group_info if self.indexer is not None: - sorter = np.lexsort((labels, self.indexer)) - labels = labels[sorter] - return labels - - def _get_compressed_labels(self): - all_labels = [ping.labels for ping in self.groupings] - if len(all_labels) > 1: - group_index = get_group_index(all_labels, self.shape, sort=True, xnull=True) + sorter = np.lexsort((codes, self.indexer)) + codes = codes[sorter] + return codes + + def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: + all_codes = self.codes + if len(all_codes) > 1: + group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) ping = self.groupings[0] - return ping.labels, np.arange(len(ping.group_index)) + return ping.codes, np.arange(len(ping.group_index)) @cache_readonly - def ngroups(self): + def ngroups(self) -> int: return len(self.result_index) @property - def recons_labels(self): + def reconstructed_codes(self) -> List[np.ndarray]: + codes = self.codes comp_ids, obs_ids, _ = self.group_info - labels = (ping.labels for ping in self.groupings) - return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels, xnull=True) + return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True) @cache_readonly - def result_index(self): + def result_index(self) -> Index: if not self.compressed and len(self.groupings) == 1: return self.groupings[0].result_index.rename(self.names[0]) - codes = self.recons_labels + codes = self.reconstructed_codes levels = [ping.result_index for ping in self.groupings] result = MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names @@ -301,9 +311,9 @@ def get_group_levels(self): return [self.groupings[0].result_index] name_list = [] - for ping, labels in zip(self.groupings, self.recons_labels): - labels = ensure_platform_int(labels) - levels = ping.result_index.take(labels) + for ping, codes in zip(self.groupings, self.reconstructed_codes): + codes = ensure_platform_int(codes) + levels = ping.result_index.take(codes) name_list.append(levels) @@ -336,7 +346,7 @@ def get_group_levels(self): _cython_arity = {"ohlc": 4} # OHLC - _name_functions = {"ohlc": lambda *args: ["open", "high", "low", "close"]} + _name_functions = {"ohlc": ["open", "high", "low", "close"]} def _is_builtin_func(self, arg): """ @@ -345,51 +355,94 @@ def _is_builtin_func(self, arg): """ return SelectionMixin._builtin_table.get(arg, arg) - def _get_cython_function(self, kind, how, values, is_numeric): + def _get_cython_function(self, kind: str, how: str, values, is_numeric: bool): dtype_str = values.dtype.name + ftype = self._cython_functions[kind][how] - def get_func(fname): - # see if there is a fused-type version of function - 
# only valid for numeric - f = getattr(libgroupby, fname, None) - if f is not None and is_numeric: - return f - - # otherwise find dtype-specific version, falling back to object - for dt in [dtype_str, "object"]: - f2 = getattr( - libgroupby, - "{fname}_{dtype_str}".format(fname=fname, dtype_str=dt), - None, - ) - if f2 is not None: - return f2 - - if hasattr(f, "__signatures__"): - # inspect what fused types are implemented - if dtype_str == "object" and "object" not in f.__signatures__: - # return None so we get a NotImplementedError below - # instead of a TypeError at runtime - return None + # see if there is a fused-type version of function + # only valid for numeric + f = getattr(libgroupby, ftype, None) + if f is not None and is_numeric: return f - ftype = self._cython_functions[kind][how] + # otherwise find dtype-specific version, falling back to object + for dt in [dtype_str, "object"]: + f2 = getattr(libgroupby, f"{ftype}_{dt}", None) + if f2 is not None: + return f2 + + if hasattr(f, "__signatures__"): + # inspect what fused types are implemented + if dtype_str == "object" and "object" not in f.__signatures__: + # disallow this function so we get a NotImplementedError below + # instead of a TypeError at runtime + f = None - func = get_func(ftype) + func = f if func is None: raise NotImplementedError( - "function is not implemented for this dtype: " - "[how->{how},dtype->{dtype_str}]".format(how=how, dtype_str=dtype_str) + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" ) return func - def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): + def _get_cython_func_and_vals( + self, kind: str, how: str, values: np.ndarray, is_numeric: bool + ): + """ + Find the appropriate cython function, casting if necessary. + + Parameters + ---------- + kind : sttr + how : srt + values : np.ndarray + is_numeric : bool + + Returns + ------- + func : callable + values : np.ndarray + """ + try: + func = self._get_cython_function(kind, how, values, is_numeric) + except NotImplementedError: + if is_numeric: + try: + values = ensure_float64(values) + except TypeError: + if lib.infer_dtype(values, skipna=False) == "complex": + values = values.astype(complex) + else: + raise + func = self._get_cython_function(kind, how, values, is_numeric) + else: + raise + return func, values + + def _cython_operation( + self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs + ) -> Tuple[np.ndarray, Optional[List[str]]]: + """ + Returns the values of a cython operation as a Tuple of [data, names]. + + Names is only useful when dealing with 2D results, like ohlc + (see self._name_functions). 
+ """ + assert kind in ["transform", "aggregate"] orig_values = values + if values.ndim > 2: + raise NotImplementedError("number of dimensions is currently limited to 2") + elif values.ndim == 2: + # Note: it is *not* the case that axis is always 0 for 1-dim values, + # as we can have 1D ExtensionArrays that we need to treat as 2D + assert axis == 1, axis + # can we do this operation with our cython functions # if not raise NotImplementedError @@ -399,16 +452,16 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): - raise NotImplementedError("{} dtype not supported".format(values.dtype)) + raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( - "datetime64 type does not support {} operations".format(how) + f"datetime64 type does not support {how} operations" ) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( - "timedelta64 type does not support {} operations".format(how) + f"timedelta64 type does not support {how} operations" ) if is_datetime64tz_dtype(values.dtype): @@ -455,32 +508,17 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): ) out_shape = (self.ngroups,) + values.shape[1:] - try: - func = self._get_cython_function(kind, how, values, is_numeric) - except NotImplementedError: - if is_numeric: - try: - values = ensure_float64(values) - except TypeError: - if lib.infer_dtype(values, skipna=False) == "complex": - values = values.astype(complex) - else: - raise - func = self._get_cython_function(kind, how, values, is_numeric) - else: - raise + func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: - out_dtype = "{kind}{itemsize}".format( - kind=values.dtype.kind, itemsize=values.dtype.itemsize - ) + out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" - labels, _, _ = self.group_info + codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill( @@ -488,7 +526,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): ) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( - result, counts, values, labels, func, is_datetimelike, min_count + result, counts, values, codes, func, is_datetimelike, min_count ) elif kind == "transform": result = _maybe_fill( @@ -497,7 +535,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): # TODO: min_count result = self._transform( - result, values, labels, func, is_datetimelike, **kwargs + result, values, codes, func, is_datetimelike, **kwargs ) if is_integer_dtype(result) and not is_datetimelike: @@ -513,11 +551,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): if vdim == 1 and arity == 1: result = result[:, 0] - if how in self._name_functions: - # TODO - names = self._name_functions[how]() - else: - names = None + names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) @@ -529,21 +563,27 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): return result, names - def aggregate(self, values, how, axis=0, min_count=-1): + def aggregate( + self, values, how: str, axis: int = 0, min_count: int = -1 + 
) -> Tuple[np.ndarray, Optional[List[str]]]: return self._cython_operation( "aggregate", values, how, axis, min_count=min_count ) - def transform(self, values, how, axis=0, **kwargs): + def transform(self, values, how: str, axis: int = 0, **kwargs): return self._cython_operation("transform", values, how, axis, **kwargs) def _aggregate( - self, result, counts, values, comp_ids, agg_func, is_datetimelike, min_count=-1 + self, + result, + counts, + values, + comp_ids, + agg_func, + is_datetimelike: bool, + min_count: int = -1, ): - if values.ndim > 2: - # punting for now - raise NotImplementedError("number of dimensions is currently limited to 2") - elif agg_func is libgroupby.group_nth: + if agg_func is libgroupby.group_nth: # different signature from the others # TODO: should we be using min_count instead of hard-coding it? agg_func(result, counts, values, comp_ids, rank=1, min_count=-1) @@ -553,51 +593,51 @@ def _aggregate( return result def _transform( - self, result, values, comp_ids, transform_func, is_datetimelike, **kwargs + self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs ): comp_ids, _, ngroups = self.group_info - if values.ndim > 2: - # punting for now - raise NotImplementedError("number of dimensions is currently limited to 2") - else: - transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) + transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) return result - def agg_series(self, obj, func): - if is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M": + def agg_series(self, obj: Series, func): + # Caller is responsible for checking ngroups != 0 + assert self.ngroups != 0 + + if len(obj) == 0: + # SeriesGrouper would raise if we were to call _aggregate_series_fast + return self._aggregate_series_pure_python(obj, func) + + elif is_extension_array_dtype(obj.dtype): # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider + # In the datetime64tz case it would incorrectly cast to tz-naive # TODO: can we get a performant workaround for EAs backed by ndarray? - # TODO: is the datetime64tz case supposed to go through here? return self._aggregate_series_pure_python(obj, func) - elif obj.index._has_complex_internals: + elif isinstance(obj.index, MultiIndex): # MultiIndex; Pre-empt TypeError in _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) try: return self._aggregate_series_fast(obj, func) except ValueError as err: - if "No result." 
in str(err): - # raised in libreduction - pass - elif "Function does not reduce" in str(err): + if "Function does not reduce" in str(err): # raised in libreduction pass else: raise return self._aggregate_series_pure_python(obj, func) - def _aggregate_series_fast(self, obj, func): + def _aggregate_series_fast(self, obj: Series, func): + # At this point we have already checked that + # - obj.index is not a MultiIndex + # - obj is backed by an ndarray, not ExtensionArray + # - len(obj) > 0 + # - ngroups != 0 func = self._is_builtin_func(func) - # TODO: pre-empt this, also pre-empt get_result raising TypError if we pass a EA - # for EAs backed by ndarray we may have a performant workaround - if obj.index._has_complex_internals: - raise TypeError("Incompatible index for Cython grouper") - group_index, _, ngroups = self.group_info # avoids object / Series creation overhead @@ -609,27 +649,34 @@ def _aggregate_series_fast(self, obj, func): result, counts = grouper.get_result() return result, counts - def _aggregate_series_pure_python(self, obj, func): + def _aggregate_series_pure_python(self, obj: Series, func): group_index, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) result = None - splitter = get_splitter(obj, group_index, ngroups, axis=self.axis) + splitter = get_splitter(obj, group_index, ngroups, axis=0) for label, group in splitter: res = func(group) if result is None: if isinstance(res, (Series, Index, np.ndarray)): - raise ValueError("Function does not reduce") + if len(res) == 1: + # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) + # FIXME: are we potentially losing import res.index info? + res = res.item() + else: + raise ValueError("Function does not reduce") result = np.empty(ngroups, dtype="O") counts[label] = group.shape[0] result[label] = res + assert result is not None result = lib.maybe_convert_objects(result, try_float=0) # TODO: try_cast back to EA? + return result, counts @@ -664,7 +711,12 @@ class BinGrouper(BaseGrouper): """ def __init__( - self, bins, binlabels, filter_empty=False, mutated=False, indexer=None + self, + bins, + binlabels, + filter_empty: bool = False, + mutated: bool = False, + indexer=None, ): self.bins = ensure_int64(bins) self.binlabels = ensure_index(binlabels) @@ -672,6 +724,10 @@ def __init__( self.mutated = mutated self.indexer = indexer + # These lengths must match, otherwise we could call agg_series + # with empty self.bins, which would raise in libreduction. 
+ assert len(self.binlabels) == len(self.bins) + @cache_readonly def groups(self): """ dict {group name -> group labels} """ @@ -686,7 +742,7 @@ def groups(self): return result @property - def nkeys(self): + def nkeys(self) -> int: return 1 def _get_grouper(self): @@ -698,7 +754,7 @@ def _get_grouper(self): """ return self - def get_iterator(self, data, axis=0): + def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ Groupby iterator @@ -707,12 +763,8 @@ def get_iterator(self, data, axis=0): Generator yielding sequence of (name, subsetted object) for each group """ - if isinstance(data, NDFrame): - slicer = lambda start, edge: data._slice(slice(start, edge), axis=axis) - length = len(data.axes[axis]) - else: - slicer = lambda start, edge: data[slice(start, edge)] - length = len(data) + slicer = lambda start, edge: data._slice(slice(start, edge), axis=axis) + length = len(data.axes[axis]) start = 0 for edge, label in zip(self.bins, self.binlabels): @@ -753,6 +805,11 @@ def group_info(self): ngroups, ) + @cache_readonly + def reconstructed_codes(self) -> List[np.ndarray]: + # get unique result indices, and prepend 0 as groupby starts from the first + return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] + @cache_readonly def result_index(self): if len(self.binlabels) != 0 and isna(self.binlabels[0]): @@ -769,28 +826,27 @@ def names(self): return [self.binlabels.name] @property - def groupings(self): - from pandas.core.groupby.grouper import Grouping - + def groupings(self) -> "List[grouper.Grouping]": return [ - Grouping(lvl, lvl, in_axis=False, level=None, name=name) + grouper.Grouping(lvl, lvl, in_axis=False, level=None, name=name) for lvl, name in zip(self.levels, self.names) ] - def agg_series(self, obj, func): + def agg_series(self, obj: Series, func): + # Caller is responsible for checking ngroups != 0 + assert self.ngroups != 0 + assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result + + if is_extension_array_dtype(obj.dtype): + # pre-empt SeriesBinGrouper from raising TypeError + return self._aggregate_series_pure_python(obj, func) + dummy = obj[:0] grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() -def _get_axes(group): - if isinstance(group, Series): - return [group.index] - else: - return group.axes - - -def _is_indexed_like(obj, axes): +def _is_indexed_like(obj, axes) -> bool: if isinstance(obj, Series): if len(axes) > 1: return False @@ -806,12 +862,13 @@ def _is_indexed_like(obj, axes): class DataSplitter: - def __init__(self, data, labels, ngroups, axis=0): + def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): self.data = data self.labels = ensure_int64(labels) self.ngroups = ngroups self.axis = axis + assert isinstance(axis, int), axis @cache_readonly def slabels(self): @@ -834,26 +891,17 @@ def __iter__(self): starts, ends = lib.generate_slices(self.slabels, self.ngroups) for i, (start, end) in enumerate(zip(starts, ends)): - # Since I'm now compressing the group ids, it's now not "possible" - # to produce empty slices because such groups would not be observed - # in the data - # if start >= end: - # raise AssertionError('Start %s must be less than end %s' - # % (str(start), str(end))) yield i, self._chop(sdata, slice(start, end)) - def _get_sorted_data(self): + def _get_sorted_data(self) -> FrameOrSeries: return self.data.take(self.sort_idx, axis=self.axis) - def _chop(self, sdata, slice_obj): - raise AbstractMethodError(self) - - def apply(self, f): + def 
_chop(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) class SeriesSplitter(DataSplitter): - def _chop(self, sdata, slice_obj): + def _chop(self, sdata: Series, slice_obj: slice) -> Series: return sdata._get_values(slice_obj) @@ -865,17 +913,18 @@ def fast_apply(self, f, names): sdata = self._get_sorted_data() return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) - def _chop(self, sdata, slice_obj): + def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: if self.axis == 0: return sdata.iloc[slice_obj] else: return sdata._slice(slice_obj, axis=1) -def get_splitter(data, *args, **kwargs): +def get_splitter(data: FrameOrSeries, *args, **kwargs) -> DataSplitter: if isinstance(data, Series): - klass = SeriesSplitter - elif isinstance(data, DataFrame): + klass: Type[DataSplitter] = SeriesSplitter + else: + # i.e. DataFrame klass = FrameSplitter return klass(data, *args, **kwargs) diff --git a/pandas/core/index.py b/pandas/core/index.py index d308ac1a9b1c7..a9c8e6731a17e 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,3 +1,5 @@ +import warnings + from pandas.core.indexes.api import ( # noqa:F401 CategoricalIndex, DatetimeIndex, @@ -13,13 +15,16 @@ RangeIndex, TimedeltaIndex, UInt64Index, - _all_indexes_same, - _get_combined_index, - _get_consensus_names, - _get_objs_combined_axis, _new_Index, - _union_indexes, ensure_index, ensure_index_from_sequences, + get_objs_combined_axis, ) from pandas.core.indexes.multi import _sparsify # noqa:F401 + +# GH#30193 +warnings.warn( + "pandas.core.index is deprecated and will be removed in a future version. " + "The public classes are available in the top-level namespace.", + FutureWarning, +) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 433bca940c028..4d45769d2fea9 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -3,6 +3,8 @@ """ import numpy as np +from pandas._typing import AnyArrayLike + from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -27,8 +29,13 @@ def is_list_like_indexer(key) -> bool: def is_scalar_indexer(indexer, arr_value) -> bool: - # return True if we are all scalar indexers + """ + Return True if we are all scalar indexers. + Returns + ------- + bool + """ if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) @@ -73,11 +80,11 @@ def check_setitem_lengths(indexer, value, values) -> None: Parameters ---------- indexer : sequence - The key for the setitem + Key for the setitem. value : array-like - The value for the setitem + Value for the setitem. values : array-like - The values being set into + Values being set into. Returns ------- @@ -86,8 +93,7 @@ def check_setitem_lengths(indexer, value, values) -> None: Raises ------ ValueError - When the indexer is an ndarray or list and the lengths don't - match. + When the indexer is an ndarray or list and the lengths don't match. """ # boolean with truth values == len of the value is ok too if isinstance(indexer, (np.ndarray, list)): @@ -122,7 +128,7 @@ def validate_indices(indices: np.ndarray, n: int) -> None: ---------- indices : ndarray n : int - length of the array being indexed + Length of the array being indexed. 
Raises ------ @@ -144,9 +150,7 @@ def validate_indices(indices: np.ndarray, n: int) -> None: if len(indices): min_idx = indices.min() if min_idx < -1: - msg = "'indices' contains values less than allowed ({} < {})".format( - min_idx, -1 - ) + msg = f"'indices' contains values less than allowed ({min_idx} < -1)" raise ValueError(msg) max_idx = indices.max() @@ -168,27 +172,27 @@ def maybe_convert_indices(indices, n: int): Parameters ---------- indices : array-like - The array of indices that we are to convert. + Array of indices that we are to convert. n : int - The number of elements in the array that we are indexing. + Number of elements in the array that we are indexing. Returns ------- - valid_indices : array-like + array-like An array-like of positive indices that correspond to the ones that were passed in initially to this function. Raises ------ - IndexError : one of the converted indices either exceeded the number - of elements (specified by `n`) OR was still negative. + IndexError + One of the converted indices either exceeded the number of, + elements (specified by `n`), or was still negative. """ - if isinstance(indices, list): indices = np.array(indices) if len(indices) == 0: - # If list is empty, np.array will return float and cause indexing - # errors. + # If `indices` is empty, np.array will return a float, + # and will cause indexing errors. return np.empty(0, dtype=np.intp) mask = indices < 0 @@ -208,7 +212,11 @@ def maybe_convert_indices(indices, n: int): def length_of_indexer(indexer, target=None) -> int: """ - return the length of a single non-tuple indexer which could be a slice + Return the length of a single non-tuple indexer which could be a slice. + + Returns + ------- + int """ if target is not None and isinstance(indexer, slice): target_len = len(target) @@ -234,3 +242,68 @@ def length_of_indexer(indexer, target=None) -> int: elif not is_list_like_indexer(indexer): return 1 raise AssertionError("cannot find the length of the indexer") + + +def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: + """ + Check if `mask` is a valid boolean indexer for `array`. + + `array` and `mask` are checked to have the same length, and the + dtype is validated. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + array : array + The array that's being masked. + mask : array + The boolean array that's masking. + + Returns + ------- + numpy.ndarray + The validated boolean mask. + + Raises + ------ + IndexError + When the lengths don't match. + ValueError + When `mask` cannot be converted to a bool-dtype ndarray. + + See Also + -------- + api.types.is_bool_dtype : Check if `key` is of boolean dtype. + + Examples + -------- + A boolean ndarray is returned when the arguments are all valid. + + >>> mask = pd.array([True, False]) + >>> arr = pd.array([1, 2]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + array([ True, False]) + + An IndexError is raised when the lengths don't match. + + >>> mask = pd.array([True, False, True]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + IndexError: Item wrong length 3 instead of 2. + + A ValueError is raised when the mask cannot be converted to + a bool-dtype ndarray. + + >>> mask = pd.array([True, pd.NA]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... 
+ ValueError: cannot convert to bool numpy array in presence of missing values + """ + result = np.asarray(mask, dtype=bool) + # GH26658 + if len(result) != len(array): + raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") + return result diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index cc8ecc0e64684..db774a03c02f8 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -16,7 +16,6 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.accessor import PandasDelegate, delegate_names -from pandas.core.algorithms import take_1d from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.base import NoNewAttributesMixin, PandasObject from pandas.core.indexes.datetimes import DatetimeIndex @@ -27,8 +26,7 @@ class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): def __init__(self, data, orig): if not isinstance(data, ABCSeries): raise TypeError( - "cannot convert an object of type {0} to a " - "datetimelike index".format(type(data)) + f"cannot convert an object of type {type(data)} to a datetimelike index" ) self._parent = data @@ -55,8 +53,7 @@ def _get_values(self): return DatetimeIndex(data, copy=False, name=self.name) raise TypeError( - "cannot convert an object of type {0} to a " - "datetimelike index".format(type(data)) + f"cannot convert an object of type {type(data)} to a datetimelike index" ) def _delegate_property_get(self, name): @@ -75,9 +72,7 @@ def _delegate_property_get(self, name): result = np.asarray(result) - # blow up if we operate on categories if self.orig is not None: - result = take_1d(result, self.orig.cat.codes) index = self.orig.index else: index = self._parent.index @@ -95,9 +90,8 @@ def _delegate_property_get(self, name): def _delegate_property_set(self, name, value, *args, **kwargs): raise ValueError( - "modifications to a property of a datetimelike " - "object are not supported. Change values on the " - "original." + "modifications to a property of a datetimelike object are not supported. " + "Change values on the original." ) def _delegate_method(self, name, *args, **kwargs): @@ -226,7 +220,7 @@ def to_pytimedelta(self): Returns ------- - a : numpy.ndarray + numpy.ndarray Array of 1D containing data with `datetime.timedelta` type. 
See Also @@ -318,13 +312,17 @@ def __new__(cls, data): if not isinstance(data, ABCSeries): raise TypeError( - "cannot convert an object of type {0} to a " - "datetimelike index".format(type(data)) + f"cannot convert an object of type {type(data)} to a datetimelike index" ) orig = data if is_categorical_dtype(data) else None if orig is not None: - data = Series(orig.values.categories, name=orig.name, copy=False) + data = Series( + orig.array, + name=orig.name, + copy=False, + dtype=orig.values.categories.dtype, + ) if is_datetime64_dtype(data.dtype): return DatetimeProperties(data, orig) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 86d55ce2e7cc3..4072d06b9427c 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,28 +1,28 @@ import textwrap -import warnings +from typing import List, Set from pandas._libs import NaT, lib import pandas.core.common as com from pandas.core.indexes.base import ( Index, + InvalidIndexError, _new_Index, ensure_index, ensure_index_from_sequences, ) -from pandas.core.indexes.base import InvalidIndexError # noqa:F401 -from pandas.core.indexes.category import CategoricalIndex # noqa:F401 +from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.interval import IntervalIndex # noqa:F401 -from pandas.core.indexes.multi import MultiIndex # noqa:F401 -from pandas.core.indexes.numeric import ( # noqa:F401 +from pandas.core.indexes.interval import IntervalIndex +from pandas.core.indexes.multi import MultiIndex +from pandas.core.indexes.numeric import ( Float64Index, Int64Index, NumericIndex, UInt64Index, ) from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.range import RangeIndex # noqa:F401 +from pandas.core.indexes.range import RangeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex _sort_msg = textwrap.dedent( @@ -37,8 +37,6 @@ ) -# TODO: there are many places that rely on these private methods existing in -# pandas.core.index __all__ = [ "Index", "MultiIndex", @@ -57,15 +55,16 @@ "NaT", "ensure_index", "ensure_index_from_sequences", - "_get_combined_index", - "_get_objs_combined_axis", - "_union_indexes", - "_get_consensus_names", - "_all_indexes_same", + "get_objs_combined_axis", + "union_indexes", + "get_consensus_names", + "all_indexes_same", ] -def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): +def get_objs_combined_axis( + objs, intersect: bool = False, axis=0, sort: bool = True +) -> Index: """ Extract combined index: return intersection or union (depending on the value of "intersect") of indexes on given axis, or None if all objects @@ -73,9 +72,8 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): Parameters ---------- - objs : list of objects - Each object will only be considered if it has a _get_axis - attribute. + objs : list + Series or DataFrame objects, may be mix of the two. intersect : bool, default False If True, calculate the intersection between indexes. Otherwise, calculate the union. 
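A minimal usage sketch of the accessors.py change above (illustrative only, not part of the patch; assumes a build containing this branch): the .dt accessor on a categorical Series of datetimes now delegates to the per-row values (orig.array) rather than the de-duplicated categories, so the result aligns with the original rows without a codes-based take_1d.

import pandas as pd

s = pd.Series(pd.to_datetime(["2019-01-01", "2019-01-01", "2019-06-15"]))
cat = s.astype("category")
# One value per row, aligned with cat.index, even though there are only
# two distinct categories.
print(cat.dt.month.tolist())  # expected: [1, 1, 6]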
@@ -88,26 +86,27 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): ------- Index """ - obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, "_get_axis")] - if obs_idxes: - return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) + obs_idxes = [obj._get_axis(axis) for obj in objs] + return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) -def _get_distinct_objs(objs): +def _get_distinct_objs(objs: List[Index]) -> List[Index]: """ Return a list with distinct elements of "objs" (different ids). Preserves order. """ - ids = set() + ids: Set[int] = set() res = [] for obj in objs: - if not id(obj) in ids: + if id(obj) not in ids: ids.add(id(obj)) res.append(obj) return res -def _get_combined_index(indexes, intersect=False, sort=False): +def _get_combined_index( + indexes: List[Index], intersect: bool = False, sort: bool = False +) -> Index: """ Return the union or intersection of indexes. @@ -125,7 +124,6 @@ def _get_combined_index(indexes, intersect=False, sort=False): ------- Index """ - # TODO: handle index names! indexes = _get_distinct_objs(indexes) if len(indexes) == 0: @@ -137,7 +135,7 @@ def _get_combined_index(indexes, intersect=False, sort=False): for other in indexes[1:]: index = index.intersection(other) else: - index = _union_indexes(indexes, sort=sort) + index = union_indexes(indexes, sort=sort) index = ensure_index(index) if sort: @@ -148,7 +146,7 @@ def _get_combined_index(indexes, intersect=False, sort=False): return index -def _union_indexes(indexes, sort=True): +def union_indexes(indexes, sort=True) -> Index: """ Return the union of indexes. @@ -174,7 +172,7 @@ def _union_indexes(indexes, sort=True): indexes, kind = _sanitize_and_check(indexes) - def _unique_indices(inds): + def _unique_indices(inds) -> Index: """ Convert indexes to lists and concatenate them, removing duplicates. @@ -200,6 +198,7 @@ def conv(i): result = indexes[0] if hasattr(result, "union_many"): + # DatetimeIndex return result.union_many(indexes[1:]) else: for other in indexes[1:]: @@ -209,15 +208,9 @@ def conv(i): index = indexes[0] for other in indexes[1:]: if not index.equals(other): - - if sort is None: - # TODO: remove once pd.concat sort default changes - warnings.warn(_sort_msg, FutureWarning, stacklevel=8) - sort = True - return _unique_indices(indexes) - name = _get_consensus_names(indexes)[0] + name = get_consensus_names(indexes)[0] if name != index.name: index = index._shallow_copy(name=name) return index @@ -264,7 +257,7 @@ def _sanitize_and_check(indexes): return indexes, "array" -def _get_consensus_names(indexes): +def get_consensus_names(indexes): """ Give a consensus 'names' to indexes. @@ -280,7 +273,6 @@ def _get_consensus_names(indexes): list A list representing the consensus 'names' found. """ - # find the non-none names, need to tupleify to make # the set hashable, then reverse on return consensus_names = {tuple(i.names) for i in indexes if com.any_not_none(*i.names)} @@ -289,7 +281,7 @@ def _get_consensus_names(indexes): return [None] * indexes[0].nlevels -def _all_indexes_same(indexes): +def all_indexes_same(indexes): """ Determine if all indexes contain the same elements. 
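A usage sketch for the index-combining helpers made public in pandas/core/indexes/api.py above (illustrative only, not part of the patch; these remain internal API and may change):

import pandas as pd
from pandas.core.indexes.api import (
    all_indexes_same,
    get_objs_combined_axis,
    union_indexes,
)

s1 = pd.Series([1, 2], index=["a", "b"])
s2 = pd.Series([3, 4], index=["b", "c"])

# Union (default) or intersection of the row axes of the passed objects.
print(get_objs_combined_axis([s1, s2], intersect=False, axis=0))
# Index(['a', 'b', 'c'], dtype='object')
print(get_objs_combined_axis([s1, s2], intersect=True, axis=0))
# Index(['b'], dtype='object')

print(union_indexes([s1.index, s2.index]))      # same union, from bare Index objects
print(all_indexes_same([s1.index, s2.index]))   # False: elements differ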
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 187c7e2f3a7f7..ca929b188dc33 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import FrozenSet, Union +from typing import Dict, FrozenSet, Hashable, Optional, Union import warnings import numpy as np @@ -45,7 +45,6 @@ is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, - pandas_dtype, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( @@ -54,6 +53,7 @@ ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, + ABCIntervalIndex, ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, @@ -68,12 +68,12 @@ from pandas.core.arrays import ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op -import pandas.core.sorting as sorting from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -107,6 +107,11 @@ def cmp_method(self, other): if is_object_dtype(self) and isinstance(other, ABCCategorical): left = type(other)(self._values, dtype=other.dtype) return op(left, other) + elif is_object_dtype(self) and isinstance(other, ExtensionArray): + # e.g. PeriodArray + with np.errstate(all="ignore"): + result = op(self.values, other) + elif is_object_dtype(self) and not isinstance(self, ABCMultiIndex): # don't pass MultiIndex with np.errstate(all="ignore"): @@ -120,7 +125,7 @@ def cmp_method(self, other): return result return ops.invalid_comparison(self, other, op) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) @@ -136,7 +141,7 @@ def index_arithmetic_method(self, other): return (Index(result[0]), Index(result[1])) return Index(result) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" # TODO: docstring? return set_function_name(index_arithmetic_method, name, cls) @@ -160,6 +165,12 @@ def _new_Index(cls, d): from pandas.core.indexes.period import _new_PeriodIndex return _new_PeriodIndex(cls, **d) + + if issubclass(cls, ABCMultiIndex): + if "labels" in d and "codes" not in d: + # GH#23752 "labels" kwarg has been replaced with "codes" + d["codes"] = d.pop("labels") + return cls.__new__(cls, **d) @@ -176,11 +187,11 @@ class Index(IndexOpsMixin, PandasObject): If an actual dtype is provided, we coerce to that dtype if it's safe. Otherwise, an error will be raised. copy : bool - Make a copy of input ndarray + Make a copy of input ndarray. name : object - Name to be stored in the index + Name to be stored in the index. tupleize_cols : bool (default: True) - When True, attempt to create a MultiIndex if possible + When True, attempt to create a MultiIndex if possible. 
See Also -------- @@ -205,11 +216,11 @@ class Index(IndexOpsMixin, PandasObject): """ # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = ( + _deprecations: FrozenSet[str] = ( PandasObject._deprecations | IndexOpsMixin._deprecations - | frozenset(["asobject", "contains", "dtype_str", "get_values", "set_value"]) - ) # type: FrozenSet[str] + | frozenset(["contains", "set_value"]) + ) # To hand over control to subclasses _join_precedence = 1 @@ -231,9 +242,13 @@ def _outer_indexer(self, left, right): return libjoin.outer_join_indexer(left, right) _typ = "index" - _data = None + _data: Union[ExtensionArray, np.ndarray] _id = None - name = None + _name: Optional[Hashable] = None + # MultiIndex.levels previously allowed setting the index name. We + # don't allow this anymore, and raise if it happens rather than + # failing silently. + _no_setting_name: bool = False _comparables = ["name"] _attributes = ["name"] _is_numeric_dtype = False @@ -259,34 +274,16 @@ def _outer_indexer(self, left, right): # Constructors def __new__( - cls, - data=None, - dtype=None, - copy=False, - name=None, - fastpath=None, - tupleize_cols=True, - **kwargs + cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs, ) -> "Index": - from .range import RangeIndex + from pandas.core.indexes.range import RangeIndex from pandas import PeriodIndex, DatetimeIndex, TimedeltaIndex - from .numeric import Float64Index, Int64Index, UInt64Index - from .interval import IntervalIndex - from .category import CategoricalIndex - - if name is None and hasattr(data, "name"): - name = data.name - - if fastpath is not None: - warnings.warn( - "The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, - stacklevel=2, - ) - if fastpath: - return cls._simple_new(data, name) + from pandas.core.indexes.numeric import Float64Index, Int64Index, UInt64Index + from pandas.core.indexes.interval import IntervalIndex + from pandas.core.indexes.category import CategoricalIndex + + name = maybe_extract_name(name, data, cls) if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. @@ -303,11 +300,15 @@ def __new__( return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval - elif ( - is_interval_dtype(data) or is_interval_dtype(dtype) - ) and not is_object_dtype(dtype): - closed = kwargs.get("closed", None) - return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) + elif is_interval_dtype(data) or is_interval_dtype(dtype): + closed = kwargs.pop("closed", None) + if is_dtype_equal(_o_dtype, dtype): + return IntervalIndex( + data, name=name, copy=copy, closed=closed, **kwargs + ).astype(object) + return IntervalIndex( + data, dtype=dtype, name=name, copy=copy, closed=closed, **kwargs + ) elif ( is_datetime64_any_dtype(data) @@ -321,10 +322,9 @@ def __new__( # the DatetimeIndex construction. 
# Note we can pass copy=False because the .astype below # will always make a copy - result = DatetimeIndex( - data, copy=False, name=name, **kwargs - ) # type: "Index" - return result.astype(object) + return DatetimeIndex(data, copy=False, name=name, **kwargs).astype( + object + ) else: return DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) @@ -332,21 +332,25 @@ def __new__( if is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy - result = TimedeltaIndex(data, copy=False, name=name, **kwargs) - return result.astype(object) + return TimedeltaIndex(data, copy=False, name=name, **kwargs).astype( + object + ) else: return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) - elif is_period_dtype(data) and not is_object_dtype(dtype): - return PeriodIndex(data, copy=copy, name=name, **kwargs) + elif is_period_dtype(data) or is_period_dtype(dtype): + if is_dtype_equal(_o_dtype, dtype): + return PeriodIndex(data, copy=False, name=name, **kwargs).astype(object) + return PeriodIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): - data = np.asarray(data) if not (dtype is None or is_object_dtype(dtype)): # coerce to the provided dtype ea_cls = dtype.construct_array_type() data = ea_cls._from_sequence(data, dtype=dtype, copy=False) + else: + data = np.asarray(data, dtype=object) # coerce to the object dtype data = data.astype(object) @@ -360,41 +364,8 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced # GH 11836 - if is_integer_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "integer": - data = maybe_cast_to_integer_array(data, dtype, copy=copy) - elif inferred in ["floating", "mixed-integer-float"]: - if isna(data).any(): - raise ValueError("cannot convert float NaN to integer") - - if inferred == "mixed-integer-float": - data = maybe_cast_to_integer_array(data, dtype) - - # If we are actually all equal to integers, - # then coerce to integer. - try: - return cls._try_convert_to_int_index( - data, copy, name, dtype - ) - except ValueError: - pass - - # Return an actual float index. - return Float64Index(data, copy=copy, dtype=dtype, name=name) - - elif inferred == "string": - pass - else: - data = data.astype(dtype) - elif is_float_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "string": - pass - else: - data = data.astype(dtype) - else: - data = np.array(data, dtype=dtype, copy=copy) + data = _maybe_cast_with_dtype(data, dtype, copy) + dtype = data.dtype # TODO: maybe not for object? 
# maybe coerce to a sub-class if is_signed_integer_dtype(data.dtype): @@ -414,44 +385,18 @@ def __new__( subarr = subarr.copy() if dtype is None: - inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == "integer": - try: - return cls._try_convert_to_int_index(subarr, copy, name, dtype) - except ValueError: - pass - - return Index(subarr, copy=copy, dtype=object, name=name) - elif inferred in ["floating", "mixed-integer-float", "integer-na"]: - # TODO: Returns IntegerArray for integer-na case in the future - return Float64Index(subarr, copy=copy, name=name) - elif inferred == "interval": - try: - return IntervalIndex(subarr, name=name, copy=copy) - except ValueError: - # GH27172: mixed closed Intervals --> object dtype - pass - elif inferred == "boolean": - # don't support boolean explicitly ATM - pass - elif inferred != "string": - if inferred.startswith("datetime"): - try: - return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - except (ValueError, OutOfBoundsDatetime): - # GH 27011 - # If we have mixed timezones, just send it - # down the base constructor - pass - - elif inferred.startswith("timedelta"): - return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) - elif inferred == "period": - try: - return PeriodIndex(subarr, name=name, **kwargs) - except IncompatibleFrequency: - pass - return cls._simple_new(subarr, name) + new_data, new_dtype = _maybe_cast_data_without_dtype(subarr) + if new_dtype is not None: + return cls( + new_data, dtype=new_dtype, copy=False, name=name, **kwargs + ) + + if kwargs: + raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}") + if subarr.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") + return cls._simple_new(subarr, name, **kwargs) elif hasattr(data, "__array__"): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) @@ -466,7 +411,7 @@ def __new__( if data and all(isinstance(e, tuple) for e in data): # we must be all tuples, otherwise don't construct # 10697 - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex return MultiIndex.from_tuples( data, names=name or kwargs.get("names") @@ -507,7 +452,7 @@ def asi8(self): return None @classmethod - def _simple_new(cls, values, name=None, dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None): """ We require that we have a dtype compat for the values. If we are passed a non-dtype compat, then coerce using the constructor. @@ -528,9 +473,8 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): # data buffers and strides. We don't re-use `_ndarray_values`, since # we actually set this value too. result._index_data = values - result.name = name - for k, v in kwargs.items(): - setattr(result, k, v) + result._name = name + return result._reset_identity() @cache_readonly @@ -608,7 +552,7 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") - def is_(self, other): + def is_(self, other) -> bool: """ More flexible, faster check like ``is`` but that works through views. @@ -650,13 +594,13 @@ def _engine(self): # Array-Like Methods # ndarray compat - def __len__(self): + def __len__(self) -> int: """ Return the length of the Index. """ return len(self._data) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ The array interface, return my values. 
""" @@ -667,7 +611,7 @@ def __array_wrap__(self, result, context=None): Gets called after a ufunc. """ result = lib.item_from_zerodim(result) - if is_bool_dtype(result) or lib.is_scalar(result): + if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1: return result attrs = self._get_attributes_dict() @@ -680,21 +624,6 @@ def dtype(self): """ return self._data.dtype - @property - def dtype_str(self): - """ - Return the dtype str of the underlying data. - - .. deprecated:: 0.25.0 - """ - warnings.warn( - "`dtype_str` has been deprecated. Call `str` on the " - "dtype attribute instead.", - FutureWarning, - stacklevel=2, - ) - return str(self.dtype) - def ravel(self, order="C"): """ Return an ndarray of the flattened values of the underlying data. @@ -753,33 +682,18 @@ def astype(self, dtype, copy=True): return self.copy() if copy else self elif is_categorical_dtype(dtype): - from .category import CategoricalIndex + from pandas.core.indexes.category import CategoricalIndex return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) - elif is_datetime64tz_dtype(dtype): - # TODO(GH-24559): Remove this block, use the following elif. - # avoid FutureWarning from DatetimeIndex constructor. - from pandas import DatetimeIndex - - tz = pandas_dtype(dtype).tz - return DatetimeIndex(np.asarray(self)).tz_localize("UTC").tz_convert(tz) elif is_extension_array_dtype(dtype): return Index(np.asarray(self), dtype=dtype, copy=copy) try: - if is_datetime64tz_dtype(dtype): - from pandas import DatetimeIndex - - return DatetimeIndex( - self.values, name=self.name, dtype=dtype, copy=copy - ) - return Index( - self.values.astype(dtype, copy=copy), name=self.name, dtype=dtype - ) + casted = self.values.astype(dtype, copy=copy) except (TypeError, ValueError): - msg = "Cannot cast {name} to dtype {dtype}" - raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + raise TypeError(f"Cannot cast {type(self).__name__} to dtype {dtype}") + return Index(casted, name=self.name, dtype=dtype) _index_shared_docs[ "take" @@ -791,13 +705,13 @@ def astype(self, dtype, copy=True): Parameters ---------- indices : list - Indices to be taken + Indices to be taken. axis : int, optional The axis over which to select values, always 0. allow_fill : bool, default True fill_value : bool, default None If allow_fill=True and fill_value is not None, indices specified by - -1 is regarded as NA. If Index doesn't hold NA, raise ValueError + -1 is regarded as NA. If Index doesn't hold NA, raise ValueError. Returns ------- @@ -824,8 +738,10 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): ) else: if allow_fill and fill_value is not None: - msg = "Unable to fill values because {0} cannot contain NA" - raise ValueError(msg.format(self.__class__.__name__)) + cls_name = type(self).__name__ + raise ValueError( + f"Unable to fill values because {cls_name} cannot contain NA" + ) taken = self.values.take(indices) return self._shallow_copy(taken) @@ -840,11 +756,10 @@ def _assert_take_fillable( # only fill if we are passing a non-None fill_value if allow_fill and fill_value is not None: if (indices < -1).any(): - msg = ( + raise ValueError( "When allow_fill=True and fill_value is not None, " "all indices must be >= -1" ) - raise ValueError(msg) taken = algos.take( values, indices, allow_fill=allow_fill, fill_value=na_value ) @@ -947,8 +862,6 @@ def __deepcopy__(self, memo=None): memo, default None Standard signature. 
Unused """ - if memo is None: - memo = {} return self.copy(deep=True) # -------------------------------------------------------------------- @@ -958,18 +871,18 @@ def __repr__(self): """ Return a string representation for this object. """ - klass = self.__class__.__name__ + klass_name = type(self).__name__ data = self._format_data() attrs = self._format_attrs() space = self._format_space() - - prepr = (",%s" % space).join("%s=%s" % (k, v) for k, v in attrs) + attrs_str = [f"{k}={v}" for k, v in attrs] + prepr = f",{space}".join(attrs_str) # no data provided, just attributes if data is None: data = "" - res = "%s(%s%s)" % (klass, data, prepr) + res = f"{klass_name}({data}{prepr})" return res @@ -1077,7 +990,7 @@ def to_native_types(self, slicer=None, **kwargs): 2) quoting : bool or None Whether or not there are quoted values in `self` 3) date_format : str - The format used to represent date-like values + The format used to represent date-like values. Returns ------- @@ -1123,26 +1036,13 @@ def _summary(self, name=None): tail = self[-1] if hasattr(tail, "format") and not isinstance(tail, str): tail = tail.format() - index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail)) + index_summary = f", {head} to {tail}" else: index_summary = "" if name is None: name = type(self).__name__ - return "%s: %s entries%s" % (name, len(self), index_summary) - - def summary(self, name=None): - """ - Return a summarized representation. - - .. deprecated:: 0.23.0 - """ - warnings.warn( - "'summary' is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - return self._summary(name) + return f"{name}: {len(self)} entries{index_summary}" # -------------------------------------------------------------------- # Conversion Methods @@ -1261,6 +1161,21 @@ def to_frame(self, index=True, name=None): # -------------------------------------------------------------------- # Name-Centric Methods + @property + def name(self): + return self._name + + @name.setter + def name(self, value): + if self._no_setting_name: + # Used in MultiIndex.levels to avoid silently ignoring name updates. + raise RuntimeError( + "Cannot set name on a level of a MultiIndex. Use " + "'MultiIndex.set_names' instead." + ) + maybe_extract_name(value, None, type(self)) + self._name = value + def _validate_names(self, name=None, names=None, deep=False): """ Handles the quirks of having a singular 'name' parameter for general @@ -1303,16 +1218,14 @@ def _set_names(self, values, level=None): if not is_list_like(values): raise ValueError("Names must be a list-like") if len(values) != 1: - raise ValueError("Length of new names must be 1, got %d" % len(values)) + raise ValueError(f"Length of new names must be 1, got {len(values)}") # GH 20527 # All items in 'name' need to be hashable: for name in values: if not is_hashable(name): - raise TypeError( - "{}.name must be a hashable type".format(self.__class__.__name__) - ) - self.name = values[0] + raise TypeError(f"{type(self).__name__}.name must be a hashable type") + self._name = values[0] names = property(fset=_set_names, fget=_get_names) @@ -1377,8 +1290,7 @@ def set_names(self, names, level=None, inplace=False): raise ValueError("Level must be None for non-MultiIndex") if level is not None and not is_list_like(level) and is_list_like(names): - msg = "Names must be a string when a single level is provided." 
- raise TypeError(msg) + raise TypeError("Names must be a string when a single level is provided.") if not is_list_like(names) and level is None and self.nlevels > 1: raise TypeError("Must pass list-like as `names`.") @@ -1451,7 +1363,7 @@ def rename(self, name, inplace=False): # Level-Centric Methods @property - def nlevels(self): + def nlevels(self) -> int: """ Number of levels. """ @@ -1474,18 +1386,16 @@ def _validate_index_level(self, level): if isinstance(level, int): if level < 0 and level != -1: raise IndexError( - "Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level,) + "Too many levels: Index has only 1 level, " + f"{level} is not a valid level number" ) elif level > 0: raise IndexError( - "Too many levels: Index has only 1 level, not %d" % (level + 1) + f"Too many levels: Index has only 1 level, not {level + 1}" ) elif level != self.name: raise KeyError( - "Requested level ({}) does not match index name ({})".format( - level, self.name - ) + f"Requested level ({level}) does not match index name ({self.name})" ) def _get_level_number(self, level): @@ -1581,9 +1491,8 @@ def droplevel(self, level=0): return self if len(level) >= self.nlevels: raise ValueError( - "Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels) + f"Cannot remove {len(level)} levels from an index with {self.nlevels} " + "levels: at least one level must be left." ) # The two checks above guarantee that here self is a MultiIndex @@ -1604,10 +1513,10 @@ def droplevel(self, level=0): if mask.any(): result = result.putmask(mask, np.nan) - result.name = new_names[0] + result._name = new_names[0] return result else: - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex return MultiIndex( levels=new_levels, @@ -1652,7 +1561,7 @@ def _get_grouper_for_level(self, mapper, level=None): # Introspection Methods @property - def is_monotonic(self): + def is_monotonic(self) -> bool: """ Alias for is_monotonic_increasing. """ @@ -1676,7 +1585,7 @@ def is_monotonic_increasing(self): return self._engine.is_monotonic_increasing @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ Return if the index is monotonic decreasing (only equal or decreasing) values. @@ -1693,7 +1602,7 @@ def is_monotonic_decreasing(self): return self._engine.is_monotonic_decreasing @property - def _is_strictly_monotonic_increasing(self): + def _is_strictly_monotonic_increasing(self) -> bool: """ Return if the index is strictly monotonic increasing (only increasing) values. @@ -1710,7 +1619,7 @@ def _is_strictly_monotonic_increasing(self): return self.is_unique and self.is_monotonic_increasing @property - def _is_strictly_monotonic_decreasing(self): + def _is_strictly_monotonic_decreasing(self) -> bool: """ Return if the index is strictly monotonic decreasing (only decreasing) values. @@ -1726,36 +1635,33 @@ def _is_strictly_monotonic_decreasing(self): """ return self.is_unique and self.is_monotonic_decreasing - def is_lexsorted_for_tuple(self, tup): - return True - @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: """ Return if the index has unique values. 
""" return self._engine.is_unique @property - def has_duplicates(self): + def has_duplicates(self) -> bool: return not self.is_unique - def is_boolean(self): + def is_boolean(self) -> bool: return self.inferred_type in ["boolean"] - def is_integer(self): + def is_integer(self) -> bool: return self.inferred_type in ["integer"] - def is_floating(self): + def is_floating(self) -> bool: return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] - def is_numeric(self): + def is_numeric(self) -> bool: return self.inferred_type in ["integer", "floating"] - def is_object(self): + def is_object(self) -> bool: return is_object_dtype(self.dtype) - def is_categorical(self): + def is_categorical(self) -> bool: """ Check if the Index holds categorical data. @@ -1791,10 +1697,10 @@ def is_categorical(self): """ return self.inferred_type in ["categorical"] - def is_interval(self): + def is_interval(self) -> bool: return self.inferred_type in ["interval"] - def is_mixed(self): + def is_mixed(self) -> bool: return self.inferred_type in ["mixed"] def holds_integer(self): @@ -1811,7 +1717,7 @@ def inferred_type(self): return lib.infer_dtype(self, skipna=False) @cache_readonly - def is_all_dates(self): + def is_all_dates(self) -> bool: return is_datetime_array(ensure_object(self.values)) # -------------------------------------------------------------------- @@ -1820,36 +1726,7 @@ def is_all_dates(self): def __reduce__(self): d = dict(data=self._data) d.update(self._get_attributes_dict()) - return _new_Index, (self.__class__, d), None - - def __setstate__(self, state): - """ - Necessary for making this object picklable. - """ - - if isinstance(state, dict): - self._data = state.pop("data") - for k, v in state.items(): - setattr(self, k, v) - - elif isinstance(state, tuple): - - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - self.name = own_state[0] - - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ + return _new_Index, (type(self), d), None # -------------------------------------------------------------------- # Null Handling Methods @@ -1873,8 +1750,7 @@ def _isnan(self): @cache_readonly def _nan_idxs(self): if self._can_hold_na: - w, = self._isnan.nonzero() - return w + return self._isnan.nonzero()[0] else: return np.array([], dtype=np.int64) @@ -2004,7 +1880,7 @@ def notna(self): downcast : dict, default is None a dict of item->dtype of what to downcast if possible, or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible) + equal type (e.g. float64 to int64 if possible). 
Returns ------- @@ -2041,10 +1917,10 @@ def fillna(self, value=None, downcast=None): @Appender(_index_shared_docs["dropna"]) def dropna(self, how="any"): if how not in ("any", "all"): - raise ValueError("invalid how option: {0}".format(how)) + raise ValueError(f"invalid how option: {how}") if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy(self._values[~self._isnan]) return self._shallow_copy() # -------------------------------------------------------------------- @@ -2059,7 +1935,7 @@ def dropna(self, how="any"): Parameters ---------- level : int or str, optional, default None - Only return values from specified level (for MultiIndex) + Only return values from specified level (for MultiIndex). .. versionadded:: 0.23.0 @@ -2184,68 +2060,6 @@ def duplicated(self, keep="first"): """ return super().duplicated(keep=keep) - def get_duplicates(self): - """ - Extract duplicated index elements. - - .. deprecated:: 0.23.0 - Use idx[idx.duplicated()].unique() instead - - Returns a sorted list of index elements which appear more than once in - the index. - - Returns - ------- - array-like - List of duplicated indexes. - - See Also - -------- - Index.duplicated : Return boolean array denoting duplicates. - Index.drop_duplicates : Return Index with duplicates removed. - - Examples - -------- - - Works on different Index of types. - - >>> pd.Index([1, 2, 2, 3, 3, 3, 4]).get_duplicates() # doctest: +SKIP - [2, 3] - - Note that for a DatetimeIndex, it does not return a list but a new - DatetimeIndex: - - >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03', - ... '2018-01-03', '2018-01-04', '2018-01-04'], - ... format='%Y-%m-%d') - >>> pd.Index(dates).get_duplicates() # doctest: +SKIP - DatetimeIndex(['2018-01-03', '2018-01-04'], - dtype='datetime64[ns]', freq=None) - - Sorts duplicated elements even when indexes are unordered. - - >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates() # doctest: +SKIP - [2, 3] - - Return empty array-like structure when all elements are unique. - - >>> pd.Index([1, 2, 3, 4]).get_duplicates() # doctest: +SKIP - [] - >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03'], - ... format='%Y-%m-%d') - >>> pd.Index(dates).get_duplicates() # doctest: +SKIP - DatetimeIndex([], dtype='datetime64[ns]', freq=None) - """ - warnings.warn( - "'get_duplicates' is deprecated and will be removed in " - "a future release. You can use " - "idx[idx.duplicated()].unique() instead", - FutureWarning, - stacklevel=2, - ) - - return self[self.duplicated()].unique() - def _get_unique_index(self, dropna=False): """ Returns an index containing unique values. @@ -2315,10 +2129,8 @@ def __xor__(self, other): def __nonzero__(self): raise ValueError( - "The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format( - self.__class__.__name__ - ) + f"The truth value of a {type(self).__name__} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." ) __bool__ = __nonzero__ @@ -2381,7 +2193,7 @@ def _validate_sort_keyword(self, sort): if sort not in [None, False]: raise ValueError( "The 'sort' keyword only takes the values of " - "None or False; {0} was passed.".format(sort) + f"None or False; {sort} was passed." 
) def union(self, other, sort=None): @@ -2471,11 +2283,11 @@ def _union(self, other, sort): return other._get_reconciled_name_object(self) # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self) or is_datetime64tz_dtype(self): + if is_datetime64tz_dtype(self): lvals = self._ndarray_values else: lvals = self._values - if is_period_dtype(other) or is_datetime64tz_dtype(other): + if is_datetime64tz_dtype(other): rvals = other._ndarray_values else: rvals = other._values @@ -2507,11 +2319,10 @@ def _union(self, other, sort): if sort is None: try: - result = sorting.safe_sort(result) - except TypeError as e: + result = algos.safe_sort(result) + except TypeError as err: warnings.warn( - "{}, sort order is undefined for " - "incomparable objects".format(e), + f"{err}, sort order is undefined for incomparable objects", RuntimeWarning, stacklevel=3, ) @@ -2575,14 +2386,8 @@ def intersection(self, other, sort=False): return this.intersection(other, sort=sort) # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other): - rvals = other._ndarray_values - else: - rvals = other._values + lvals = self._values + rvals = other._values if self.is_monotonic and other.is_monotonic: try: @@ -2601,18 +2406,13 @@ def intersection(self, other, sort=False): indexer = indexer[indexer != -1] taken = other.take(indexer) + res_name = get_op_result_name(self, other) if sort is None: - taken = sorting.safe_sort(taken.values) - if self.name != other.name: - name = None - else: - name = self.name - return self._shallow_copy(taken, name=name) - - if self.name != other.name: - taken.name = None + taken = algos.safe_sort(taken.values) + return self._shallow_copy(taken, name=res_name) + taken.name = res_name return taken def difference(self, other, sort=None): @@ -2673,11 +2473,11 @@ def difference(self, other, sort=None): the_diff = this.values.take(label_diff) if sort is None: try: - the_diff = sorting.safe_sort(the_diff) + the_diff = algos.safe_sort(the_diff) except TypeError: pass - return this._shallow_copy(the_diff, name=result_name, freq=None) + return this._shallow_copy(the_diff, name=result_name) def symmetric_difference(self, other, result_name=None, sort=None): """ @@ -2741,16 +2541,16 @@ def symmetric_difference(self, other, result_name=None, sort=None): left_indexer = np.setdiff1d( np.arange(this.size), common_indexer, assume_unique=True ) - left_diff = this.values.take(left_indexer) + left_diff = this._values.take(left_indexer) # {other} minus {this} right_indexer = (indexer == -1).nonzero()[0] - right_diff = other.values.take(right_indexer) + right_diff = other._values.take(right_indexer) the_diff = concat_compat([left_diff, right_diff]) if sort is None: try: - the_diff = sorting.safe_sort(the_diff) + the_diff = algos.safe_sort(the_diff) except TypeError: pass @@ -2966,8 +2766,8 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None): """ if limit is not None: raise ValueError( - "limit argument for %r method only well-defined " - "if index and target are monotonic" % method + f"limit argument for {repr(method)} method only well-defined " + "if index and target are monotonic" ) side = "left" if method == "pad" else "right" @@ -3063,11 +2863,11 @@ def _convert_scalar_indexer(self, key, kind=None): "unicode", "mixed", ]: - return self._invalid_indexer("label", key) + self._invalid_indexer("label", key) elif kind in ["loc"] and is_integer(key): if not self.holds_integer(): - 
return self._invalid_indexer("label", key) + self._invalid_indexer("label", key) return key @@ -3106,7 +2906,9 @@ def is_int(v): is_null_slicer = start is None and stop is None is_index_slice = is_int(start) and is_int(stop) - is_positional = is_index_slice and not self.is_integer() + is_positional = is_index_slice and not ( + self.is_integer() or self.is_categorical() + ) if kind == "getitem": """ @@ -3254,10 +3056,8 @@ def _invalid_indexer(self, form, key): Consistent invalid indexer message. """ raise TypeError( - "cannot do {form} indexing on {klass} with these " - "indexers [{key}] of {kind}".format( - form=form, klass=type(self), key=key, kind=type(key) - ) + f"cannot do {form} indexing on {type(self)} with these " + f"indexers [{key}] of {type(key)}" ) # -------------------------------------------------------------------- @@ -3396,7 +3196,7 @@ def _reindex_non_unique(self, target): new_indexer = np.arange(len(self.take(indexer))) new_indexer[~check] = -1 - new_index = self._shallow_copy_with_infer(new_labels, freq=None) + new_index = self._shallow_copy_with_infer(new_labels) return new_index, indexer, new_indexer # -------------------------------------------------------------------- @@ -3416,7 +3216,7 @@ def _reindex_non_unique(self, target): return_indexers : bool, default False sort : bool, default False Sort the join keys lexicographically in the result Index. If False, - the order of the join keys depends on the join type (how keyword) + the order of the join keys depends on the join type (how keyword). Returns ------- @@ -3530,7 +3330,7 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) return join_index def _join_multi(self, other, how, return_indexers=True): - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex from pandas.core.reshape.merge import _restore_dropped_levels_multijoin # figure out join names @@ -3551,8 +3351,13 @@ def _join_multi(self, other, how, return_indexers=True): ldrop_names = list(self_names - overlap) rdrop_names = list(other_names - overlap) - self_jnlevels = self.droplevel(ldrop_names) - other_jnlevels = other.droplevel(rdrop_names) + # if only the order differs + if not len(ldrop_names + rdrop_names): + self_jnlevels = self + other_jnlevels = other.reorder_levels(self.names) + else: + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) # Join left and right # Join on same leveled multi-index frames is supported @@ -3632,7 +3437,7 @@ def _join_level( MultiIndex will not be changed; otherwise, it will tie out with `other`. """ - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex def _get_leaf_sorter(labels): """ @@ -3865,13 +3670,10 @@ def _values(self) -> Union[ExtensionArray, ABCIndexClass, np.ndarray]: """ return self._data - def get_values(self): + def _internal_get_values(self): """ Return `Index` data as an `numpy.ndarray`. - .. deprecated:: 0.25.0 - Use :meth:`Index.to_numpy` or :attr:`Index.array` instead. - Returns ------- numpy.ndarray @@ -3879,7 +3681,7 @@ def get_values(self): See Also -------- - Index.values : The attribute that get_values wraps. + Index.values : The attribute that _internal_get_values wraps. 
Examples -------- @@ -3892,33 +3694,24 @@ def get_values(self): a 1 2 3 b 4 5 6 c 7 8 9 - >>> df.index.get_values() + >>> df.index._internal_get_values() array(['a', 'b', 'c'], dtype=object) Standalone `Index` values: >>> idx = pd.Index(['1', '2', '3']) - >>> idx.get_values() + >>> idx._internal_get_values() array(['1', '2', '3'], dtype=object) `MultiIndex` arrays also have only one dimension: >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], ... names=('number', 'letter')) - >>> midx.get_values() + >>> midx._internal_get_values() array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) - >>> midx.get_values().ndim + >>> midx._internal_get_values().ndim 1 """ - warnings.warn( - "The 'get_values' method is deprecated and will be removed in a " - "future version. Use '.to_numpy()' or '.array' instead.", - FutureWarning, - stacklevel=2, - ) - return self._internal_get_values() - - def _internal_get_values(self): return self.values @Appender(IndexOpsMixin.memory_usage.__doc__) @@ -3970,57 +3763,13 @@ def where(self, cond, other=None): return self._shallow_copy_with_infer(values, dtype=dtype) # construction helpers - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. - - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. - """ - - from .numeric import Int64Index, UInt64Index - - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desired - try: - res = data.astype("i8", copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. - try: - res = data.astype("u8", copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - @classmethod def _scalar_data_error(cls, data): # We return the TypeError so that we can raise it from the constructor # in order to keep mypy happy return TypeError( - "{0}(...) must be called with a collection of some " - "kind, {1} was passed".format(cls.__name__, repr(data)) + f"{cls.__name__}(...) must be called with a collection of some " + f"kind, {repr(data)} was passed" ) @classmethod @@ -4030,30 +3779,6 @@ def _string_data_error(cls, data): "to explicitly cast to a numeric type" ) - @classmethod - def _coerce_to_ndarray(cls, data): - """ - Coerces data to ndarray. - - Converts other iterables to list first and then to array. - Does not touch ndarrays. - - Raises - ------ - TypeError - When the data passed in is a scalar. - """ - - if not isinstance(data, (np.ndarray, Index)): - if data is None or is_scalar(data): - raise cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (ABCSeries, list, tuple)): - data = list(data) - data = np.asarray(data) - return data - def _coerce_scalar_to_index(self, item): """ We need to coerce a scalar to a compat for our index type. @@ -4088,21 +3813,15 @@ def _assert_can_do_op(self, value): Check value is valid for scalar op. 
""" if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) + raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") - @property - def _has_complex_internals(self): - # to disable groupby tricks in MultiIndex - return False - - def _is_memory_usage_qualified(self): + def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. """ return self.is_object() - def is_type_compatible(self, kind): + def is_type_compatible(self, kind) -> bool: """ Whether the index type is compatible with the provided type. """ @@ -4141,35 +3860,15 @@ def is_type_compatible(self, kind): """ @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key) -> bool: hash(key) try: return key in self._engine except (OverflowError, TypeError, ValueError): return False - def contains(self, key): - """ - Return a boolean indicating whether the provided key is in the index. - - .. deprecated:: 0.25.0 - Use ``key in index`` instead of ``index.contains(key)``. - - Returns - ------- - bool - """ - warnings.warn( - "The 'contains' method is deprecated and will be removed in a " - "future version. Use 'key in index' instead of " - "'index.contains(key)'", - FutureWarning, - stacklevel=2, - ) - return key in self - def __hash__(self): - raise TypeError("unhashable type: %r" % type(self).__name__) + raise TypeError(f"unhashable type: {repr(type(self).__name__)}") def __setitem__(self, key, value): raise TypeError("Index does not support mutable operations") @@ -4205,11 +3904,14 @@ def __getitem__(self, key): key = com.values_from_object(key) result = getitem(key) if not is_scalar(result): + if np.ndim(result) > 1: + deprecate_ndim_indexing(result) + return result return promote(result) else: return result - def _can_hold_identifiers_and_holds_name(self, name): + def _can_hold_identifiers_and_holds_name(self, name) -> bool: """ Faster check for ``name in self`` when we know `name` is a Python identifier (e.g. in NDFrame.__getattr__, which hits this to support @@ -4264,7 +3966,13 @@ def _concat_same_dtype(self, to_concat, name): Concatenate to_concat which has the same class. """ # must be overridden in specific classes - klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, ExtensionArray) + klasses = ( + ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex, + ExtensionArray, + ABCIntervalIndex, + ) to_concat = [ x.astype(object) if isinstance(x, klasses) else x for x in to_concat ] @@ -4300,7 +4008,7 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def equals(self, other): + def equals(self, other) -> bool: """ Determine if two Index objects contain the same elements. @@ -4320,11 +4028,17 @@ def equals(self, other): # if other is not object, use other's logic for coercion return other.equals(self) + if isinstance(other, ABCMultiIndex): + # d-level MultiIndex can equal d-tuple Index + if not is_object_dtype(self.dtype): + if self.nlevels != other.nlevels: + return False + return array_equivalent( com.values_from_object(self), com.values_from_object(other) ) - def identical(self, other): + def identical(self, other) -> bool: """ Similar to equals, but check that other comparable attributes are also equal. 
@@ -4566,7 +4280,7 @@ def shift(self, periods=1, freq=None): '2012-03-01'], dtype='datetime64[ns]', freq='MS') """ - raise NotImplementedError("Not supported for type %s" % type(self).__name__) + raise NotImplementedError(f"Not supported for type {type(self).__name__}") def argsort(self, *args, **kwargs): """ @@ -4626,22 +4340,26 @@ def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. DatetimeIndex # Things like `Series._get_value` (via .at) pass the EA directly here. - s = getattr(series, "_values", series) - if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): - # GH 20882, 21257 - # Unify Index and ExtensionArray treatment - # First try to convert the key to a location - # If that fails, raise a KeyError if an integer - # index, otherwise, see if key is an integer, and - # try that - try: - iloc = self.get_loc(key) - return s[iloc] - except KeyError: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): - raise - elif is_integer(key): - return s[key] + s = extract_array(series, extract_numpy=True) + if isinstance(s, ExtensionArray): + if is_scalar(key): + # GH 20882, 21257 + # First try to convert the key to a location + # If that fails, raise a KeyError if an integer + # index, otherwise, see if key is an integer, and + # try that + try: + iloc = self.get_loc(key) + return s[iloc] + except KeyError: + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + raise + elif is_integer(key): + return s[key] + else: + # if key is not a scalar, directly raise an error (the code below + # would convert to numpy arrays and raise later any way) - GH29926 + raise InvalidIndexError(key) s = com.values_from_object(series) k = com.values_from_object(key) @@ -4724,7 +4442,7 @@ def get_indexer_non_unique(self, target): if is_categorical(target): tgt_values = np.asarray(target) - elif self.is_all_dates: + elif self.is_all_dates and target.is_all_dates: # GH 30399 tgt_values = target.asi8 else: tgt_values = target._ndarray_values @@ -4751,16 +4469,15 @@ def get_indexer_for(self, target, **kwargs): def _maybe_promote(self, other): # A hack, but it works - from pandas import DatetimeIndex - if self.inferred_type == "date" and isinstance(other, DatetimeIndex): - return DatetimeIndex(self), other + if self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex): + return type(other)(self), other elif self.inferred_type == "boolean": if not is_object_dtype(self.dtype): return self.astype("object"), other.astype("object") return self, other - def groupby(self, values): + def groupby(self, values) -> Dict[Hashable, np.ndarray]: """ Group the index labels by a given array of values. @@ -4771,7 +4488,7 @@ def groupby(self, values): Returns ------- - groups : dict + dict {group name -> group labels} """ @@ -4807,7 +4524,7 @@ def map(self, mapper, na_action=None): a MultiIndex will be returned. """ - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex new_values = super()._map_values(mapper, na_action=na_action) @@ -4926,9 +4643,9 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): Parameters ---------- start : label, default None - If None, defaults to the beginning + If None, defaults to the beginning. end : label, default None - If None, defaults to the end + If None, defaults to the end. 
step : int, default None kind : str, default None @@ -5074,8 +4791,8 @@ def get_slice_bound(self, label, side, kind): if side not in ("left", "right"): raise ValueError( - "Invalid value for side kwarg," - " must be either 'left' or 'right': %s" % (side,) + f"Invalid value for side kwarg, must be either" + f" 'left' or 'right': {side}" ) original_label = label @@ -5103,8 +4820,8 @@ def get_slice_bound(self, label, side, kind): slc = lib.maybe_indices_to_slice(slc.astype("i8"), len(self)) if isinstance(slc, np.ndarray): raise KeyError( - "Cannot get %s slice bound for non-unique " - "label: %r" % (side, original_label) + f"Cannot get {side} slice bound for non-unique " + f"label: {repr(original_label)}" ) if isinstance(slc, slice): @@ -5125,11 +4842,11 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): Parameters ---------- start : label, default None - If None, defaults to the beginning + If None, defaults to the beginning. end : label, default None - If None, defaults to the end + If None, defaults to the end. step : int, defaults None - If None, defaults to 1 + If None, defaults to 1. kind : {'ix', 'loc', 'getitem'} or None Returns @@ -5262,7 +4979,7 @@ def drop(self, labels, errors="raise"): mask = indexer == -1 if mask.any(): if errors != "ignore": - raise KeyError("{} not found in axis".format(labels[mask])) + raise KeyError(f"{labels[mask]} not found in axis") indexer = indexer[~mask] return self.delete(indexer) @@ -5528,7 +5245,7 @@ def ensure_index_from_sequences(sequences, names=None): -------- ensure_index """ - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex if len(sequences) == 1: if names is not None: @@ -5589,7 +5306,7 @@ def ensure_index(index_like, copy=False): converted, all_arrays = lib.clean_index_list(index_like) if len(converted) > 0 and all_arrays: - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex return MultiIndex.from_arrays(converted) else: @@ -5629,10 +5346,209 @@ def _trim_front(strings): def _validate_join_method(method): if method not in ["left", "right", "inner", "outer"]: - raise ValueError("do not recognize join method %s" % method) + raise ValueError(f"do not recognize join method {method}") def default_index(n): - from pandas.core.index import RangeIndex + from pandas.core.indexes.range import RangeIndex return RangeIndex(0, n, name=None) + + +def maybe_extract_name(name, obj, cls) -> Optional[Hashable]: + """ + If no name is passed, then extract it from data, validating hashability. + """ + if name is None and isinstance(obj, (Index, ABCSeries)): + # Note we don't just check for "name" attribute since that would + # pick up e.g. dtype.name + name = obj.name + + # GH#29069 + if not is_hashable(name): + raise TypeError(f"{cls.__name__}.name must be a hashable type") + + return name + + +def _maybe_cast_with_dtype(data: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + If a dtype is passed, cast to the closest matching dtype that is supported + by Index. + + Parameters + ---------- + data : np.ndarray + dtype : np.dtype + copy : bool + + Returns + ------- + np.ndarray + """ + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. 
'0' and 0.0 + # should not be coerced + # GH 11836 + if is_integer_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: + if isna(data).any(): + raise ValueError("cannot convert float NaN to integer") + + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + + # If we are actually all equal to integers, + # then coerce to integer. + try: + data = _try_convert_to_int_array(data, copy, dtype) + except ValueError: + data = np.array(data, dtype=np.float64, copy=copy) + + elif inferred == "string": + pass + else: + data = data.astype(dtype) + elif is_float_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "string": + pass + else: + data = data.astype(dtype) + else: + data = np.array(data, dtype=dtype, copy=copy) + + return data + + +def _maybe_cast_data_without_dtype(subarr): + """ + If we have an arraylike input but no passed dtype, try to infer + a supported dtype. + + Parameters + ---------- + subarr : np.ndarray, Index, or Series + + Returns + ------- + converted : np.ndarray or ExtensionArray + dtype : np.dtype or ExtensionDtype + """ + # Runtime import needed bc IntervalArray imports Index + from pandas.core.arrays import ( + IntervalArray, + PeriodArray, + DatetimeArray, + TimedeltaArray, + ) + + inferred = lib.infer_dtype(subarr, skipna=False) + + if inferred == "integer": + try: + data = _try_convert_to_int_array(subarr, False, None) + return data, data.dtype + except ValueError: + pass + + return subarr, object + + elif inferred in ["floating", "mixed-integer-float", "integer-na"]: + # TODO: Returns IntegerArray for integer-na case in the future + return subarr, np.float64 + + elif inferred == "interval": + try: + data = IntervalArray._from_sequence(subarr, copy=False) + return data, data.dtype + except ValueError: + # GH27172: mixed closed Intervals --> object dtype + pass + elif inferred == "boolean": + # don't support boolean explicitly ATM + pass + elif inferred != "string": + if inferred.startswith("datetime"): + try: + data = DatetimeArray._from_sequence(subarr, copy=False) + return data, data.dtype + except (ValueError, OutOfBoundsDatetime): + # GH 27011 + # If we have mixed timezones, just send it + # down the base constructor + pass + + elif inferred.startswith("timedelta"): + data = TimedeltaArray._from_sequence(subarr, copy=False) + return data, data.dtype + elif inferred == "period": + try: + data = PeriodArray._from_sequence(subarr) + return data, data.dtype + except IncompatibleFrequency: + pass + + return subarr, subarr.dtype + + +def _try_convert_to_int_array( + data: np.ndarray, copy: bool, dtype: np.dtype +) -> np.ndarray: + """ + Attempt to convert an array of data into an integer array. + + Parameters + ---------- + data : The data to convert. + copy : bool + Whether to copy the data or not. + dtype : np.dtype + + Returns + ------- + int_array : data converted to either an ndarray[int64] or ndarray[uint64] + + Raises + ------ + ValueError if the conversion was not successful. 
+ """ + + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desired + try: + res = data.astype("i8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. + try: + res = data.astype("u8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + raise ValueError + + +def deprecate_ndim_indexing(result): + if np.ndim(result) > 1: + # GH#27125 indexer like idx[:, None] expands dim, but we + # cannot do that and keep an index, so return ndarray + # Deprecation GH#30588 + warnings.warn( + "Support for multi-dimensional indexing (e.g. `index[:, None]`) " + "on an Index is deprecated and will be removed in a future " + "version. Convert to a numpy array before indexing instead.", + DeprecationWarning, + stacklevel=3, + ) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e5a8edb56e413..a247a986fcb55 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,5 +1,4 @@ -import operator -from typing import Any +from typing import Any, List import warnings import numpy as np @@ -8,9 +7,8 @@ from pandas._libs import index as libindex from pandas._libs.hashtable import duplicated_int64 -import pandas.compat as compat -from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas._typing import AnyArrayLike +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_platform_int, @@ -23,14 +21,13 @@ from pandas.core.dtypes.generic import ABCCategorical, ABCSeries from pandas.core.dtypes.missing import isna -from pandas._typing import AnyArrayLike from pandas.core import accessor from pandas.core.algorithms import take_1d from pandas.core.arrays.categorical import Categorical, _recode_for_categories, contains -from pandas.core.base import _shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name +from pandas.core.indexes.extension import ExtensionIndex import pandas.core.missing as missing from pandas.core.ops import get_op_result_name @@ -38,6 +35,12 @@ _index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) +@accessor.delegate_names( + delegate=Categorical, + accessors=["codes", "categories", "ordered"], + typ="property", + overwrite=True, +) @accessor.delegate_names( delegate=Categorical, accessors=[ @@ -51,11 +54,17 @@ "as_unordered", "min", "max", + "is_dtype_equal", + "tolist", + "_internal_get_values", + "_reverse_indexer", + "searchsorted", + "argsort", ], typ="method", overwrite=True, ) -class CategoricalIndex(Index, accessor.PandasDelegate): +class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): """ Index based on an underlying :class:`Categorical`. @@ -122,7 +131,7 @@ class CategoricalIndex(Index, accessor.PandasDelegate): Notes ----- See the `user guide - `_ + `_ for more. 
Examples @@ -148,6 +157,20 @@ class CategoricalIndex(Index, accessor.PandasDelegate): _typ = "categoricalindex" + _raw_inherit = { + "argsort", + "_internal_get_values", + "tolist", + "codes", + "categories", + "ordered", + "_reverse_indexer", + "searchsorted", + } + + codes: np.ndarray + categories: Index + @property def _engine_type(self): # self.codes can have dtype int8, int16, int32 or int64, so we need @@ -165,30 +188,12 @@ def _engine_type(self): # Constructors def __new__( - cls, - data=None, - categories=None, - ordered=None, - dtype=None, - copy=False, - name=None, - fastpath=None, + cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None ): - if fastpath is not None: - warnings.warn( - "The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, - stacklevel=2, - ) - if fastpath: - return cls._simple_new(data, name=name, dtype=dtype) - dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype) - if name is None and hasattr(data, "name"): - name = data.name + name = maybe_extract_name(name, data, cls) if not is_categorical_dtype(data): # don't allow scalars @@ -256,16 +261,15 @@ def _create_categorical(cls, data, dtype=None): return data @classmethod - def _simple_new(cls, values, name=None, dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None): result = object.__new__(cls) values = cls._create_categorical(values, dtype=dtype) result._data = values result.name = name - for k, v in kwargs.items(): - setattr(result, k, v) result._reset_identity() + result._no_setting_name = False return result # -------------------------------------------------------------------- @@ -276,7 +280,7 @@ def _shallow_copy(self, values=None, dtype=None, **kwargs): dtype = self.dtype return super()._shallow_copy(values=values, dtype=dtype, **kwargs) - def _is_dtype_compat(self, other): + def _is_dtype_compat(self, other) -> bool: """ *this is an internal non-public method* @@ -357,7 +361,7 @@ def _format_attrs(self): ] if self.name is not None: attrs.append(("name", ibase.default_pprint(self.name))) - attrs.append(("dtype", "'%s'" % self.dtype.name)) + attrs.append(("dtype", f"'{self.dtype.name}'")) max_seq_items = get_option("display.max_seq_items") or len(self) if len(self) > max_seq_items: attrs.append(("length", len(self))) @@ -366,7 +370,7 @@ def _format_attrs(self): # -------------------------------------------------------------------- @property - def inferred_type(self): + def inferred_type(self) -> str: return "categorical" @property @@ -374,47 +378,21 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def itemsize(self): - # Size of the items in categories, not codes. - return self.values.itemsize - def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) + # We use _shallow_copy rather than the Index implementation + # (which uses _constructor) in order to preserve dtype. 
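# --- Editorial aside (illustrative sketch, not part of this changeset) -------
# The delegate_names/_raw_inherit machinery introduced above forwards
# attributes such as codes/categories/ordered to the underlying Categorical
# and hands the "raw" ones back unboxed (no re-wrapping into a
# CategoricalIndex).  Expected behaviour:
import pandas as pd

ci = pd.CategoricalIndex(["a", "b", "a"])
print(ci.codes)        # expected: array([0, 1, 0], dtype=int8) -- raw ndarray
print(ci.categories)   # expected: Index(['a', 'b'], dtype='object')
print(ci.ordered)      # expected: False
# ------------------------------------------------------------------------------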
return self._shallow_copy(result, name=name) - def _internal_get_values(self): - # override base Index version to get the numpy array representation of - # the underlying Categorical - return self._data._internal_get_values() - - def tolist(self): - return self._data.tolist() - - @property - def codes(self): - return self._data.codes - - @property - def categories(self): - return self._data.categories - - @property - def ordered(self): - return self._data.ordered - - def _reverse_indexer(self): - return self._data._reverse_indexer() - @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key) -> bool: # if key is a NaN, check if any NaN is in self. if is_scalar(key) and isna(key): return self.hasnans return contains(self, key, container=self._engine) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ return np.array(self._data, dtype=dtype) @@ -430,7 +408,7 @@ def astype(self, dtype, copy=True): if dtype == self.dtype: return self.copy() if copy else self - return super().astype(dtype=dtype, copy=copy) + return Index.astype(self, dtype=dtype, copy=copy) @cache_readonly def _isnan(self): @@ -442,9 +420,6 @@ def fillna(self, value, downcast=None): self._assert_can_do_op(value) return CategoricalIndex(self._data.fillna(value), name=self.name) - def argsort(self, *args, **kwargs): - return self.values.argsort(*args, **kwargs) - @cache_readonly def _engine(self): # we are going to look things up with the codes themselves. @@ -453,19 +428,6 @@ def _engine(self): codes = self.codes return self._engine_type(lambda: codes, len(self)) - # introspection - @cache_readonly - def is_unique(self): - return self._engine.is_unique - - @property - def is_monotonic_increasing(self): - return self._engine.is_monotonic_increasing - - @property - def is_monotonic_decreasing(self): - return self._engine.is_monotonic_decreasing - @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) def unique(self, level=None): if level is not None: @@ -552,11 +514,6 @@ def get_value(self, series: AnyArrayLike, key: Any): # we might be a positional inexer return super().get_value(series, key) - @Substitution(klass="CategoricalIndex") - @Appender(_shared_docs["searchsorted"]) - def searchsorted(self, value, side="left", sorter=None): - return self._data.searchsorted(value, side=side, sorter=sorter) - @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with @@ -596,6 +553,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): target = ibase.ensure_index(target) + missing: List[int] if self.equals(target): indexer = None missing = [] @@ -708,9 +666,11 @@ def get_indexer_non_unique(self, target): @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - if self.categories._defer_to_indexing: - return self.categories._convert_scalar_indexer(key, kind=kind) - + if kind == "loc": + try: + return self.categories._convert_scalar_indexer(key, kind=kind) + except TypeError: + self._invalid_indexer("label", key) return super()._convert_scalar_indexer(key, kind=kind) @Appender(_index_shared_docs["_convert_list_indexer"]) @@ -725,9 +685,7 @@ def _convert_list_indexer(self, keyarr, kind=None): indexer = self.categories.get_indexer(np.asarray(keyarr)) if (indexer == -1).any(): raise KeyError( - "a list-indexer must only " - "include values 
that are " - "in the categories" + "a list-indexer must only include values that are in the categories" ) return self.get_indexer(keyarr) @@ -745,23 +703,21 @@ def _convert_arr_indexer(self, keyarr): def _convert_index_indexer(self, keyarr): return self._shallow_copy(keyarr) - @Appender(_index_shared_docs["take"] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take(tuple(), kwargs) - indices = ensure_platform_int(indices) - taken = self._assert_take_fillable( - self.codes, - indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=-1, + def take_nd(self, *args, **kwargs): + """Alias for `take`""" + warnings.warn( + "CategoricalIndex.take_nd is deprecated, use CategoricalIndex.take instead", + FutureWarning, + stacklevel=2, ) - return self._create_from_codes(taken) + return self.take(*args, **kwargs) - def is_dtype_equal(self, other): - return self._data.is_dtype_equal(other) + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) + def _maybe_cast_slice_bound(self, label, side, kind): + if kind == "loc": + return label - take_nd = take + return super()._maybe_cast_slice_bound(label, side, kind) def map(self, mapper): """ @@ -887,34 +843,10 @@ def _concat_same_dtype(self, to_concat, name): result.name = name return result - def _codes_for_groupby(self, sort, observed): - """ Return a Categorical adjusted for groupby """ - return self.values._codes_for_groupby(sort, observed) - - @classmethod - def _add_comparison_methods(cls): - """ add in comparison methods """ - - def _make_compare(op): - opname = "__{op}__".format(op=op.__name__) - - def _evaluate_compare(self, other): - with np.errstate(all="ignore"): - result = op(self.array, other) - if isinstance(result, ABCSeries): - # Dispatch to pd.Categorical returned NotImplemented - # and we got a Series back; down-cast to ndarray - result = result._values - return result - - return compat.set_function_name(_evaluate_compare, opname, cls) - - cls.__eq__ = _make_compare(operator.eq) - cls.__ne__ = _make_compare(operator.ne) - cls.__lt__ = _make_compare(operator.lt) - cls.__gt__ = _make_compare(operator.gt) - cls.__le__ = _make_compare(operator.le) - cls.__ge__ = _make_compare(operator.ge) + def _delegate_property_get(self, name, *args, **kwargs): + """ method delegation to the ._values """ + prop = getattr(self._values, name) + return prop # no wrapping for now def _delegate_method(self, name, *args, **kwargs): """ method delegation to the ._values """ @@ -922,7 +854,7 @@ def _delegate_method(self, name, *args, **kwargs): if "inplace" in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) - if is_scalar(res): + if is_scalar(res) or name in self._raw_inherit: return res return CategoricalIndex(res, name=self.name) @@ -930,4 +862,3 @@ def _delegate_method(self, name, *args, **kwargs): CategoricalIndex._add_numeric_methods_add_sub_disabled() CategoricalIndex._add_numeric_methods_disabled() CategoricalIndex._add_logical_methods_disabled() -CategoricalIndex._add_comparison_methods() diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index bf89bbbdf2b79..c4dac9d1c4a11 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,154 +2,106 @@ Base and utility classes for tseries type pandas objects. 
""" import operator -from typing import Set -import warnings +from typing import List, Optional, Set import numpy as np -from pandas._libs import NaT, iNaT, lib +from pandas._libs import NaT, iNaT, join as libjoin, lib from pandas._libs.algos import unique_deltas +from pandas._libs.tslibs import timezones from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_int64, is_bool_dtype, + is_categorical_dtype, is_dtype_equal, is_float, is_integer, is_list_like, is_period_dtype, is_scalar, + needs_i8_conversion, ) +from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna -from pandas.core import algorithms, ops +from pandas.core import algorithms from pandas.core.accessor import PandasDelegate -from pandas.core.arrays import ExtensionOpsMixin -from pandas.core.arrays.datetimelike import ( - DatetimeLikeArrayMixin, - _ensure_datetimelike_to_i8, -) +from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.extension import ( + ExtensionIndex, + inherit_names, + make_wrapped_arith_op, +) +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name from pandas.core.tools.timedeltas import to_timedelta -import pandas.io.formats.printing as printing -from pandas.tseries.frequencies import to_offset +from pandas.tseries.frequencies import DateOffset, to_offset _index_doc_kwargs = dict(ibase._index_doc_kwargs) -def ea_passthrough(array_method): +def _join_i8_wrapper(joinf, with_indexers: bool = True): """ - Make an alias for a method of the underlying ExtensionArray. - - Parameters - ---------- - array_method : method on an Array class - - Returns - ------- - method + Create the join wrapper methods. 
""" - def method(self, *args, **kwargs): - return array_method(self._data, *args, **kwargs) + @staticmethod # type: ignore + def wrapper(left, right): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + left = left.view("i8") + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + right = right.view("i8") - method.__name__ = array_method.__name__ - method.__doc__ = array_method.__doc__ - return method + results = joinf(left, right) + if with_indexers: + # dtype should be timedelta64[ns] for TimedeltaIndex + # and datetime64[ns] for DatetimeIndex + dtype = left.dtype.base + join_index, left_indexer, right_indexer = results + join_index = join_index.view(dtype) + return join_index, left_indexer, right_indexer + return results -def _make_wrapped_arith_op(opname): - def method(self, other): - meth = getattr(self._data, opname) - result = meth(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) + return wrapper - method.__name__ = opname - return method - -class DatetimeIndexOpsMixin(ExtensionOpsMixin): +@inherit_names( + ["inferred_freq", "_isnan", "_resolution", "resolution"], + DatetimeLikeArrayMixin, + cache=True, +) +@inherit_names( + ["__iter__", "mean", "freq", "freqstr", "_ndarray_values", "asi8", "_box_values"], + DatetimeLikeArrayMixin, +) +class DatetimeIndexOpsMixin(ExtensionIndex): """ - common ops mixin to support a unified interface datetimelike Index + Common ops mixin to support a unified interface datetimelike Index. """ - _data = None + _data: ExtensionArray + freq: Optional[DateOffset] + freqstr: Optional[str] + _resolution: int + _bool_ops: List[str] = [] + _field_ops: List[str] = [] - # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are - # properties there. They can be made into cache_readonly for Index - # subclasses bc they are immutable - inferred_freq = cache_readonly( - DatetimeLikeArrayMixin.inferred_freq.fget # type: ignore - ) - _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) # type: ignore hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore _hasnans = hasnans # for index / array -agnostic code - _resolution = cache_readonly( - DatetimeLikeArrayMixin._resolution.fget # type: ignore - ) - resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) # type: ignore - - _maybe_mask_results = ea_passthrough(DatetimeLikeArrayMixin._maybe_mask_results) - __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__) - mean = ea_passthrough(DatetimeLikeArrayMixin.mean) - - @property - def freq(self): - """ - Return the frequency object if it is set, otherwise None. - """ - return self._data.freq - - @freq.setter - def freq(self, value): - # validation is handled by _data setter - self._data.freq = value - - @property - def freqstr(self): - """ - Return the frequency object as a string if it is set, otherwise None. - """ - return self._data.freqstr - - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) - - result = self._data.unique() - - # Note: if `self` is already unique, then self.unique() should share - # a `freq` with self. If not already unique, then self.freq must be - # None, so again sharing freq is correct. - return self._shallow_copy(result._data) - - @classmethod - def _create_comparison_method(cls, op): - """ - Create a comparison method that dispatches to ``cls.values``. 
- """ - - def wrapper(self, other): - if isinstance(other, ABCSeries): - # the arrays defer to Series for comparison ops but the indexes - # don't, so we have to unwrap here. - other = other._values - - result = op(self._data, maybe_unwrap_index(other)) - return result - - wrapper.__doc__ = op.__doc__ - wrapper.__name__ = "__{}__".format(op.__name__) - return wrapper @property - def _ndarray_values(self): - return self._data._ndarray_values + def is_all_dates(self) -> bool: + return True # ------------------------------------------------------------------------ # Abstract data attributes @@ -159,11 +111,6 @@ def values(self): # Note: PeriodArray overrides this to return an ndarray of objects. return self._data._data - @property # type: ignore # https://github.com/python/mypy/issues/1362 - @Appender(DatetimeLikeArrayMixin.asi8.__doc__) - def asi8(self): - return self._data.asi8 - def __array_wrap__(self, result, context=None): """ Gets called after a ufunc. @@ -180,7 +127,7 @@ def __array_wrap__(self, result, context=None): # ------------------------------------------------------------------------ - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. """ @@ -203,55 +150,8 @@ def equals(self, other): # have different timezone return False - elif is_period_dtype(self): - if not is_period_dtype(other): - return False - if self.freq != other.freq: - return False - return np.array_equal(self.asi8, other.asi8) - @staticmethod - def _join_i8_wrapper(joinf, dtype, with_indexers=True): - """ - Create the join wrapper methods. - """ - from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin - - @staticmethod - def wrapper(left, right): - if isinstance( - left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) - ): - left = left.view("i8") - if isinstance( - right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) - ): - right = right.view("i8") - results = joinf(left, right) - if with_indexers: - join_index, left_indexer, right_indexer = results - join_index = join_index.view(dtype) - return join_index, left_indexer, right_indexer - return results - - return wrapper - - def _ensure_localized( - self, arg, ambiguous="raise", nonexistent="raise", from_utc=False - ): - # See DatetimeLikeArrayMixin._ensure_localized.__doc__ - if getattr(self, "tz", None): - # ensure_localized is only relevant for tz-aware DTI - result = self._data._ensure_localized( - arg, ambiguous=ambiguous, nonexistent=nonexistent, from_utc=from_utc - ) - return type(self)._simple_new(result, name=self.name) - return arg - - def _box_values(self, values): - return self._data._box_values(values) - @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): try: @@ -291,7 +191,10 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - sorted_values = np.sort(self._ndarray_values) + # NB: using asi8 instead of _ndarray_values matters in numpy 1.18 + # because the treatment of NaT has been changed to put NaT last + # instead of first. 
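# --- Editorial aside (illustrative sketch, not part of this changeset) -------
# Why the comment above matters: on numpy >= 1.18, sorting datetime64 data
# places NaT last, whereas sorting the int64 view keeps iNaT (the most
# negative int64 value) first, preserving the index's historical ordering:
import numpy as np

values = np.array(["2020-01-02", "NaT", "2020-01-01"], dtype="datetime64[ns]")
print(np.sort(values))             # NaT sorts last on numpy >= 1.18
print(np.sort(values.view("i8")))  # iNaT sorts first in the int64 view
# ------------------------------------------------------------------------------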
+ sorted_values = np.sort(self.asi8) attribs = self._get_attributes_dict() freq = attribs["freq"] @@ -316,40 +219,15 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): if isinstance(maybe_slice, slice): return self[maybe_slice] - taken = self._assert_take_fillable( - self.asi8, - indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=iNaT, + return ExtensionIndex.take( + self, indices, axis, allow_fill, fill_value, **kwargs ) - # keep freq in PeriodArray/Index, reset otherwise - freq = self.freq if is_period_dtype(self) else None - return self._shallow_copy(taken, freq=freq) - _can_hold_na = True _na_value = NaT """The expected NA value to use with this index.""" - @property - def asobject(self): - """ - Return object Index which contains boxed values. - - .. deprecated:: 0.23.0 - Use ``astype(object)`` instead. - - *this is an internal non-public method* - """ - warnings.warn( - "'asobject' is deprecated. Use 'astype(object)' instead", - FutureWarning, - stacklevel=2, - ) - return self.astype(object) - def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) @@ -357,7 +235,7 @@ def _convert_tolerance(self, tolerance, target): raise ValueError("list-like tolerance size must match target index size") return tolerance - def tolist(self): + def tolist(self) -> List: """ Return a list of the underlying data. """ @@ -496,7 +374,7 @@ def _format_attrs(self): if attrib == "freq": freq = self.freqstr if freq is not None: - freq = "'%s'" % freq + freq = repr(freq) attrs.append(("freq", freq)) return attrs @@ -527,51 +405,22 @@ def _convert_scalar_indexer(self, key, kind=None): return super()._convert_scalar_indexer(key, kind=kind) - @classmethod - def _add_datetimelike_methods(cls): - """ - Add in the datetimelike methods (as we may have to override the - superclass). 
- """ - - def __add__(self, other): - # dispatch to ExtensionArray implementation - result = self._data.__add__(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - cls.__add__ = __add__ - - def __radd__(self, other): - # alias for __add__ - return self.__add__(other) - - cls.__radd__ = __radd__ - - def __sub__(self, other): - # dispatch to ExtensionArray implementation - result = self._data.__sub__(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - cls.__sub__ = __sub__ - - def __rsub__(self, other): - result = self._data.__rsub__(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - cls.__rsub__ = __rsub__ - - __pow__ = _make_wrapped_arith_op("__pow__") - __rpow__ = _make_wrapped_arith_op("__rpow__") - __mul__ = _make_wrapped_arith_op("__mul__") - __rmul__ = _make_wrapped_arith_op("__rmul__") - __floordiv__ = _make_wrapped_arith_op("__floordiv__") - __rfloordiv__ = _make_wrapped_arith_op("__rfloordiv__") - __mod__ = _make_wrapped_arith_op("__mod__") - __rmod__ = _make_wrapped_arith_op("__rmod__") - __divmod__ = _make_wrapped_arith_op("__divmod__") - __rdivmod__ = _make_wrapped_arith_op("__rdivmod__") - __truediv__ = _make_wrapped_arith_op("__truediv__") - __rtruediv__ = _make_wrapped_arith_op("__rtruediv__") + __add__ = make_wrapped_arith_op("__add__") + __radd__ = make_wrapped_arith_op("__radd__") + __sub__ = make_wrapped_arith_op("__sub__") + __rsub__ = make_wrapped_arith_op("__rsub__") + __pow__ = make_wrapped_arith_op("__pow__") + __rpow__ = make_wrapped_arith_op("__rpow__") + __mul__ = make_wrapped_arith_op("__mul__") + __rmul__ = make_wrapped_arith_op("__rmul__") + __floordiv__ = make_wrapped_arith_op("__floordiv__") + __rfloordiv__ = make_wrapped_arith_op("__rfloordiv__") + __mod__ = make_wrapped_arith_op("__mod__") + __rmod__ = make_wrapped_arith_op("__rmod__") + __divmod__ = make_wrapped_arith_op("__divmod__") + __rdivmod__ = make_wrapped_arith_op("__rdivmod__") + __truediv__ = make_wrapped_arith_op("__truediv__") + __rtruediv__ = make_wrapped_arith_op("__rtruediv__") def isin(self, values, level=None): """ @@ -597,78 +446,29 @@ def isin(self, values, level=None): return algorithms.isin(self.asi8, values.asi8) - def intersection(self, other, sort=False): - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - - if self.equals(other): - return self._get_reconciled_name_object(other) - - if len(self) == 0: - return self.copy() - if len(other) == 0: - return other.copy() - - if not isinstance(other, type(self)): - result = Index.intersection(self, other, sort=sort) - if isinstance(result, type(self)): - if result.freq is None: - result.freq = to_offset(result.inferred_freq) - return result - - elif ( - other.freq is None - or self.freq is None - or other.freq != self.freq - or not other.freq.isAnchored() - or (not self.is_monotonic or not other.is_monotonic) - ): - result = Index.intersection(self, other, sort=sort) + @Appender(_index_shared_docs["where"] % _index_doc_kwargs) + def where(self, cond, other=None): + values = self.view("i8") - # Invalidate the freq of `result`, which may not be correct at - # this point, depending on the values. 
- result.freq = None - if hasattr(self, "tz"): - result = self._shallow_copy( - result._values, name=result.name, tz=result.tz, freq=None - ) - else: - result = self._shallow_copy(result._values, name=result.name, freq=None) - if result.freq is None: - result.freq = to_offset(result.inferred_freq) - return result + if is_scalar(other) and isna(other): + other = NaT.value - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other else: - left, right = other, self + # Do type inference if necessary up front + # e.g. we passed PeriodIndex.values and got an ndarray of Periods + other = Index(other) - # after sorting, the intersection always starts with the right index - # and ends with the index of which the last elements is smallest - end = min(left[-1], right[-1]) - start = right[0] + if is_categorical_dtype(other): + # e.g. we have a Categorical holding self.dtype + if needs_i8_conversion(other.categories): + other = other._internal_get_values() - if end < start: - return type(self)(data=[]) - else: - lslice = slice(*left.slice_locs(start, end)) - left_chunk = left.values[lslice] - return self._shallow_copy(left_chunk) + if not is_dtype_equal(self.dtype, other.dtype): + raise TypeError(f"Where requires matching dtype, not {other.dtype}") - @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) - def repeat(self, repeats, axis=None): - nv.validate_repeat(tuple(), dict(axis=axis)) - freq = self.freq if is_period_dtype(self) else None - return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) + other = other.view("i8") - @Appender(_index_shared_docs["where"] % _index_doc_kwargs) - def where(self, cond, other=None): - other = _ensure_datetimelike_to_i8(other, to_utc=True) - values = _ensure_datetimelike_to_i8(self, to_utc=True) result = np.where(cond, values, other).astype("i8") - - result = self._ensure_localized(result, from_utc=True) return self._shallow_copy(result) def _summary(self, name=None): @@ -678,27 +478,24 @@ def _summary(self, name=None): Parameters ---------- name : str - name to use in the summary representation + Name to use in the summary representation. Returns ------- - String with a summarized representation of the index + str + Summarized representation of the index. 
""" formatter = self._formatter_func if len(self) > 0: - index_summary = ", %s to %s" % (formatter(self[0]), formatter(self[-1])) + index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" else: index_summary = "" if name is None: name = type(self).__name__ - result = "%s: %s entries%s" % ( - printing.pprint_thing(name), - len(self), - index_summary, - ) + result = f"{name}: {len(self)} entries{index_summary}" if self.freq: - result += "\nFreq: %s" % self.freqstr + result += f"\nFreq: {self.freqstr}" # display as values, not quoted result = result.replace("'", "") @@ -725,20 +522,7 @@ def _concat_same_dtype(self, to_concat, name): return self._simple_new(new_data, **attribs) - @Appender(_index_shared_docs["astype"]) - def astype(self, dtype, copy=True): - if is_dtype_equal(self.dtype, dtype) and copy is False: - # Ensure that self.astype(self.dtype) is self - return self - - new_values = self._data.astype(dtype, copy=copy) - - # pass copy=False because any copying will be done in the - # _data.astype call above - return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) - - @deprecate_kwarg(old_arg_name="n", new_arg_name="periods") - def shift(self, periods, freq=None): + def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. @@ -747,7 +531,7 @@ def shift(self, periods, freq=None): Parameters ---------- - periods : int + periods : int, default 1 Number of periods (or increments) to shift by, can be positive or negative. @@ -771,45 +555,339 @@ def shift(self, periods, freq=None): result = self._data._time_shift(periods, freq=freq) return type(self)(result, name=self.name) + # -------------------------------------------------------------------- + # List-like Methods -def wrap_arithmetic_op(self, other, result): - if result is NotImplemented: - return NotImplemented + def delete(self, loc): + new_i8s = np.delete(self.asi8, loc) - if isinstance(result, tuple): - # divmod, rdivmod - assert len(result) == 2 - return ( - wrap_arithmetic_op(self, other, result[0]), - wrap_arithmetic_op(self, other, result[1]), - ) - - if not isinstance(result, Index): - # Index.__new__ will choose appropriate subclass for dtype - result = Index(result) + freq = None + if is_period_dtype(self): + freq = self.freq + elif is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if is_list_like(loc): + loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) + if isinstance(loc, slice) and loc.step in (1, None): + if loc.start in (0, None) or loc.stop in (len(self), None): + freq = self.freq - res_name = ops.get_op_result_name(self, other) - result.name = res_name - return result + return self._shallow_copy(new_i8s, freq=freq) -def maybe_unwrap_index(obj): +class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): + """ + Mixin class for methods shared by DatetimeIndex and TimedeltaIndex, + but not PeriodIndex """ - If operating against another Index object, we need to unwrap the underlying - data before deferring to the DatetimeArray/TimedeltaArray/PeriodArray - implementation, otherwise we will incorrectly return NotImplemented. 
- Parameters - ---------- - obj : object + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique - Returns - ------- - unwrapped object - """ - if isinstance(obj, ABCIndexClass): - return obj._data - return obj + def _set_freq(self, freq): + """ + Set the _freq attribute on our underlying DatetimeArray. + + Parameters + ---------- + freq : DateOffset, None, or "infer" + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, DateOffset): + # Always valid. In the TimedeltaIndex case, we assume this + # is a Tick offset. + pass + else: + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = to_offset(self.inferred_freq) + + self._data._freq = freq + + def _shallow_copy(self, values=None, **kwargs): + if values is None: + values = self._data + if isinstance(values, type(self)): + values = values._data + + attributes = self._get_attributes_dict() + + if "freq" not in kwargs and self.freq is not None: + if isinstance(values, (DatetimeArray, TimedeltaArray)): + if values.freq is None: + del attributes["freq"] + + attributes.update(kwargs) + return self._simple_new(values, **attributes) + + # -------------------------------------------------------------------- + # Set Operation Methods + + @Appender(Index.difference.__doc__) + def difference(self, other, sort=None): + new_idx = super().difference(other, sort=sort) + new_idx._set_freq(None) + return new_idx + + def intersection(self, other, sort=False): + """ + Specialized intersection for DatetimeIndex/TimedeltaIndex. + + May be much faster than Index.intersection + + Parameters + ---------- + other : Same type as self or array-like + sort : False or None, default False + Sort the resulting index if possible. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default to ``False`` to match the behaviour + from before 0.24.0. + + .. versionchanged:: 0.25.0 + + The `sort` keyword is added + + Returns + ------- + y : Index or same type as self + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if len(self) == 0: + return self.copy() + if len(other) == 0: + return other.copy() + + if not isinstance(other, type(self)): + result = Index.intersection(self, other, sort=sort) + if isinstance(result, type(self)): + if result.freq is None: + result._set_freq("infer") + return result + + elif ( + other.freq is None + or self.freq is None + or other.freq != self.freq + or not other.freq.is_anchored() + or (not self.is_monotonic or not other.is_monotonic) + ): + result = Index.intersection(self, other, sort=sort) + + # Invalidate the freq of `result`, which may not be correct at + # this point, depending on the values. 
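# --- Editorial aside (illustrative sketch, not part of this changeset) -------
# Expected effect of the reset-then-reinfer dance around this point: after
# falling back to Index.intersection, the possibly-stale freq is cleared and
# then re-inferred from the resulting values.
import pandas as pd

a = pd.date_range("2020-01-01", periods=6, freq="D")
b = pd.date_range("2020-01-03", periods=6, freq="D")
print(a.intersection(b).freq)  # expected: the daily freq, re-inferred
# ------------------------------------------------------------------------------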
+ + result._set_freq(None) + result = self._shallow_copy( + result._data, name=result.name, dtype=result.dtype, freq=None + ) + if result.freq is None: + result._set_freq("infer") + return result + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + # after sorting, the intersection always starts with the right index + # and ends with the index of which the last elements is smallest + end = min(left[-1], right[-1]) + start = right[0] + + if end < start: + return type(self)(data=[]) + else: + lslice = slice(*left.slice_locs(start, end)) + left_chunk = left.values[lslice] + return self._shallow_copy(left_chunk) + + def _can_fast_union(self, other) -> bool: + if not isinstance(other, type(self)): + return False + + freq = self.freq + + if freq is None or freq != other.freq: + return False + + if not self.is_monotonic or not other.is_monotonic: + return False + + if len(self) == 0 or len(other) == 0: + return True + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + right_start = right[0] + left_end = left[-1] + + # Only need to "adjoin", not overlap + try: + return (right_start == left_end + freq) or right_start in left + except ValueError: + # if we are comparing a freq that does not propagate timezones + # this will raise + return False + + def _fast_union(self, other, sort=None): + if len(other) == 0: + return self.view(type(self)) + + if len(self) == 0: + return other.view(type(self)) + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + elif sort is False: + # TDIs are not in the "correct" order and we don't want + # to sort but want to remove overlaps + left, right = self, other + left_start = left[0] + loc = right.searchsorted(left_start, side="left") + right_chunk = right.values[:loc] + dates = concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + else: + left, right = other, self + + left_end = left[-1] + right_end = right[-1] + + # concatenate + if left_end < right_end: + loc = right.searchsorted(left_end, side="right") + right_chunk = right.values[loc:] + dates = concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + else: + return left + + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) + + # We are called by `union`, which is responsible for this validation + assert isinstance(other, type(self)) + + this, other = self._maybe_utc_convert(other) + + if this._can_fast_union(other): + return this._fast_union(other, sort=sort) + else: + result = Index._union(this, other, sort=sort) + if isinstance(result, type(self)): + assert result._data.dtype == this.dtype + if result.freq is None: + result._set_freq("infer") + return result + + # -------------------------------------------------------------------- + # Join Methods + _join_precedence = 10 + + _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) + _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) + _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) + _left_indexer_unique = _join_i8_wrapper( + libjoin.left_join_indexer_unique, with_indexers=False + ) + + def join( + self, other, how: str = "left", level=None, return_indexers=False, sort=False + ): + """ + See Index.join + """ + if self._is_convertible_to_index_for_join(other): + try: + 
other = type(self)(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join( + this, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + + def _maybe_utc_convert(self, other): + this = self + if not hasattr(self, "tz"): + return this, other + + if isinstance(other, type(self)): + if self.tz is not None: + if other.tz is None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + elif other.tz is not None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") + return this, other + + @classmethod + def _is_convertible_to_index_for_join(cls, other: Index) -> bool: + """ + return a boolean whether I can attempt conversion to a + DatetimeIndex/TimedeltaIndex + """ + if isinstance(other, cls): + return False + elif len(other) > 0 and other.inferred_type not in ( + "floating", + "mixed-integer", + "integer", + "integer-na", + "mixed-integer-float", + "mixed", + ): + return True + return False + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + if ( + isinstance(other, type(self)) + and self.freq == other.freq + and self._can_fast_union(other) + ): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + kwargs = {} + if hasattr(self, "tz"): + kwargs["tz"] = getattr(other, "tz", None) + return self._simple_new(joined, name, **kwargs) class DatetimelikeDelegateMixin(PandasDelegate): @@ -819,8 +897,6 @@ class DatetimelikeDelegateMixin(PandasDelegate): Functionality is delegated from the Index class to an Array class. A few things can be customized - * _delegate_class : type - The class being delegated to. * _delegated_methods, delegated_properties : List The list of property / method names being delagated. 
* raw_methods : Set @@ -832,15 +908,10 @@ class DatetimelikeDelegateMixin(PandasDelegate): """ # raw_methods : dispatch methods that shouldn't be boxed in an Index - _raw_methods = set() # type: Set[str] + _raw_methods: Set[str] = set() # raw_properties : dispatch properties that shouldn't be boxed in an Index - _raw_properties = set() # type: Set[str] - name = None - _data = None - - @property - def _delegate_class(self): - raise AbstractMethodError + _raw_properties: Set[str] = set() + _data: ExtensionArray def _delegate_property_get(self, name, *args, **kwargs): result = getattr(self._data, name) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ee2f4e0f1e85d..2241921e94694 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1,43 +1,32 @@ -from datetime import datetime, time, timedelta +from datetime import datetime, time, timedelta, tzinfo import operator +from typing import Optional import warnings import numpy as np from pandas._libs import NaT, Timestamp, index as libindex, lib, tslib as libts -import pandas._libs.join as libjoin from pandas._libs.tslibs import ccalendar, fields, parsing, timezones from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.core.dtypes.common import ( - _NS_DTYPE, - ensure_int64, - is_float, - is_integer, - is_list_like, - is_scalar, - is_string_like, -) -from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core.accessor import delegate_names from pandas.core.arrays.datetimes import ( DatetimeArray, - _to_M8, tz_to_dtype, validate_tz_from_dtype, ) from pandas.core.base import _shared_docs import pandas.core.common as com -from pandas.core.indexes.base import Index +from pandas.core.indexes.base import Index, maybe_extract_name from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, - ea_passthrough, + DatetimeTimedeltaMixin, ) -from pandas.core.indexes.numeric import Int64Index +from pandas.core.indexes.extension import inherit_names from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools @@ -46,20 +35,20 @@ def _new_DatetimeIndex(cls, d): - """ This is called upon unpickling, rather than the default which doesn't - have arguments and breaks __new__ """ - + """ + This is called upon unpickling, rather than the default which doesn't + have arguments and breaks __new__ + """ if "data" in d and not isinstance(d["data"], DatetimeIndex): # Avoid need to verify integrity by calling simple_new directly data = d.pop("data") result = cls._simple_new(data, **d) else: with warnings.catch_warnings(): - # we ignore warnings from passing verify_integrity=False # TODO: If we knew what was going in to **d, we might be able to # go through _simple_new instead warnings.simplefilter("ignore") - result = cls.__new__(cls, verify_integrity=False, **d) + result = cls.__new__(cls, **d) return result @@ -70,8 +59,14 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): # We also have a few "extra" attrs, which may or may not be raw, # which we we dont' want to expose in the .dt accessor. 
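# --- Editorial aside (illustrative sketch, not part of this changeset) -------
# The "_extra_raw_*" lists below mark attributes that are handed back exactly
# as the underlying DatetimeArray returns them, while non-raw results are
# re-boxed into an Index.  Expected user-visible behaviour:
import pandas as pd

dti = pd.date_range("2020-01-01", periods=3, tz="UTC")
print(dti.tz)               # raw property: the UTC tzinfo object
print(dti.to_pydatetime())  # raw method: ndarray of stdlib datetime objects
print(dti.to_period("D"))   # non-raw: boxed back into a PeriodIndex
# ------------------------------------------------------------------------------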
_extra_methods = ["to_period", "to_perioddelta", "to_julian_date", "strftime"] - _extra_raw_methods = ["to_pydatetime", "_local_timestamps", "_has_same_tz"] - _extra_raw_properties = ["_box_func", "tz", "tzinfo"] + _extra_raw_methods = [ + "to_pydatetime", + "_local_timestamps", + "_has_same_tz", + "_format_native_types", + "__iter__", + ] + _extra_raw_properties = ["_box_func", "tz", "tzinfo", "dtype"] _delegated_properties = DatetimeArray._datetimelike_ops + _extra_raw_properties _delegated_methods = ( DatetimeArray._datetimelike_methods + _extra_methods + _extra_raw_methods @@ -82,9 +77,19 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): | set(_extra_raw_properties) ) _raw_methods = set(_extra_raw_methods) - _delegate_class = DatetimeArray +@inherit_names(["_timezone", "is_normalized", "_resolution"], DatetimeArray, cache=True) +@inherit_names( + [ + "_bool_ops", + "_object_ops", + "_field_ops", + "_datetimelike_ops", + "_datetimelike_methods", + ], + DatetimeArray, +) @delegate_names( DatetimeArray, DatetimeDelegateMixin._delegated_properties, typ="property" ) @@ -92,9 +97,9 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): DatetimeArray, DatetimeDelegateMixin._delegated_methods, typ="method", - overwrite=False, + overwrite=True, ) -class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): +class DatetimeIndex(DatetimeTimedeltaMixin, DatetimeDelegateMixin): """ Immutable ndarray of datetime64 data, represented internally as int64, and which can be boxed to Timestamp objects that are subclasses of datetime and @@ -102,39 +107,14 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): Parameters ---------- - data : array-like (1-dimensional), optional - Optional datetime-like data to construct index with - copy : bool - Make a copy of input ndarray + data : array-like (1-dimensional), optional + Optional datetime-like data to construct index with. + copy : bool + Make a copy of input ndarray. freq : str or pandas offset object, optional One of pandas date offset strings or corresponding objects. The string 'infer' can be passed in order to set the frequency of the index as the - inferred frequency upon creation - - start : starting value, datetime-like, optional - If data is None, start is used as the start point in generating regular - timestamp data. - - .. deprecated:: 0.24.0 - - periods : int, optional, > 0 - Number of periods to generate, if generating index. Takes precedence - over end argument - - .. deprecated:: 0.24.0 - - end : end time, datetime-like, optional - If periods is none, generated index will extend to first conforming - time on or just past end argument - - .. deprecated:: 0.24.0 - - closed : str or None, default None - Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None) - - .. deprecated:: 0.24. 0 - + inferred frequency upon creation. tz : pytz.timezone or dateutil.tz.tzfile ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. @@ -218,44 +198,21 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): Notes ----- To learn more about the frequency strings, please see `this link - `__. - - Creating a DatetimeIndex based on `start`, `periods`, and `end` has - been deprecated in favor of :func:`date_range`. + `__. 
""" _typ = "datetimeindex" - _join_precedence = 10 - - def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="M8[ns]", **kwargs) - - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) - _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique, with_indexers=False - ) _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True - _tz = None - _freq = None _comparables = ["name", "freqstr", "tz"] _attributes = ["name", "tz", "freq"] _is_numeric_dtype = False _infer_as_myclass = True - # Use faster implementation given we know we have DatetimeArrays - __iter__ = DatetimeArray.__iter__ - # some things like freq inference make use of these attributes. - _bool_ops = DatetimeArray._bool_ops - _object_ops = DatetimeArray._object_ops - _field_ops = DatetimeArray._field_ops - _datetimelike_ops = DatetimeArray._datetimelike_ops - _datetimelike_methods = DatetimeArray._datetimelike_methods + tz: Optional[tzinfo] # -------------------------------------------------------------------- # Constructors @@ -264,9 +221,6 @@ def __new__( cls, data=None, freq=None, - start=None, - end=None, - periods=None, tz=None, normalize=False, closed=None, @@ -276,51 +230,17 @@ def __new__( dtype=None, copy=False, name=None, - verify_integrity=None, ): - if verify_integrity is not None: - warnings.warn( - "The 'verify_integrity' argument is deprecated, " - "will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - else: - verify_integrity = True - - if data is None: - dtarr = DatetimeArray._generate_range( - start, - end, - periods, - freq=freq, - tz=tz, - normalize=normalize, - closed=closed, - ambiguous=ambiguous, - ) - warnings.warn( - "Creating a DatetimeIndex by passing range " - "endpoints is deprecated. Use " - "`pandas.date_range` instead.", - FutureWarning, - stacklevel=2, - ) - return cls._simple_new(dtarr._data, freq=dtarr.freq, tz=dtarr.tz, name=name) - if is_scalar(data): raise TypeError( - "{cls}() must be called with a " - "collection of some kind, {data} was passed".format( - cls=cls.__name__, data=repr(data) - ) + f"{cls.__name__}() must be called with a " + f"collection of some kind, {repr(data)} was passed" ) # - Cases checked above all return/raise before reaching here - # - if name is None and hasattr(data, "name"): - name = data.name + name = maybe_extract_name(name, data, cls) dtarr = DatetimeArray._from_sequence( data, @@ -331,7 +251,6 @@ def __new__( dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous, - int_as_wall_time=True, ) subarr = cls._simple_new(dtarr, name=name, freq=dtarr.freq, tz=dtarr.tz) @@ -340,7 +259,7 @@ def __new__( @classmethod def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): """ - we require the we have a dtype compat for the values + We require the we have a dtype compat for the values if we are passed a non-dtype compat, then coerce using the constructor """ if isinstance(values, DatetimeArray): @@ -366,6 +285,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): result = object.__new__(cls) result._data = dtarr result.name = name + result._no_setting_name = False # For groupby perf. 
See note in indexes/base about _index_data result._index_data = dtarr._data result._reset_identity() @@ -373,46 +293,18 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): # -------------------------------------------------------------------- - def __array__(self, dtype=None): - if ( - dtype is None - and isinstance(self._data, DatetimeArray) - and getattr(self.dtype, "tz", None) - ): - msg = ( - "Converting timezone-aware DatetimeArray to timezone-naive " - "ndarray with 'datetime64[ns]' dtype. In the future, this " - "will return an ndarray with 'object' dtype where each " - "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t" - "To accept the future behavior, pass 'dtype=object'.\n\t" - "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = "M8[ns]" + def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._data, dtype=dtype) - @property - def dtype(self): - return self._data.dtype - - @property - def tz(self): - # GH 18595 - return self._data.tz - - @tz.setter - def tz(self, value): - # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError( - "Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate" - ) - - tzinfo = tz - @cache_readonly - def _is_dates_only(self): - """Return a boolean if we are only dates (and don't have a timezone)""" + def _is_dates_only(self) -> bool: + """ + Return a boolean if we are only dates (and don't have a timezone) + + Returns + ------- + bool + """ from pandas.io.formats.format import _is_dates_only return _is_dates_only(self.values) and self.tz is None @@ -424,45 +316,14 @@ def __reduce__(self): d = dict(data=self._data) d.update(self._get_attributes_dict()) - return _new_DatetimeIndex, (self.__class__, d), None - - def __setstate__(self, state): - """Necessary for making this object picklable""" - if isinstance(state, dict): - super().__setstate__(state) - - elif isinstance(state, tuple): - - # < 0.15 compat - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - - freq = own_state[1] - tz = timezones.tz_standardize(own_state[2]) - dtype = tz_to_dtype(tz) - dtarr = DatetimeArray._simple_new(data, freq=freq, dtype=dtype) - - self.name = own_state[0] - - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - dtarr = DatetimeArray(data) - - self._data = dtarr - self._reset_identity() - - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ + return _new_DatetimeIndex, (type(self), d), None def _convert_for_op(self, value): - """ Convert value to be insertable to ndarray """ + """ + Convert value to be insertable to ndarray. 
+ """ if self._has_same_tz(value): - return _to_M8(value) + return Timestamp(value).asm8 raise ValueError("Passed item and index have different timezone") # -------------------------------------------------------------------- @@ -472,57 +333,19 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return libts.ints_to_pydatetime(self.asi8, self.tz) - def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import _get_format_datetime64_from_values - - fmt = _get_format_datetime64_from_values(self, date_format) - - return libts.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep - ) - @property def _formatter_func(self): from pandas.io.formats.format import _get_format_datetime64 formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: "'%s'" % formatter(x, tz=self.tz) + return lambda x: f"'{formatter(x, tz=self.tz)}'" # -------------------------------------------------------------------- # Set Operation Methods - def _union(self, other, sort): - if not len(other) or self.equals(other) or not len(self): - return super()._union(other, sort=sort) - - if len(other) == 0 or self.equals(other) or len(self) == 0: - return super().union(other, sort=sort) - - if not isinstance(other, DatetimeIndex): - try: - other = DatetimeIndex(other) - except TypeError: - pass - - this, other = self._maybe_utc_convert(other) - - if this._can_fast_union(other): - return this._fast_union(other, sort=sort) - else: - result = Index._union(this, other, sort=sort) - if isinstance(result, DatetimeIndex): - # TODO: we shouldn't be setting attributes like this; - # in all the tests this equality already holds - result._data._dtype = this.dtype - if result.freq is None and ( - this.freq is not None or other.freq is not None - ): - result.freq = to_offset(result.inferred_freq) - return result - def union_many(self, others): """ - A bit of a hack to accelerate unioning a collection of indexes + A bit of a hack to accelerate unioning a collection of indexes. """ this = self @@ -550,102 +373,6 @@ def union_many(self, others): this._data._dtype = dtype return this - def _can_fast_union(self, other): - if not isinstance(other, DatetimeIndex): - return False - - freq = self.freq - - if freq is None or freq != other.freq: - return False - - if not self.is_monotonic or not other.is_monotonic: - return False - - if len(self) == 0 or len(other) == 0: - return True - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - right_start = right[0] - left_end = left[-1] - - # Only need to "adjoin", not overlap - try: - return (right_start == left_end + freq) or right_start in left - except (ValueError): - - # if we are comparing a freq that does not propagate timezones - # this will raise - return False - - def _fast_union(self, other, sort=None): - if len(other) == 0: - return self.view(type(self)) - - if len(self) == 0: - return other.view(type(self)) - - # Both DTIs are monotonic. 
Check if they are already - # in the "correct" order - if self[0] <= other[0]: - left, right = self, other - # DTIs are not in the "correct" order and we don't want - # to sort but want to remove overlaps - elif sort is False: - left, right = self, other - left_start = left[0] - loc = right.searchsorted(left_start, side="left") - right_chunk = right.values[:loc] - dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - # DTIs are not in the "correct" order and we want - # to sort - else: - left, right = other, self - - left_end = left[-1] - right_end = right[-1] - - # TODO: consider re-implementing freq._should_cache for fastpath - - # concatenate dates - if left_end < right_end: - loc = right.searchsorted(left_end, side="right") - right_chunk = right.values[loc:] - dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - else: - return left - - def intersection(self, other, sort=False): - """ - Specialized intersection for DatetimeIndex objects. - May be much faster than Index.intersection - - Parameters - ---------- - other : DatetimeIndex or array-like - sort : False or None, default False - Sort the resulting index if possible. - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default to ``False`` to match the behaviour - from before 0.24.0. - - Returns - ------- - y : Index or DatetimeIndex or TimedeltaIndex - """ - return super().intersection(other, sort=sort) - def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) return self._shallow_copy(result, name=name, freq=None, tz=self.tz) @@ -658,14 +385,14 @@ def _get_time_micros(self): values = self._data._local_timestamps() return fields.get_time_micros(values) - def to_series(self, keep_tz=None, index=None, name=None): + def to_series(self, keep_tz=lib.no_default, index=None, name=None): """ Create a Series with both index and values equal to the index keys useful with map for returning an indexer based on an index. Parameters ---------- - keep_tz : optional, defaults False + keep_tz : optional, defaults True Return the data keeping the timezone. If keep_tz is True: @@ -681,10 +408,10 @@ def to_series(self, keep_tz=None, index=None, name=None): Series will have a datetime64[ns] dtype. TZ aware objects will have the tz removed. - .. versionchanged:: 0.24 - The default value will change to True in a future release. - You can set ``keep_tz=True`` to already obtain the future - behaviour and silence the warning. + .. versionchanged:: 1.0.0 + The default value is now True. In a future version, + this keyword will be removed entirely. Stop passing the + argument to obtain the future behavior and silence the warning. index : Index, optional Index of resulting Series. If None, defaults to original index. @@ -703,27 +430,27 @@ def to_series(self, keep_tz=None, index=None, name=None): if name is None: name = self.name - if keep_tz is None and self.tz is not None: - warnings.warn( - "The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change " - "to True in a future release. You can set " - "'keep_tz=True' to obtain the future behaviour and " - "silence this warning.", - FutureWarning, - stacklevel=2, - ) - keep_tz = False - elif keep_tz is False: - warnings.warn( - "Specifying 'keep_tz=False' is deprecated and this " - "option will be removed in a future release. 
If " - "you want to remove the timezone information, you " - "can do 'idx.tz_convert(None)' before calling " - "'to_series'.", - FutureWarning, - stacklevel=2, - ) + if keep_tz is not lib.no_default: + if keep_tz: + warnings.warn( + "The 'keep_tz' keyword in DatetimeIndex.to_series " + "is deprecated and will be removed in a future version. " + "You can stop passing 'keep_tz' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + else: + warnings.warn( + "Specifying 'keep_tz=False' is deprecated and this " + "option will be removed in a future release. If " + "you want to remove the timezone information, you " + "can do 'idx.tz_convert(None)' before calling " + "'to_series'.", + FutureWarning, + stacklevel=2, + ) + else: + keep_tz = True if keep_tz and self.tz is not None: # preserve the tz & copy @@ -748,7 +475,7 @@ def snap(self, freq="S"): for i, v in enumerate(self): s = v - if not freq.onOffset(s): + if not freq.is_on_offset(s): t0 = freq.rollback(s) t1 = freq.rollforward(s) if abs(s - t0) < abs(t1 - s): @@ -760,66 +487,6 @@ def snap(self, freq="S"): # we know it conforms; skip check return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, freq=freq) - def join(self, other, how="left", level=None, return_indexers=False, sort=False): - """ - See Index.join - """ - if ( - not isinstance(other, DatetimeIndex) - and len(other) > 0 - and other.inferred_type - not in ( - "floating", - "integer", - "integer-na", - "mixed-integer", - "mixed-integer-float", - "mixed", - ) - ): - try: - other = DatetimeIndex(other) - except (TypeError, ValueError): - pass - - this, other = self._maybe_utc_convert(other) - return Index.join( - this, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - - def _maybe_utc_convert(self, other): - this = self - if isinstance(other, DatetimeIndex): - if self.tz is not None: - if other.tz is None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - elif other.tz is not None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert("UTC") - other = other.tz_convert("UTC") - return this, other - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if ( - isinstance(other, DatetimeIndex) - and self.freq == other.freq - and self._can_fast_union(other) - ): - joined = self._shallow_copy(joined) - joined.name = name - return joined - else: - tz = getattr(other, "tz", None) - return self._simple_new(joined, name, tz=tz) - def _parsed_string_to_bounds(self, reso, parsed): """ Calculate datetime bounds for parsed time string and its resolution. 
@@ -901,9 +568,8 @@ def _parsed_string_to_bounds(self, reso, parsed): if parsed.tzinfo is not None: if self.tz is None: raise ValueError( - "The index must be timezone aware " - "when indexing with a date string with a " - "UTC offset" + "The index must be timezone aware when indexing " + "with a date string with a UTC offset" ) start = start.tz_localize(parsed.tzinfo).tz_convert(self.tz) end = end.tz_localize(parsed.tzinfo).tz_convert(self.tz) @@ -912,7 +578,16 @@ def _parsed_string_to_bounds(self, reso, parsed): end = end.tz_localize(self.tz) return start, end - def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): + def _partial_date_slice( + self, reso: str, parsed, use_lhs: bool = True, use_rhs: bool = True + ): + """ + Parameters + ---------- + reso : str + use_lhs : bool, default True + use_rhs : bool, default True + """ is_monotonic = self.is_monotonic if ( is_monotonic @@ -965,15 +640,7 @@ def get_value(self, series, key): know what you're doing """ - if isinstance(key, datetime): - - # needed to localize naive datetimes - if self.tz is not None: - if key.tzinfo is not None: - key = Timestamp(key).tz_convert(self.tz) - else: - key = Timestamp(key).tz_localize(self.tz) - + if isinstance(key, (datetime, np.datetime64)): return self.get_value_maybe_box(series, key) if isinstance(key, time): @@ -981,7 +648,7 @@ def get_value(self, series, key): return series.take(locs) try: - return com.maybe_box(self, Index.get_value(self, series, key), series, key) + value = Index.get_value(self, series, key) except KeyError: try: loc = self._get_string_slice(key) @@ -993,6 +660,8 @@ def get_value(self, series, key): return self.get_value_maybe_box(series, key) except (TypeError, ValueError, KeyError): raise KeyError(key) + else: + return com.maybe_box(self, value, series, key) def get_value_maybe_box(self, series, key): # needed to localize naive datetimes @@ -1032,9 +701,7 @@ def get_loc(self, key, method=None, tolerance=None): elif isinstance(key, timedelta): # GH#20464 raise TypeError( - "Cannot index {cls} with {other}".format( - cls=type(self).__name__, other=type(key).__name__ - ) + f"Cannot index {type(self).__name__} with {type(key).__name__}" ) if isinstance(key, time): @@ -1163,88 +830,41 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): else: raise - # -------------------------------------------------------------------- - # Wrapping DatetimeArray - - # Compat for frequency inference, see GH#23789 - _is_monotonic_increasing = Index.is_monotonic_increasing - _is_monotonic_decreasing = Index.is_monotonic_decreasing - _is_unique = Index.is_unique - - _timezone = cache_readonly(DatetimeArray._timezone.fget) # type: ignore - is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) # type: ignore - _resolution = cache_readonly(DatetimeArray._resolution.fget) # type: ignore - - _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) - - @property - def offset(self): - """ - get/set the frequency of the instance - """ - msg = ( - "{cls}.offset has been deprecated and will be removed " - "in a future version; use {cls}.freq instead.".format( - cls=type(self).__name__ - ) - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - return self.freq - - @offset.setter - def offset(self, value): - """ - get/set the frequency of the instance - """ - msg = ( - "{cls}.offset has been deprecated and will be removed " - "in a future version; use {cls}.freq instead.".format( - cls=type(self).__name__ - ) - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - 
self.freq = value - - def __getitem__(self, key): - result = self._data.__getitem__(key) - if is_scalar(result): - return result - elif result.ndim > 1: - # To support MPL which performs slicing with 2 dim - # even though it only has 1 dim by definition - assert isinstance(result, np.ndarray), result - return result - return type(self)(result, name=self.name) - - @property - def _box_func(self): - return lambda x: Timestamp(x, tz=self.tz) - # -------------------------------------------------------------------- @Substitution(klass="DatetimeIndex") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): if isinstance(value, (np.ndarray, Index)): - value = np.array(value, dtype=_NS_DTYPE, copy=False) - else: - value = _to_M8(value, tz=self.tz) + if not type(self._data)._is_recognized_dtype(value): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + value = type(self._data)(value) + self._data._check_compatible_with(value) - return self.values.searchsorted(value, side=side) + elif isinstance(value, self._data._recognized_scalars): + self._data._check_compatible_with(value) + value = self._data._scalar_type(value) - def is_type_compatible(self, typ): + elif not isinstance(value, DatetimeArray): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + + return self._data.searchsorted(value, side=side) + + def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "datetime" @property - def inferred_type(self): + def inferred_type(self) -> str: # b/c datetime is represented as microseconds since the epoch, make # sure we can't have ambiguous indexing return "datetime64" - @property - def is_all_dates(self): - return True - def insert(self, loc, item): """ Make new Index inserting new item at location @@ -1260,16 +880,21 @@ def insert(self, loc, item): ------- new_index : Index """ - if is_scalar(item) and isna(item): + if isinstance(item, self._data._recognized_scalars): + item = self._data._scalar_type(item) + elif is_valid_nat_for_dtype(item, self.dtype): # GH 18295 item = self._na_value + elif is_scalar(item) and isna(item): + # i.e. timedeltat64("NaT") + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) freq = None + if isinstance(item, self._data._scalar_type) or item is NaT: + self._data._check_compatible_with(item, setitem=True) - if isinstance(item, (datetime, np.datetime64)): - self._assert_can_do_op(item) - if not self._has_same_tz(item) and not isna(item): - raise ValueError("Passed item and index have different timezone") # check freq can be preserved on edge cases if self.size and self.freq is not None: if item is NaT: @@ -1278,47 +903,21 @@ def insert(self, loc, item): freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq - item = _to_M8(item, tz=self.tz) + item = item.asm8 try: - new_dates = np.concatenate( + new_i8s = np.concatenate( (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) ) - return self._shallow_copy(new_dates, freq=freq) + return self._shallow_copy(new_i8s, freq=freq) except (AttributeError, TypeError): # fall back to object index if isinstance(item, str): return self.astype(object).insert(loc, item) - raise TypeError("cannot insert DatetimeIndex with incompatible label") - - def delete(self, loc): - """ - Make a new DatetimeIndex with passed location(s) deleted. 
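The insert() rewrite above routes recognized scalars through the array's compatibility check and rejects missing values of the wrong resolution instead of coercing them; a hedged sketch of the behaviour this implies (illustrative values only):

import numpy as np
import pandas as pd

dti = pd.date_range("2020-01-01", periods=3, freq="D")

dti.insert(1, pd.Timestamp("2020-06-01"))  # accepted; the freq is dropped
dti.insert(0, pd.NaT)                      # NaT is a valid missing value here

# np.timedelta64("NaT") is scalar and isna(), but not valid for a
# datetime64 index, so the new branch raises instead of coercing:
# dti.insert(0, np.timedelta64("NaT"))  -> TypeError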
- - Parameters - ---------- - loc: int, slice or array of ints - Indicate which sub-arrays to remove. - - Returns - ------- - new_index : DatetimeIndex - """ - new_dates = np.delete(self.asi8, loc) - - freq = None - if is_integer(loc): - if loc in (0, -len(self), -1, len(self) - 1): - freq = self.freq - else: - if is_list_like(loc): - loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) - if isinstance(loc, slice) and loc.step in (1, None): - if loc.start in (0, None) or loc.stop in (len(self), None): - freq = self.freq - - return self._shallow_copy(new_dates, freq=freq) + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) def indexer_at_time(self, time, asof=False): """ @@ -1408,10 +1007,8 @@ def indexer_between_time( return mask.nonzero()[0] -DatetimeIndex._add_comparison_ops() DatetimeIndex._add_numeric_methods_disabled() DatetimeIndex._add_logical_methods_disabled() -DatetimeIndex._add_datetimelike_methods() def date_range( @@ -1423,8 +1020,8 @@ def date_range( normalize=False, name=None, closed=None, - **kwargs -): + **kwargs, +) -> DatetimeIndex: """ Return a fixed frequency DatetimeIndex. @@ -1473,7 +1070,7 @@ def date_range( ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -1573,7 +1170,7 @@ def date_range( tz=tz, normalize=normalize, closed=closed, - **kwargs + **kwargs, ) return DatetimeIndex._simple_new(dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) @@ -1589,8 +1186,8 @@ def bdate_range( weekmask=None, holidays=None, closed=None, - **kwargs -): + **kwargs, +) -> DatetimeIndex: """ Return a fixed frequency DatetimeIndex, with business day as the default frequency. @@ -1644,7 +1241,7 @@ def bdate_range( desired. To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -1659,18 +1256,18 @@ def bdate_range( msg = "freq must be specified for bdate_range; use date_range instead" raise TypeError(msg) - if is_string_like(freq) and freq.startswith("C"): + if isinstance(freq, str) and freq.startswith("C"): try: weekmask = weekmask or "Mon Tue Wed Thu Fri" freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask) except (KeyError, TypeError): - msg = "invalid custom frequency string: {freq}".format(freq=freq) + msg = f"invalid custom frequency string: {freq}" raise ValueError(msg) elif holidays or weekmask: msg = ( "a custom frequency string is required when holidays or " - "weekmask are passed, got frequency {freq}" - ).format(freq=freq) + f"weekmask are passed, got frequency {freq}" + ) raise ValueError(msg) return date_range( @@ -1682,7 +1279,7 @@ def bdate_range( normalize=normalize, name=name, closed=closed, - **kwargs + **kwargs, ) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py new file mode 100644 index 0000000000000..58fcce7e59be7 --- /dev/null +++ b/pandas/core/indexes/extension.py @@ -0,0 +1,242 @@ +""" +Shared methods for Index subclasses backed by ExtensionArray. 
+""" +from typing import List + +import numpy as np + +from pandas.compat.numpy import function as nv +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import ensure_platform_int, is_dtype_equal +from pandas.core.dtypes.generic import ABCSeries + +from pandas.core.arrays import ExtensionArray +from pandas.core.indexes.base import Index, deprecate_ndim_indexing +from pandas.core.ops import get_op_result_name + + +def inherit_from_data(name: str, delegate, cache: bool = False): + """ + Make an alias for a method of the underlying ExtensionArray. + + Parameters + ---------- + name : str + Name of an attribute the class should inherit from its EA parent. + delegate : class + cache : bool, default False + Whether to convert wrapped properties into cache_readonly + + Returns + ------- + attribute, method, property, or cache_readonly + """ + + attr = getattr(delegate, name) + + if isinstance(attr, property): + if cache: + method = cache_readonly(attr.fget) + + else: + + def fget(self): + return getattr(self._data, name) + + def fset(self, value): + setattr(self._data, name, value) + + fget.__name__ = name + fget.__doc__ = attr.__doc__ + + method = property(fget, fset) + + elif not callable(attr): + # just a normal attribute, no wrapping + method = attr + + else: + + def method(self, *args, **kwargs): + result = attr(self._data, *args, **kwargs) + return result + + method.__name__ = name + method.__doc__ = attr.__doc__ + return method + + +def inherit_names(names: List[str], delegate, cache: bool = False): + """ + Class decorator to pin attributes from an ExtensionArray to a Index subclass. + + Parameters + ---------- + names : List[str] + delegate : class + cache : bool, default False + """ + + def wrapper(cls): + for name in names: + meth = inherit_from_data(name, delegate, cache=cache) + setattr(cls, name, meth) + + return cls + + return wrapper + + +def _make_wrapped_comparison_op(opname): + """ + Create a comparison method that dispatches to ``._data``. + """ + + def wrapper(self, other): + if isinstance(other, ABCSeries): + # the arrays defer to Series for comparison ops but the indexes + # don't, so we have to unwrap here. + other = other._values + + other = _maybe_unwrap_index(other) + + op = getattr(self._data, opname) + return op(other) + + wrapper.__name__ = opname + return wrapper + + +def make_wrapped_arith_op(opname): + def method(self, other): + meth = getattr(self._data, opname) + result = meth(_maybe_unwrap_index(other)) + return _wrap_arithmetic_op(self, other, result) + + method.__name__ = opname + return method + + +def _wrap_arithmetic_op(self, other, result): + if result is NotImplemented: + return NotImplemented + + if isinstance(result, tuple): + # divmod, rdivmod + assert len(result) == 2 + return ( + _wrap_arithmetic_op(self, other, result[0]), + _wrap_arithmetic_op(self, other, result[1]), + ) + + if not isinstance(result, Index): + # Index.__new__ will choose appropriate subclass for dtype + result = Index(result) + + res_name = get_op_result_name(self, other) + result.name = res_name + return result + + +def _maybe_unwrap_index(obj): + """ + If operating against another Index object, we need to unwrap the underlying + data before deferring to the DatetimeArray/TimedeltaArray/PeriodArray + implementation, otherwise we will incorrectly return NotImplemented. 
+ + Parameters + ---------- + obj : object + + Returns + ------- + unwrapped object + """ + if isinstance(obj, Index): + return obj._data + return obj + + +class ExtensionIndex(Index): + """ + Index subclass for indexes backed by ExtensionArray. + """ + + _data: ExtensionArray + + __eq__ = _make_wrapped_comparison_op("__eq__") + __ne__ = _make_wrapped_comparison_op("__ne__") + __lt__ = _make_wrapped_comparison_op("__lt__") + __gt__ = _make_wrapped_comparison_op("__gt__") + __le__ = _make_wrapped_comparison_op("__le__") + __ge__ = _make_wrapped_comparison_op("__ge__") + + def __getitem__(self, key): + result = self._data[key] + if isinstance(result, type(self._data)): + return type(self)(result, name=self.name) + + # Includes cases where we get a 2D ndarray back for MPL compat + deprecate_ndim_indexing(result) + return result + + def __iter__(self): + return self._data.__iter__() + + @property + def _ndarray_values(self) -> np.ndarray: + return self._data._ndarray_values + + def dropna(self, how="any"): + if how not in ("any", "all"): + raise ValueError(f"invalid how option: {how}") + + if self.hasnans: + return self._shallow_copy(self._data[~self._isnan]) + return self._shallow_copy() + + def repeat(self, repeats, axis=None): + nv.validate_repeat(tuple(), dict(axis=axis)) + result = self._data.repeat(repeats, axis=axis) + return self._shallow_copy(result) + + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + + taken = self._assert_take_fillable( + self._data, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value, + ) + return type(self)(taken, name=self.name) + + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + + result = self._data.unique() + return self._shallow_copy(result) + + def _get_unique_index(self, dropna=False): + if self.is_unique and not dropna: + return self + + result = self._data.unique() + if dropna and self.hasnans: + result = result[~result.isna()] + return self._shallow_copy(result) + + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype) and copy is False: + # Ensure that self.astype(self.dtype) is self + return self + + new_values = self._data.astype(dtype, copy=copy) + + # pass copy=False because any copying will be done in the + # _data.astype call above + return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index a6c39d049c50c..909643d50e9d7 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -4,17 +4,10 @@ These are used for: - .names (FrozenList) -- .levels & .codes (FrozenNDArray) """ -import warnings - -import numpy as np - -from pandas.util._decorators import deprecate_kwarg - -from pandas.core.dtypes.cast import coerce_indexer_dtype +from typing import Any from pandas.core.base import PandasObject @@ -31,7 +24,7 @@ class FrozenList(PandasObject, list): # Side note: This has to be of type list. Otherwise, # it messes up PyTables type checks. - def union(self, other): + def union(self, other) -> "FrozenList": """ Returns a FrozenList with other concatenated to the end of self. @@ -42,14 +35,14 @@ def union(self, other): Returns ------- - diff : FrozenList + FrozenList The collection difference between self and other. 
""" if isinstance(other, tuple): other = list(other) return type(self)(super().__add__(other)) - def difference(self, other): + def difference(self, other) -> "FrozenList": """ Returns a FrozenList with elements from other removed from self. @@ -60,7 +53,7 @@ def difference(self, other): Returns ------- - diff : FrozenList + FrozenList The collection difference between self and other. """ other = set(other) @@ -72,15 +65,15 @@ def difference(self, other): def __getitem__(self, n): if isinstance(n, slice): - return self.__class__(super().__getitem__(n)) + return type(self)(super().__getitem__(n)) return super().__getitem__(n) def __radd__(self, other): if isinstance(other, tuple): other = list(other) - return self.__class__(other + list(self)) + return type(self)(other + list(self)) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if isinstance(other, (tuple, FrozenList)): other = list(other) return super().__eq__(other) @@ -88,100 +81,27 @@ def __eq__(self, other): __req__ = __eq__ def __mul__(self, other): - return self.__class__(super().__mul__(other)) + return type(self)(super().__mul__(other)) __imul__ = __mul__ def __reduce__(self): - return self.__class__, (list(self),) + return type(self), (list(self),) def __hash__(self): return hash(tuple(self)) def _disabled(self, *args, **kwargs): - """This method will not function because object is immutable.""" - raise TypeError( - "'%s' does not support mutable operations." % self.__class__.__name__ - ) + """ + This method will not function because object is immutable. + """ + raise TypeError(f"'{type(self).__name__}' does not support mutable operations.") - def __str__(self): + def __str__(self) -> str: return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) - def __repr__(self): - return "%s(%s)" % (self.__class__.__name__, str(self)) + def __repr__(self) -> str: + return f"{type(self).__name__}({str(self)})" __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled pop = append = extend = remove = sort = insert = _disabled - - -class FrozenNDArray(PandasObject, np.ndarray): - - # no __array_finalize__ for now because no metadata - def __new__(cls, data, dtype=None, copy=False): - warnings.warn( - "\nFrozenNDArray is deprecated and will be removed in a " - "future version.\nPlease use `numpy.ndarray` instead.\n", - FutureWarning, - stacklevel=2, - ) - - if copy is None: - copy = not isinstance(data, FrozenNDArray) - res = np.array(data, dtype=dtype, copy=copy).view(cls) - return res - - def _disabled(self, *args, **kwargs): - """This method will not function because object is immutable.""" - raise TypeError("'%s' does not support mutable operations." % self.__class__) - - __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled - put = itemset = fill = _disabled - - def _shallow_copy(self): - return self.view() - - def values(self): - """returns *copy* of underlying array""" - arr = self.view(np.ndarray).copy() - return arr - - def __repr__(self): - """ - Return a string representation for this object. - """ - prepr = pprint_thing(self, escape_chars=("\t", "\r", "\n"), quote_strings=True) - return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) - - @deprecate_kwarg(old_arg_name="v", new_arg_name="value") - def searchsorted(self, value, side="left", sorter=None): - """ - Find indices to insert `value` so as to maintain order. - - For full documentation, see `numpy.searchsorted` - - See Also - -------- - numpy.searchsorted : Equivalent function. 
- """ - - # We are much more performant if the searched - # indexer is the same type as the array. - # - # This doesn't matter for int64, but DOES - # matter for smaller int dtypes. - # - # xref: https://github.com/numpy/numpy/issues/5370 - try: - value = self.dtype.type(value) - except ValueError: - pass - - return super().searchsorted(value, side=side, sorter=sorter) - - -def _ensure_frozen(array_like, categories, copy=False): - array_like = coerce_indexer_dtype(array_like, categories) - array_like = array_like.view(FrozenNDArray) - if copy: - array_like = array_like.copy() - return array_like diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c9554016630cd..1c86235f9eaa1 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -2,7 +2,6 @@ from operator import le, lt import textwrap from typing import Any, Optional, Tuple, Union -import warnings import numpy as np @@ -10,6 +9,7 @@ from pandas._libs import Timedelta, Timestamp, lib from pandas._libs.interval import Interval, IntervalMixin, IntervalTree +from pandas._typing import AnyArrayLike from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._exceptions import rewrite_exception @@ -20,6 +20,7 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, + is_categorical, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, @@ -36,7 +37,8 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna -from pandas._typing import AnyArrayLike +from pandas.core import accessor +from pandas.core.algorithms import take_1d from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase @@ -46,8 +48,10 @@ _index_shared_docs, default_pprint, ensure_index, + maybe_extract_name, ) from pandas.core.indexes.datetimes import DatetimeIndex, date_range +from pandas.core.indexes.extension import ExtensionIndex, inherit_names from pandas.core.indexes.multi import MultiIndex from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.core.ops import get_op_result_name @@ -84,9 +88,7 @@ def _get_next_label(label): elif is_float_dtype(dtype): return np.nextafter(label, np.infty) else: - raise TypeError( - "cannot determine next label for type {typ!r}".format(typ=type(label)) - ) + raise TypeError(f"cannot determine next label for type {repr(type(label))}") def _get_prev_label(label): @@ -100,28 +102,13 @@ def _get_prev_label(label): elif is_float_dtype(dtype): return np.nextafter(label, -np.infty) else: - raise TypeError( - "cannot determine next label for type {typ!r}".format(typ=type(label)) - ) - - -def _get_interval_closed_bounds(interval): - """ - Given an Interval or IntervalIndex, return the corresponding interval with - closed bounds. - """ - left, right = interval.left, interval.right - if interval.open_left: - left = _get_next_label(left) - if interval.open_right: - right = _get_prev_label(right) - return left, right + raise TypeError(f"cannot determine next label for type {repr(type(label))}") def _new_IntervalIndex(cls, d): """ This is called upon unpickling, rather than the default which doesn't have - arguments and breaks __new__ + arguments and breaks __new__. 
""" return cls.from_arrays(**d) @@ -146,21 +133,19 @@ def func(intvidx_self, other, sort=False): result = result.astype(intvidx_self.dtype) return result elif intvidx_self.closed != other.closed: - msg = ( + raise ValueError( "can only do set operations between two IntervalIndex " "objects that are closed on the same side" ) - raise ValueError(msg) # GH 19016: ensure set op will not return a prohibited dtype subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype] common_subtype = find_common_type(subtypes) if is_object_dtype(common_subtype): - msg = ( - "can only do {op} between two IntervalIndex " + raise TypeError( + f"can only do {self.op_name} between two IntervalIndex " "objects that have compatible dtypes" ) - raise TypeError(msg.format(op=self.op_name)) return setop(intvidx_self, other, sort) @@ -198,7 +183,31 @@ def func(intvidx_self, other, sort=False): ), ) ) -class IntervalIndex(IntervalMixin, Index): +@accessor.delegate_names( + delegate=IntervalArray, + accessors=["length", "size", "left", "right", "mid", "closed", "dtype"], + typ="property", + overwrite=True, +) +@accessor.delegate_names( + delegate=IntervalArray, + accessors=[ + "__array__", + "overlaps", + "contains", + "__len__", + "set_closed", + "to_tuples", + ], + typ="method", + overwrite=True, +) +@inherit_names( + ["is_non_overlapping_monotonic", "mid", "_ndarray_values"], + IntervalArray, + cache=True, +) +class IntervalIndex(IntervalMixin, ExtensionIndex, accessor.PandasDelegate): _typ = "intervalindex" _comparables = ["name"] _attributes = ["name", "closed"] @@ -209,15 +218,22 @@ class IntervalIndex(IntervalMixin, Index): # Immutable, so we are able to cache computations like isna in '_mask' _mask = None + _raw_inherit = {"__array__", "overlaps", "contains"} + # -------------------------------------------------------------------- # Constructors def __new__( - cls, data, closed=None, dtype=None, copy=False, name=None, verify_integrity=True + cls, + data, + closed=None, + dtype=None, + copy: bool = False, + name=None, + verify_integrity: bool = True, ): - if name is None and hasattr(data, "name"): - name = data.name + name = maybe_extract_name(name, data, cls) with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray( @@ -246,6 +262,7 @@ def _simple_new(cls, array, name, closed=None): result = IntervalMixin.__new__(cls) result._data = array result.name = name + result._no_setting_name = False result._reset_identity() return result @@ -266,7 +283,9 @@ def _simple_new(cls, array, name, closed=None): ), ) ) - def from_breaks(cls, breaks, closed="right", name=None, copy=False, dtype=None): + def from_breaks( + cls, breaks, closed: str = "right", name=None, copy: bool = False, dtype=None + ): with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray.from_breaks( breaks, closed=closed, copy=copy, dtype=dtype @@ -291,7 +310,13 @@ def from_breaks(cls, breaks, closed="right", name=None, copy=False, dtype=None): ) ) def from_arrays( - cls, left, right, closed="right", name=None, copy=False, dtype=None + cls, + left, + right, + closed: str = "right", + name=None, + copy: bool = False, + dtype=None, ): with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray.from_arrays( @@ -316,7 +341,9 @@ def from_arrays( ), ) ) - def from_tuples(cls, data, closed="right", name=None, copy=False, dtype=None): + def from_tuples( + cls, data, closed: str = "right", name=None, copy: bool = False, dtype=None + ): with rewrite_exception("IntervalArray", cls.__name__): arr = 
IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) return cls._simple_new(arr, name=name) @@ -332,7 +359,9 @@ def _shallow_copy(self, left=None, right=None, **kwargs): @cache_readonly def _isnan(self): - """Return a mask indicating if each value is NA""" + """ + Return a mask indicating if each value is NA. + """ if self._mask is None: self._mask = isna(self.left) return self._mask @@ -343,7 +372,7 @@ def _engine(self): right = self._maybe_convert_i8(self.right) return IntervalTree(left, right, closed=self.closed) - def __contains__(self, key): + def __contains__(self, key) -> bool: """ return a boolean if this key is IN the index We *only* accept an Interval @@ -354,7 +383,7 @@ def __contains__(self, key): Returns ------- - boolean + bool """ if not isinstance(key, Interval): return False @@ -365,112 +394,10 @@ def __contains__(self, key): except KeyError: return False - @Appender( - _interval_shared_docs["to_tuples"] - % dict( - return_type="Index", - examples=""" - Examples - -------- - >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3]) - >>> idx.to_tuples() - Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') - >>> idx.to_tuples(na_tuple=False) - Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') - """, - ) - ) - def to_tuples(self, na_tuple=True): - tuples = self._data.to_tuples(na_tuple=na_tuple) - return Index(tuples) - @cache_readonly def _multiindex(self): return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) - @property - def left(self): - """ - Return the left endpoints of each Interval in the IntervalIndex as - an Index - """ - return self._data._left - - @property - def right(self): - """ - Return the right endpoints of each Interval in the IntervalIndex as - an Index - """ - return self._data._right - - @property - def closed(self): - """ - Whether the intervals are closed on the left-side, right-side, both or - neither - """ - return self._data._closed - - @Appender( - _interval_shared_docs["set_closed"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( - """\ - Examples - -------- - >>> index = pd.interval_range(0, 3) - >>> index - IntervalIndex([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') - >>> index.set_closed('both') - IntervalIndex([[0, 1], [1, 2], [2, 3]], - closed='both', - dtype='interval[int64]') - """ - ), - ) - ) - def set_closed(self, closed): - if closed not in _VALID_CLOSED: - msg = "invalid option for 'closed': {closed}" - raise ValueError(msg.format(closed=closed)) - - # return self._shallow_copy(closed=closed) - array = self._data.set_closed(closed) - return self._simple_new(array, self.name) - - @property - def length(self): - """ - Return an Index with entries denoting the length of each Interval in - the IntervalIndex - """ - return self._data.length - - @property - def size(self): - # Avoid materializing ndarray[Interval] - return self._data.size - - @property - def itemsize(self): - msg = ( - "IntervalIndex.itemsize is deprecated and will be removed in " - "a future version" - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - - # suppress the warning from the underlying left/right itemsize - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - return self.left.itemsize + self.right.itemsize - - def __len__(self): - return len(self.left) - @cache_readonly def values(self): """ @@ -482,14 +409,6 @@ def values(self): def _values(self): return self._data - @cache_readonly - def _ndarray_values(self): - return 
np.array(self._data) - - def __array__(self, result=None): - """ the array interface, return my values """ - return self._ndarray_values - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result @@ -497,68 +416,31 @@ def __array_wrap__(self, result, context=None): def __reduce__(self): d = dict(left=self.left, right=self.right) d.update(self._get_attributes_dict()) - return _new_IntervalIndex, (self.__class__, d), None - - @Appender(_index_shared_docs["copy"]) - def copy(self, deep=False, name=None): - array = self._data - if deep: - array = array.copy() - attributes = self._get_attributes_dict() - if name is not None: - attributes.update(name=name) - - return self._simple_new(array, **attributes) + return _new_IntervalIndex, (type(self), d), None @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): - with rewrite_exception("IntervalArray", self.__class__.__name__): + with rewrite_exception("IntervalArray", type(self).__name__): new_values = self.values.astype(dtype, copy=copy) if is_interval_dtype(new_values): return self._shallow_copy(new_values.left, new_values.right) - return super().astype(dtype, copy=copy) - - @cache_readonly - def dtype(self): - """Return the dtype object of the underlying data""" - return self._data.dtype + return Index.astype(self, dtype, copy=copy) @property - def inferred_type(self): + def inferred_type(self) -> str: """Return a string of the type inferred from the values""" return "interval" @Appender(Index.memory_usage.__doc__) - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: # we don't use an explicit engine # so return the bytes here return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) + # IntervalTree doesn't have a is_monotonic_decreasing, so have to override + # the Index implemenation @cache_readonly - def mid(self): - """ - Return the midpoint of each Interval in the IntervalIndex as an Index - """ - return self._data.mid - - @cache_readonly - def is_monotonic(self): - """ - Return True if the IntervalIndex is monotonic increasing (only equal or - increasing values), else False - """ - return self.is_monotonic_increasing - - @cache_readonly - def is_monotonic_increasing(self): - """ - Return True if the IntervalIndex is monotonic increasing (only equal or - increasing values), else False - """ - return self._engine.is_monotonic_increasing - - @cache_readonly - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ Return True if the IntervalIndex is monotonic decreasing (only equal or decreasing values), else False @@ -568,7 +450,7 @@ def is_monotonic_decreasing(self): @cache_readonly def is_unique(self): """ - Return True if the IntervalIndex contains unique elements, else False + Return True if the IntervalIndex contains unique elements, else False. 
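Since the __contains__ shown above only accepts Interval objects, membership tests with scalars simply return False rather than checking interval coverage; a brief hedged sketch with illustrative values:

import pandas as pd

ii = pd.IntervalIndex.from_breaks([0, 1, 2])   # (0, 1], (1, 2], closed='right'

pd.Interval(0, 1, closed="right") in ii   # True, this exact interval is present
pd.Interval(0, 1, closed="left") in ii    # False, the closed side differs
0.5 in ii                                 # False, scalars are never members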
""" left = self.left right = self.right @@ -589,11 +471,6 @@ def is_unique(self): return True - @cache_readonly - @Appender(_interval_shared_docs["is_non_overlapping_monotonic"] % _index_doc_kwargs) - def is_non_overlapping_monotonic(self): - return self._data.is_non_overlapping_monotonic - @property def is_overlapping(self): """ @@ -672,26 +549,6 @@ def _convert_list_indexer(self, keyarr, kind=None): return locs - def _maybe_cast_indexed(self, key): - """ - we need to cast the key, which could be a scalar - or an array-like to the type of our subtype - """ - if isinstance(key, IntervalIndex): - return key - - subtype = self.dtype.subtype - if is_float_dtype(subtype): - if is_integer(key): - key = float(key) - elif isinstance(key, (np.ndarray, Index)): - key = key.astype("float64") - elif is_integer_dtype(subtype): - if is_integer(key): - key = int(key) - - return key - def _can_reindex(self, indexer: np.ndarray) -> None: """ Check if we are allowing reindexing with this particular indexer. @@ -725,7 +582,7 @@ def _needs_i8_conversion(self, key): Returns ------- - boolean + bool """ if is_interval_dtype(key) or isinstance(key, Interval): return self._needs_i8_conversion(key.left) @@ -746,7 +603,7 @@ def _maybe_convert_i8(self, key): Returns ------- - key: scalar or list-like + scalar or list-like The original key if no conversion occurred, int if converted scalar, Int64Index if converted list-like. """ @@ -778,12 +635,12 @@ def _maybe_convert_i8(self, key): # ensure consistency with IntervalIndex subtype subtype = self.dtype.subtype - msg = ( - "Cannot index an IntervalIndex of subtype {subtype} with " - "values of dtype {other}" - ) + if not is_dtype_equal(subtype, key_dtype): - raise ValueError(msg.format(subtype=subtype, other=key_dtype)) + raise ValueError( + f"Cannot index an IntervalIndex of subtype {subtype} with " + f"values of dtype {key_dtype}" + ) return key_i8 @@ -792,22 +649,21 @@ def _check_method(self, method): return if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: - msg = "method {method} not yet implemented for IntervalIndex" - raise NotImplementedError(msg.format(method=method)) + raise NotImplementedError( + f"method {method} not yet implemented for IntervalIndex" + ) raise ValueError("Invalid fill method") def _searchsorted_monotonic(self, label, side, exclude_label=False): if not self.is_non_overlapping_monotonic: raise KeyError( - "can only get slices from an IntervalIndex if " - "bounds are non-overlapping and all monotonic " - "increasing or decreasing" + "can only get slices from an IntervalIndex if bounds are " + "non-overlapping and all monotonic increasing or decreasing" ) if isinstance(label, IntervalMixin): - msg = "Interval objects are not currently supported" - raise NotImplementedError(msg) + raise NotImplementedError("Interval objects are not currently supported") # GH 20921: "not is_monotonic_increasing" for the second condition # instead of "is_monotonic_decreasing" to account for single element @@ -825,34 +681,6 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): return sub_idx._searchsorted_monotonic(label, side) - def _find_non_overlapping_monotonic_bounds(self, key): - if isinstance(key, IntervalMixin): - start = self._searchsorted_monotonic( - key.left, "left", exclude_label=key.open_left - ) - stop = self._searchsorted_monotonic( - key.right, "right", exclude_label=key.open_right - ) - elif isinstance(key, slice): - # slice - start, stop = key.start, key.stop - if (key.step or 1) != 1: - raise 
NotImplementedError("cannot slice with a slice step") - if start is None: - start = 0 - else: - start = self._searchsorted_monotonic(start, "left") - if stop is None: - stop = len(self) - else: - stop = self._searchsorted_monotonic(stop, "right") - else: - # scalar or index-like - - start = self._searchsorted_monotonic(key, "left") - stop = self._searchsorted_monotonic(key, "right") - return start, stop - def get_loc( self, key: Any, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: @@ -867,7 +695,7 @@ def get_loc( Returns ------- - loc : int if unique index, slice if monotonic index, else mask + int if unique index, slice if monotonic index, else mask Examples -------- @@ -935,7 +763,7 @@ def get_loc( None is specified as these are not yet implemented. """ ) - } + }, ) ) @Appender(_index_shared_docs["get_indexer"]) @@ -950,11 +778,10 @@ def get_indexer( self._check_method(method) if self.is_overlapping: - msg = ( - "cannot handle overlapping indices; use " - "IntervalIndex.get_indexer_non_unique" + raise InvalidIndexError( + "cannot handle overlapping indices; " + "use IntervalIndex.get_indexer_non_unique" ) - raise InvalidIndexError(msg) target_as_index = ensure_index(target) @@ -976,6 +803,10 @@ def get_indexer( left_indexer = self.left.get_indexer(target_as_index.left) right_indexer = self.right.get_indexer(target_as_index.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) + elif is_categorical(target_as_index): + # get an indexer for unique categories then propagate to codes via take_1d + categories_indexer = self.get_indexer(target_as_index.categories) + indexer = take_1d(categories_indexer, target_as_index.codes, fill_value=-1) elif not is_object_dtype(target_as_index): # homogeneous scalar index: use IntervalTree target_as_index = self._maybe_convert_i8(target_as_index) @@ -1084,7 +915,7 @@ def delete(self, loc): Returns ------- - new_index : IntervalIndex + IntervalIndex """ new_left = self.left.delete(loc) new_right = self.right.delete(loc) @@ -1103,7 +934,7 @@ def insert(self, loc, item): Returns ------- - new_index : IntervalIndex + IntervalIndex """ if isinstance(item, Interval): if item.closed != self.closed: @@ -1130,11 +961,10 @@ def _concat_same_dtype(self, to_concat, name): we allow a 0-len index here as well """ if not len({i.closed for i in to_concat if len(i)}) == 1: - msg = ( + raise ValueError( "can only append two IntervalIndex objects " "that are closed on the same side" ) - raise ValueError(msg) return super()._concat_same_dtype(to_concat, name) @Appender(_index_shared_docs["take"] % _index_doc_kwargs) @@ -1142,8 +972,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): result = self._data.take( indices, axis=axis, allow_fill=allow_fill, fill_value=fill_value, **kwargs ) - attributes = self._get_attributes_dict() - return self._simple_new(result, **attributes) + return self._shallow_copy(result) def __getitem__(self, value): result = self._data[value] @@ -1177,23 +1006,24 @@ def _format_data(self, name=None): summary = "[]" elif n == 1: first = formatter(self[0]) - summary = "[{first}]".format(first=first) + summary = f"[{first}]" elif n == 2: first = formatter(self[0]) last = formatter(self[-1]) - summary = "[{first}, {last}]".format(first=first, last=last) + summary = f"[{first}, {last}]" else: if n > max_seq_items: n = min(max_seq_items // 2, 10) head = [formatter(x) for x in self[:n]] tail = [formatter(x) for x in self[-n:]] - summary = "[{head} ... 
{tail}]".format( - head=", ".join(head), tail=", ".join(tail) - ) + head_joined = ", ".join(head) + tail_joined = ", ".join(tail) + summary = f"[{head_joined} ... {tail_joined}]" else: tail = [formatter(x) for x in self] - summary = "[{tail}]".format(tail=", ".join(tail)) + joined = ", ".join(tail) + summary = f"[{joined}]" return summary + "," + self._format_space() @@ -1201,21 +1031,21 @@ def _format_attrs(self): attrs = [("closed", repr(self.closed))] if self.name is not None: attrs.append(("name", default_pprint(self.name))) - attrs.append(("dtype", "'{dtype}'".format(dtype=self.dtype))) + attrs.append(("dtype", f"'{self.dtype}'")) return attrs - def _format_space(self): - space = " " * (len(self.__class__.__name__) + 1) - return "\n{space}".format(space=space) + def _format_space(self) -> str: + space = " " * (len(type(self).__name__) + 1) + return f"\n{space}" # -------------------------------------------------------------------- def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) - def equals(self, other): + def equals(self, other) -> bool: """ - Determines if two IntervalIndex objects contain the same elements + Determines if two IntervalIndex objects contain the same elements. """ if self.is_(other): return True @@ -1233,44 +1063,6 @@ def equals(self, other): and self.closed == other.closed ) - @Appender( - _interval_shared_docs["contains"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( - """\ - >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)]) - >>> intervals - IntervalIndex([(0, 1], (1, 3], (2, 4]], - closed='right', - dtype='interval[int64]') - >>> intervals.contains(0.5) - array([ True, False, False]) - """ - ), - ) - ) - def contains(self, other): - return self._data.contains(other) - - @Appender( - _interval_shared_docs["overlaps"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( - """\ - >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)]) - >>> intervals - IntervalIndex([(0, 1], (1, 3], (2, 4]], - closed='right', - dtype='interval[int64]') - """ - ), - ) - ) - def overlaps(self, other): - return self._data.overlaps(other) - @Appender(_index_shared_docs["intersection"]) @SetopCheck(op_name="intersection") def intersection( @@ -1303,7 +1095,7 @@ def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex": Returns ------- - taken : IntervalIndex + IntervalIndex """ lindexer = self.left.get_indexer(other.left) rindexer = self.right.get_indexer(other.right) @@ -1325,7 +1117,7 @@ def _intersection_non_unique(self, other: "IntervalIndex") -> "IntervalIndex": Returns ------- - taken : IntervalIndex + IntervalIndex """ mask = np.zeros(len(self), dtype=bool) @@ -1357,7 +1149,7 @@ def func(self, other, sort=sort): return func @property - def is_all_dates(self): + def is_all_dates(self) -> bool: """ This is False even when left/right contain datetime-like objects, as the check is done on the Interval itself @@ -1370,12 +1162,42 @@ def is_all_dates(self): # TODO: arithmetic operations + def _delegate_property_get(self, name, *args, **kwargs): + """ method delegation to the ._values """ + prop = getattr(self._data, name) + return prop # no wrapping for now + + def _delegate_method(self, name, *args, **kwargs): + """ method delegation to the ._data """ + method = getattr(self._data, name) + res = method(*args, **kwargs) + if is_scalar(res) or name in self._raw_inherit: + return res + if isinstance(res, IntervalArray): + return type(self)._simple_new(res, name=self.name) + return 
Index(res) + + # GH#30817 until IntervalArray implements inequalities, get them from Index + def __lt__(self, other): + return Index.__lt__(self, other) + + def __le__(self, other): + return Index.__le__(self, other) + + def __gt__(self, other): + return Index.__gt__(self, other) + + def __ge__(self, other): + return Index.__ge__(self, other) + IntervalIndex._add_logical_methods_disabled() -def _is_valid_endpoint(endpoint): - """helper for interval_range to check if start/end are valid types""" +def _is_valid_endpoint(endpoint) -> bool: + """ + Helper for interval_range to check if start/end are valid types. + """ return any( [ is_number(endpoint), @@ -1386,8 +1208,10 @@ def _is_valid_endpoint(endpoint): ) -def _is_type_compatible(a, b): - """helper for interval_range to check type compat of start/end/freq""" +def _is_type_compatible(a, b) -> bool: + """ + Helper for interval_range to check type compat of start/end/freq. + """ is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset)) is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset)) return ( @@ -1407,24 +1231,24 @@ def interval_range( Parameters ---------- start : numeric or datetime-like, default None - Left bound for generating intervals + Left bound for generating intervals. end : numeric or datetime-like, default None - Right bound for generating intervals + Right bound for generating intervals. periods : int, default None - Number of periods to generate + Number of periods to generate. freq : numeric, str, or DateOffset, default None The length of each interval. Must be consistent with the type of start and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1 for numeric and 'D' for datetime-like. name : str, default None - Name of the resulting IntervalIndex + Name of the resulting IntervalIndex. closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or neither. Returns ------- - rng : IntervalIndex + IntervalIndex See Also -------- @@ -1438,7 +1262,7 @@ def interval_range( ``start`` and ``end``, inclusively. To learn more about datetime-like frequency strings, please see `this link - `__. + `__. 
Examples -------- @@ -1502,25 +1326,21 @@ def interval_range( ) if not _is_valid_endpoint(start): - msg = "start must be numeric or datetime-like, got {start}" - raise ValueError(msg.format(start=start)) + raise ValueError(f"start must be numeric or datetime-like, got {start}") elif not _is_valid_endpoint(end): - msg = "end must be numeric or datetime-like, got {end}" - raise ValueError(msg.format(end=end)) + raise ValueError(f"end must be numeric or datetime-like, got {end}") if is_float(periods): periods = int(periods) elif not is_integer(periods) and periods is not None: - msg = "periods must be a number, got {periods}" - raise TypeError(msg.format(periods=periods)) + raise TypeError(f"periods must be a number, got {periods}") if freq is not None and not is_number(freq): try: freq = to_offset(freq) except ValueError: raise ValueError( - "freq must be numeric or convertible to " - "DateOffset, got {freq}".format(freq=freq) + f"freq must be numeric or convertible to DateOffset, got {freq}" ) # verify type compatibility diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dc2abfb0cb6eb..84d7399cc4f2d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,6 +1,6 @@ -from collections import OrderedDict import datetime from sys import getsizeof +from typing import Hashable, List, Optional, Sequence, Union import warnings import numpy as np @@ -11,8 +11,9 @@ from pandas._libs.hashtable import duplicated_int64 from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning, UnsortedIndexError -from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg +from pandas.util._decorators import Appender, cache_readonly +from pandas.core.dtypes.cast import coerce_indexer_dtype from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, @@ -31,7 +32,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import Categorical -from pandas.core.arrays.categorical import _factorize_from_iterables +from pandas.core.arrays.categorical import factorize_from_iterables import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( @@ -40,7 +41,7 @@ _index_shared_docs, ensure_index, ) -from pandas.core.indexes.frozen import FrozenList, _ensure_frozen +from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing from pandas.core.sorting import ( get_group_index, @@ -60,8 +61,6 @@ dict(klass="MultiIndex", target_klass="MultiIndex or list of tuples") ) -_no_default_names = object() - class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): """ @@ -84,7 +83,7 @@ def _codes_to_ints(self, codes): Returns ------- - int_keys : scalar or 1-dimensional array, of dtype uint64 + scalar or 1-dimensional array, of dtype uint64 Integer(s) representing one combination (each). """ # Shift the representation of each level by the pre-calculated number @@ -124,7 +123,7 @@ def _codes_to_ints(self, codes): Returns ------- - int_keys : int, or 1-dimensional array of dtype object + int, or 1-dimensional array of dtype object Integer(s) representing one combination (each). """ @@ -156,11 +155,6 @@ class MultiIndex(Index): Integers for each level designating which label at each location. .. versionadded:: 0.24.0 - labels : sequence of arrays - Integers for each level designating which label at each location. - - .. 
deprecated:: 0.24.0 - Use ``codes`` instead sortorder : optional int Level of sortedness (must be lexicographically sorted by that level). @@ -195,6 +189,7 @@ class MultiIndex(Index): swaplevel reorder_levels remove_unused_levels + get_locs See Also -------- @@ -208,7 +203,7 @@ class MultiIndex(Index): Notes ----- See the `user guide - `_ + `_ for more. Examples @@ -229,9 +224,7 @@ class MultiIndex(Index): of the mentioned helper methods. """ - _deprecations = Index._deprecations | frozenset( - ["labels", "set_labels", "to_hierarchical"] - ) + _deprecations = Index._deprecations | frozenset() # initialize to zero-length tuples to make everything work _typ = "multiindex" @@ -244,7 +237,6 @@ class MultiIndex(Index): # -------------------------------------------------------------------- # Constructors - @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") def __new__( cls, levels=None, @@ -254,8 +246,8 @@ def __new__( dtype=None, copy=False, name=None, - verify_integrity=True, - _set_identity=True, + verify_integrity: bool = True, + _set_identity: bool = True, ): # compat with Index @@ -293,7 +285,7 @@ def __new__( return result - def _validate_codes(self, level: list, code: list): + def _validate_codes(self, level: List, code: List): """ Reassign code values as -1 if their corresponding levels are NaN. @@ -306,7 +298,7 @@ def _validate_codes(self, level: list, code: list): Returns ------- - code : new code where code value = -1 if it corresponds + new code where code value = -1 if it corresponds to a level with missing values (NaN, NaT, None). """ null_mask = isna(level) @@ -314,9 +306,10 @@ def _validate_codes(self, level: list, code: list): code = np.where(null_mask[code], -1, code) return code - def _verify_integrity(self, codes=None, levels=None): + def _verify_integrity( + self, codes: Optional[List] = None, levels: Optional[List] = None + ): """ - Parameters ---------- codes : optional list @@ -332,7 +325,7 @@ def _verify_integrity(self, codes=None, levels=None): Returns ------- - codes : new codes where code value = -1 if it corresponds to a + new codes where code value = -1 if it corresponds to a NaN level. """ # NOTE: Currently does not check, among other things, that cached @@ -342,42 +335,33 @@ def _verify_integrity(self, codes=None, levels=None): if len(levels) != len(codes): raise ValueError( - "Length of levels and codes must match. NOTE:" - " this index is in an inconsistent state." + "Length of levels and codes must match. NOTE: " + "this index is in an inconsistent state." ) codes_length = len(codes[0]) for i, (level, level_codes) in enumerate(zip(levels, codes)): if len(level_codes) != codes_length: raise ValueError( - "Unequal code lengths: %s" % ([len(code_) for code_ in codes]) + f"Unequal code lengths: {[len(code_) for code_ in codes]}" ) if len(level_codes) and level_codes.max() >= len(level): - msg = ( - "On level {level}, code max ({max_code}) >= length of " - "level ({level_len}). NOTE: this index is in an " - "inconsistent state".format( - level=i, max_code=level_codes.max(), level_len=len(level) - ) - ) - raise ValueError(msg) - if len(level_codes) and level_codes.min() < -1: raise ValueError( - "On level {level}, code value ({code})" - " < -1".format(level=i, code=level_codes.min()) + f"On level {i}, code max ({level_codes.max()}) >= length of " + f"level ({len(level)}). 
NOTE: this index is in an " + "inconsistent state" ) + if len(level_codes) and level_codes.min() < -1: + raise ValueError(f"On level {i}, code value ({level_codes.min()}) < -1") if not level.is_unique: raise ValueError( - "Level values must be unique: {values} on " - "level {level}".format(values=[value for value in level], level=i) + f"Level values must be unique: {list(level)} on level {i}" ) if self.sortorder is not None: if self.sortorder > self._lexsort_depth(): raise ValueError( - "Value for sortorder must be inferior or equal " - "to actual lexsort_depth: " - "sortorder {sortorder} with lexsort_depth {lexsort_depth}".format( - sortorder=self.sortorder, lexsort_depth=self._lexsort_depth() - ) + "Value for sortorder must be inferior or equal to actual " + f"lexsort_depth: sortorder {self.sortorder} " + f"with lexsort_depth {self._lexsort_depth()}" ) codes = [ @@ -387,7 +371,7 @@ def _verify_integrity(self, codes=None, levels=None): return new_codes @classmethod - def from_arrays(cls, arrays, sortorder=None, names=_no_default_names): + def from_arrays(cls, arrays, sortorder=None, names=lib.no_default): """ Convert arrays to MultiIndex. @@ -404,7 +388,7 @@ def from_arrays(cls, arrays, sortorder=None, names=_no_default_names): Returns ------- - index : MultiIndex + MultiIndex See Also -------- @@ -440,8 +424,8 @@ def from_arrays(cls, arrays, sortorder=None, names=_no_default_names): if len(arrays[i]) != len(arrays[i - 1]): raise ValueError("all arrays must be same length") - codes, levels = _factorize_from_iterables(arrays) - if names is _no_default_names: + codes, levels = factorize_from_iterables(arrays) + if names is lib.no_default: names = [getattr(arr, "name", None) for arr in arrays] return MultiIndex( @@ -469,7 +453,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): Returns ------- - index : MultiIndex + MultiIndex See Also -------- @@ -496,8 +480,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): if len(tuples) == 0: if names is None: - msg = "Cannot infer number of levels from empty list" - raise TypeError(msg) + raise TypeError("Cannot infer number of levels from empty list") arrays = [[]] * len(names) elif isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): @@ -512,7 +495,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) @classmethod - def from_product(cls, iterables, sortorder=None, names=_no_default_names): + def from_product(cls, iterables, sortorder=None, names=lib.no_default): """ Make a MultiIndex from the cartesian product of multiple iterables. 
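The hunks above swap the private `_no_default_names` sentinel for `lib.no_default` and `_factorize_from_iterables` for its public counterpart without changing behaviour: when `names` is not supplied, `MultiIndex.from_arrays` still falls back to the `.name` attribute of each input array. A minimal sketch of that behaviour follows; the sample data is invented for illustration and is not part of the patch.

import pandas as pd

# Series carry a .name, so from_arrays can infer the level names when the
# `names` argument is left at its default sentinel.
letters = pd.Series(["a", "a", "b"], name="letters")
numbers = pd.Series([1, 2, 1], name="numbers")

mi = pd.MultiIndex.from_arrays([letters, numbers])
print(mi.names)  # FrozenList(['letters', 'numbers'])

# from_tuples builds the same index row-wise; names must be given explicitly
# because plain tuples carry no metadata.
mi2 = pd.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("b", 1)], names=["letters", "numbers"]
)
print(mi.equals(mi2))  # True: same labels, however the index was built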
@@ -533,7 +516,7 @@ def from_product(cls, iterables, sortorder=None, names=_no_default_names): Returns ------- - index : MultiIndex + MultiIndex See Also -------- @@ -562,8 +545,8 @@ def from_product(cls, iterables, sortorder=None, names=_no_default_names): elif is_iterator(iterables): iterables = list(iterables) - codes, levels = _factorize_from_iterables(iterables) - if names is _no_default_names: + codes, levels = factorize_from_iterables(iterables) + if names is lib.no_default: names = [getattr(it, "name", None) for it in iterables] codes = cartesian_product(codes) @@ -642,6 +625,9 @@ def levels(self): result = [ x._shallow_copy(name=name) for x, name in zip(self._levels, self._names) ] + for level in result: + # disallow midx.levels[0].name = "foo" + level._no_setting_name = True return FrozenList(result) @property @@ -668,35 +654,10 @@ def array(self): ------ ValueError """ - msg = ( + raise ValueError( "MultiIndex has no single backing array. Use " "'MultiIndex.to_numpy()' to get a NumPy array of tuples." ) - raise ValueError(msg) - - @property - def _is_homogeneous_type(self): - """Whether the levels of a MultiIndex all have the same dtype. - - This looks at the dtypes of the levels. - - See Also - -------- - Index._is_homogeneous_type : Whether the object has a single - dtype. - DataFrame._is_homogeneous_type : Whether all the columns in a - DataFrame have the same dtype. - - Examples - -------- - >>> MultiIndex.from_tuples([ - ... ('a', 'b'), ('a', 'c')])._is_homogeneous_type - True - >>> MultiIndex.from_tuples([ - ... ('a', 1), ('a', 2)])._is_homogeneous_type - False - """ - return len({x.dtype for x in self.levels}) <= 1 def _set_levels( self, levels, level=None, copy=False, validate=True, verify_integrity=False @@ -704,22 +665,23 @@ def _set_levels( # This is NOT part of the levels property because it should be # externally not allowed to set levels. User beware if you change # _levels directly - if validate and len(levels) == 0: - raise ValueError("Must set non-zero number of levels.") - if validate and level is None and len(levels) != self.nlevels: - raise ValueError("Length of levels must match number of levels.") - if validate and level is not None and len(levels) != len(level): - raise ValueError("Length of levels must match length of level.") + if validate: + if len(levels) == 0: + raise ValueError("Must set non-zero number of levels.") + if level is None and len(levels) != self.nlevels: + raise ValueError("Length of levels must match number of levels.") + if level is not None and len(levels) != len(level): + raise ValueError("Length of levels must match length of level.") if level is None: new_levels = FrozenList( ensure_index(lev, copy=copy)._shallow_copy() for lev in levels ) else: - level = [self._get_level_number(l) for l in level] + level_numbers = [self._get_level_number(lev) for lev in level] new_levels = list(self._levels) - for l, v in zip(level, levels): - new_levels[l] = ensure_index(v, copy=copy)._shallow_copy() + for lev_num, lev in zip(level_numbers, levels): + new_levels[lev_num] = ensure_index(lev, copy=copy)._shallow_copy() new_levels = FrozenList(new_levels) if verify_integrity: @@ -736,19 +698,18 @@ def _set_levels( def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ - Set new levels on MultiIndex. Defaults to returning - new index. + Set new levels on MultiIndex. Defaults to returning new index. Parameters ---------- levels : sequence or list of sequence - new level(s) to apply + New level(s) to apply. 
level : int, level name, or sequence of int/level names (default None) - level(s) to set (None for all levels) + Level(s) to set (None for all levels). inplace : bool - if True, mutates in place - verify_integrity : bool (default True) - if True, checks that levels and codes are compatible + If True, mutates in place. + verify_integrity : bool, default True + If True, checks that levels and codes are compatible. Returns ------- @@ -757,32 +718,47 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): Examples -------- >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), - (2, 'one'), (2, 'two')], + (2, 'one'), (2, 'two'), + (3, 'one'), (3, 'two')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]]) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) MultiIndex([('a', 1), ('a', 2), ('b', 1), - ('b', 2)], + ('b', 2), + ('c', 1), + ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b'], level=0) + >>> idx.set_levels(['a', 'b', 'c'], level=0) MultiIndex([('a', 'one'), ('a', 'two'), ('b', 'one'), - ('b', 'two')], + ('b', 'two'), + ('c', 'one'), + ('c', 'two')], names=['foo', 'bar']) >>> idx.set_levels(['a', 'b'], level='bar') MultiIndex([(1, 'a'), (1, 'b'), (2, 'a'), - (2, 'b')], + (2, 'b'), + (3, 'a'), + (3, 'b')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]], level=[0, 1]) + + If any of the levels passed to ``set_levels()`` exceeds the + existing length, all of the values from that argument will + be stored in the MultiIndex levels, though the values will + be truncated in the MultiIndex output. + + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), ('b', 1), ('b', 2)], names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ if is_list_like(levels) and not isinstance(levels, Index): levels = list(levels) @@ -813,36 +789,26 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): def codes(self): return self._codes - @property - def labels(self): - warnings.warn( - (".labels was deprecated in version 0.24.0. 
Use .codes instead."), - FutureWarning, - stacklevel=2, - ) - return self.codes - def _set_codes( self, codes, level=None, copy=False, validate=True, verify_integrity=False ): - if validate and level is None and len(codes) != self.nlevels: - raise ValueError("Length of codes must match number of levels") - if validate and level is not None and len(codes) != len(level): - raise ValueError("Length of codes must match length of levels.") + if validate: + if level is None and len(codes) != self.nlevels: + raise ValueError("Length of codes must match number of levels") + if level is not None and len(codes) != len(level): + raise ValueError("Length of codes must match length of levels.") if level is None: new_codes = FrozenList( - _ensure_frozen(level_codes, lev, copy=copy)._shallow_copy() + _coerce_indexer_frozen(level_codes, lev, copy=copy).view() for lev, level_codes in zip(self._levels, codes) ) else: - level = [self._get_level_number(l) for l in level] + level_numbers = [self._get_level_number(lev) for lev in level] new_codes = list(self._codes) - for lev_idx, level_codes in zip(level, codes): - lev = self.levels[lev_idx] - new_codes[lev_idx] = _ensure_frozen( - level_codes, lev, copy=copy - )._shallow_copy() + for lev_num, level_codes in zip(level_numbers, codes): + lev = self.levels[lev_num] + new_codes[lev_num] = _coerce_indexer_frozen(level_codes, lev, copy=copy) new_codes = FrozenList(new_codes) if verify_integrity: @@ -853,23 +819,6 @@ def _set_codes( self._tuples = None self._reset_cache() - def set_labels(self, labels, level=None, inplace=False, verify_integrity=True): - warnings.warn( - ( - ".set_labels was deprecated in version 0.24.0. " - "Use .set_codes instead." - ), - FutureWarning, - stacklevel=2, - ) - return self.set_codes( - codes=labels, - level=level, - inplace=inplace, - verify_integrity=verify_integrity, - ) - - @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): """ Set new codes on MultiIndex. Defaults to returning @@ -882,13 +831,13 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): Parameters ---------- codes : sequence or list of sequence - new codes to apply + New codes to apply. level : int, level name, or sequence of int/level names (default None) - level(s) to set (None for all levels) + Level(s) to set (None for all levels). inplace : bool - if True, mutates in place + If True, mutates in place. verify_integrity : bool (default True) - if True, checks that levels and codes are compatible + If True, checks that levels and codes are compatible. Returns ------- @@ -946,7 +895,6 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): if not inplace: return idx - @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") def copy( self, names=None, @@ -955,7 +903,7 @@ def copy( codes=None, deep=False, _set_identity=False, - **kwargs + **kwargs, ): """ Make a copy of this object. 
Names, dtype, levels and codes can be @@ -980,7 +928,8 @@ def copy( """ name = kwargs.get("name") names = self._validate_names(name=name, names=names, deep=deep) - + if "labels" in kwargs: + raise TypeError("'labels' argument has been removed; use 'codes' instead") if deep: from copy import deepcopy @@ -1002,7 +951,7 @@ def copy( _set_identity=_set_identity, ) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ return self.values @@ -1019,12 +968,12 @@ def _shallow_copy_with_infer(self, values, **kwargs): return MultiIndex( levels=[[] for _ in range(self.nlevels)], codes=[[] for _ in range(self.nlevels)], - **kwargs + **kwargs, ) return self._shallow_copy(values, **kwargs) @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key) -> bool: hash(key) try: self.get_loc(key) @@ -1042,10 +991,10 @@ def _shallow_copy(self, values=None, **kwargs): return self.copy(**kwargs) @cache_readonly - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype("O") - def _is_memory_usage_qualified(self): + def _is_memory_usage_qualified(self) -> bool: """ return a boolean if we need a qualified .info display """ def f(l): @@ -1054,18 +1003,18 @@ def f(l): return any(f(l) for l in self._inferred_type_levels) @Appender(Index.memory_usage.__doc__) - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: # we are overwriting our base class to avoid # computing .values here which could materialize # a tuple representation unnecessarily return self._nbytes(deep) @cache_readonly - def nbytes(self): + def nbytes(self) -> int: """ return the number of bytes in the underlying data """ return self._nbytes(False) - def _nbytes(self, deep=False): + def _nbytes(self, deep: bool = False) -> int: """ return the number of bytes in the underlying data deeply introspect the level data if deep=True @@ -1123,7 +1072,8 @@ def _format_native_types(self, na_rep="nan", **kwargs): if mask.any(): nan_index = len(level) level = np.append(level, na_rep) - level_codes = level_codes.values() + assert not level_codes.flags.writeable # i.e. copy is needed + level_codes = level_codes.copy() # make writeable level_codes[mask] = nan_index new_levels.append(level) new_codes.append(level_codes) @@ -1216,7 +1166,7 @@ def format( # -------------------------------------------------------------------- - def __len__(self): + def __len__(self) -> int: return len(self.codes[0]) def _get_names(self): @@ -1253,17 +1203,18 @@ def _set_names(self, names, level=None, validate=True): raise ValueError("Names should be list-like for a MultiIndex") names = list(names) - if validate and level is not None and len(names) != len(level): - raise ValueError("Length of names must match length of level.") - if validate and level is None and len(names) != self.nlevels: - raise ValueError( - "Length of names must match number of levels in MultiIndex." - ) + if validate: + if level is not None and len(names) != len(level): + raise ValueError("Length of names must match length of level.") + if level is None and len(names) != self.nlevels: + raise ValueError( + "Length of names must match number of levels in MultiIndex." 
+ ) if level is None: level = range(self.nlevels) else: - level = [self._get_level_number(l) for l in level] + level = [self._get_level_number(lev) for lev in level] # set the name for lev, name in zip(level, names): @@ -1272,9 +1223,7 @@ def _set_names(self, names, level=None, validate=True): # All items in 'names' need to be hashable: if not is_hashable(name): raise TypeError( - "{}.name must be a hashable type".format( - self.__class__.__name__ - ) + f"{type(self).__name__}.name must be a hashable type" ) self._names[lev] = name @@ -1320,34 +1269,33 @@ def _constructor(self): return MultiIndex.from_tuples @cache_readonly - def inferred_type(self): + def inferred_type(self) -> str: return "mixed" - def _get_level_number(self, level): + def _get_level_number(self, level) -> int: count = self.names.count(level) if (count > 1) and not is_integer(level): raise ValueError( - "The name %s occurs multiple times, use a level number" % level + f"The name {level} occurs multiple times, use a level number" ) try: level = self.names.index(level) except ValueError: if not is_integer(level): - raise KeyError("Level %s not found" % str(level)) + raise KeyError(f"Level {level} not found") elif level < 0: level += self.nlevels if level < 0: orig_level = level - self.nlevels raise IndexError( - "Too many levels: Index has only %d " - "levels, %d is not a valid level number" - % (self.nlevels, orig_level) + f"Too many levels: Index has only {self.nlevels} levels," + f" {orig_level} is not a valid level number" ) # Note: levels are zero-based elif level >= self.nlevels: raise IndexError( - "Too many levels: Index has only %d levels, " - "not %d" % (self.nlevels, level + 1) + f"Too many levels: Index has only {self.nlevels} levels, " + f"not {level + 1}" ) return level @@ -1394,13 +1342,8 @@ def values(self): self._tuples = lib.fast_zip(values) return self._tuples - @property - def _has_complex_internals(self): - # to disable groupby tricks - return True - @cache_readonly - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: """ return if the index is monotonic increasing (only equal or increasing) values. @@ -1425,7 +1368,7 @@ def is_monotonic_increasing(self): return Index(self.values).is_monotonic @cache_readonly - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ return if the index is monotonic decreasing (only equal or decreasing) values. 
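The validation reworked above in `_set_names` and `_get_level_number` is easiest to see from the public API: duplicate level names are permitted, but looking a level up by such a name is ambiguous and raises, so the position has to be used instead. An illustrative sketch, not taken from the patch:

import pandas as pd

mi = pd.MultiIndex.from_arrays([[1, 1, 2], ["x", "y", "x"]], names=["k", "k"])

print(mi.get_level_values(0))  # positional lookup is always unambiguous
try:
    mi.get_level_values("k")   # the name resolves to two levels
except ValueError as err:
    print(err)  # "The name k occurs multiple times, use a level number"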
@@ -1504,7 +1447,7 @@ def dropna(self, how="any"): elif how == "all": indexer = np.all(nans, axis=0) else: - raise ValueError("invalid how option: {0}".format(how)) + raise ValueError(f"invalid how option: {how}") new_codes = [level_codes[~indexer] for level_codes in self.codes] return self.copy(codes=new_codes, deep=True) @@ -1686,17 +1629,12 @@ def to_frame(self, index=True, name=None): else: idx_names = self.names - # Guarantee resulting column order + # Guarantee resulting column order - PY36+ dict maintains insertion order result = DataFrame( - OrderedDict( - [ - ( - (level if lvlname is None else lvlname), - self._get_level_values(level), - ) - for lvlname, level in zip(idx_names, range(len(self.levels))) - ] - ), + { + (level if lvlname is None else lvlname): self._get_level_values(level) + for lvlname, level in zip(idx_names, range(len(self.levels))) + }, copy=False, ) @@ -1704,62 +1642,6 @@ def to_frame(self, index=True, name=None): result.index = self return result - def to_hierarchical(self, n_repeat, n_shuffle=1): - """ - Return a MultiIndex reshaped to conform to the - shapes given by n_repeat and n_shuffle. - - .. deprecated:: 0.24.0 - - Useful to replicate and rearrange a MultiIndex for combination - with another Index with n_repeat items. - - Parameters - ---------- - n_repeat : int - Number of times to repeat the labels on self - n_shuffle : int - Controls the reordering of the labels. If the result is going - to be an inner level in a MultiIndex, n_shuffle will need to be - greater than one. The size of each label must divisible by - n_shuffle. - - Returns - ------- - MultiIndex - - Examples - -------- - >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), - (2, 'one'), (2, 'two')]) - >>> idx.to_hierarchical(3) - MultiIndex([(1, 'one'), - (1, 'one'), - (1, 'one'), - (1, 'two'), - (1, 'two'), - (1, 'two'), - (2, 'one'), - (2, 'one'), - (2, 'one'), - (2, 'two'), - (2, 'two'), - (2, 'two')], - ) - """ - levels = self.levels - codes = [np.repeat(level_codes, n_repeat) for level_codes in self.codes] - # Assumes that each level_codes is divisible by n_shuffle - codes = [x.reshape(n_shuffle, -1).ravel(order="F") for x in codes] - names = self.names - warnings.warn( - "Method .to_hierarchical is deprecated and will " - "be removed in a future version", - FutureWarning, - stacklevel=2, - ) - return MultiIndex(levels=levels, codes=codes, names=names) - def to_flat_index(self): """ Convert a MultiIndex to an Index of Tuples containing the level values. @@ -1789,10 +1671,10 @@ def to_flat_index(self): return Index(self.values, tupleize_cols=False) @property - def is_all_dates(self): + def is_all_dates(self) -> bool: return False - def is_lexsorted(self): + def is_lexsorted(self) -> bool: """ Return True if the codes are lexicographically sorted. @@ -1979,11 +1861,11 @@ def remove_unused_levels(self): return result @property - def nlevels(self): + def nlevels(self) -> int: """ Integer number of levels in this MultiIndex. 
""" - return len(self.levels) + return len(self._levels) @property def levshape(self): @@ -1995,12 +1877,12 @@ def levshape(self): def __reduce__(self): """Necessary for making this object picklable""" d = dict( - levels=[lev for lev in self.levels], - codes=[level_codes for level_codes in self.codes], + levels=list(self.levels), + codes=list(self.codes), sortorder=self.sortorder, names=list(self.names), ) - return ibase._new_Index, (self.__class__, d), None + return ibase._new_Index, (type(self), d), None def __setstate__(self, state): """Necessary for making this object picklable""" @@ -2089,7 +1971,7 @@ def _assert_take_fillable( if mask.any(): masked = [] for new_label in taken: - label_values = new_label.values() + label_values = new_label label_values[mask] = na_value masked.append(np.asarray(label_values)) taken = masked @@ -2152,7 +2034,6 @@ def repeat(self, repeats, axis=None): def where(self, cond, other=None): raise NotImplementedError(".where is not supported for MultiIndex operations") - @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") def drop(self, codes, level=None, errors="raise"): """ Make new MultiIndex with passed list of codes deleted @@ -2162,24 +2043,20 @@ def drop(self, codes, level=None, errors="raise"): codes : array-like Must be a list of tuples level : int or level name, default None + errors : str, default 'raise' Returns ------- dropped : MultiIndex """ if level is not None: - return self._drop_from_level(codes, level) + return self._drop_from_level(codes, level, errors) - try: - if not isinstance(codes, (np.ndarray, Index)): + if not isinstance(codes, (np.ndarray, Index)): + try: codes = com.index_labels_to_array(codes) - indexer = self.get_indexer(codes) - mask = indexer == -1 - if mask.any(): - if errors != "ignore": - raise ValueError("codes %s not contained in axis" % codes[mask]) - except Exception: - pass + except ValueError: + pass inds = [] for level_codes in codes: @@ -2194,16 +2071,15 @@ def drop(self, codes, level=None, errors="raise"): elif com.is_bool_indexer(loc): if self.lexsort_depth == 0: warnings.warn( - "dropping on a non-lexsorted multi-index" - " without a level parameter may impact " - "performance.", + "dropping on a non-lexsorted multi-index " + "without a level parameter may impact performance.", PerformanceWarning, stacklevel=3, ) loc = loc.nonzero()[0] inds.extend(loc) else: - msg = "unsupported indexer of type {}".format(type(loc)) + msg = f"unsupported indexer of type {type(loc)}" raise AssertionError(msg) except KeyError: if errors != "ignore": @@ -2211,13 +2087,15 @@ def drop(self, codes, level=None, errors="raise"): return self.delete(inds) - def _drop_from_level(self, codes, level): + def _drop_from_level(self, codes, level, errors="raise"): codes = com.index_labels_to_array(codes) i = self._get_level_number(level) index = self.levels[i] values = index.get_indexer(codes) mask = ~algos.isin(self.codes[i], values) + if mask.all() and errors != "ignore": + raise KeyError(f"labels {codes} not found in level") return self[mask] @@ -2293,8 +2171,8 @@ def reorder_levels(self, order): order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: raise AssertionError( - "Length of order must be same as " - "number of levels (%d), got %d" % (self.nlevels, len(order)) + f"Length of order must be same as number of levels ({self.nlevels})," + f" got {len(order)}" ) new_levels = [self.levels[i] for i in order] new_codes = [self.codes[i] for i in order] @@ -2304,9 +2182,6 @@ def reorder_levels(self, order): 
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) - def __getslice__(self, i, j): - return self.__getitem__(slice(i, j)) - def _get_codes_for_sorting(self): """ we categorizing our codes by using the @@ -2335,11 +2210,11 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Parameters ---------- level : list-like, int or str, default 0 - If a string is given, must be a name of the level + If a string is given, must be a name of the level. If list-like must be names or ints of levels. ascending : bool, default True - False to sort in descending order - Can also be a list to specify a directed ordering + False to sort in descending order. + Can also be a list to specify a directed ordering. sort_remaining : sort by the remaining levels after level Returns @@ -2430,7 +2305,7 @@ def _convert_listlike_indexer(self, keyarr, kind=None): check = self.levels[0].get_indexer(keyarr) mask = check == -1 if mask.any(): - raise KeyError("%s not in index" % keyarr[mask]) + raise KeyError(f"{keyarr[mask]} not in index") return indexer, keyarr @@ -2541,7 +2416,53 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): return target, indexer - def get_slice_bound(self, label, side, kind): + def get_slice_bound( + self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str + ) -> int: + """ + For an ordered MultiIndex, compute slice bound + that corresponds to given label. + + Returns leftmost (one-past-the-rightmost if `side=='right') position + of given label. + + Parameters + ---------- + label : object or tuple of objects + side : {'left', 'right'} + kind : {'loc', 'getitem'} + + Returns + ------- + int + Index of label. + + Notes + ----- + This method only works if level 0 index of the MultiIndex is lexsorted. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')]) + + Get the locations from the leftmost 'b' in the first level + until the end of the multiindex: + + >>> mi.get_slice_bound('b', side="left", kind="loc") + 1 + + Like above, but if you get the locations from the rightmost + 'b' in the first level and 'f' in the second level: + + >>> mi.get_slice_bound(('b','f'), side="right", kind="loc") + 3 + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. 
+ """ if not isinstance(label, tuple): label = (label,) @@ -2606,8 +2527,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): def _partial_tup_index(self, tup, side="left"): if len(tup) > self.lexsort_depth: raise UnsortedIndexError( - "Key length (%d) was greater than MultiIndex" - " lexsort depth (%d)" % (len(tup), self.lexsort_depth) + f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth" + f" ({self.lexsort_depth})" ) n = len(tup) @@ -2616,9 +2537,9 @@ def _partial_tup_index(self, tup, side="left"): for k, (lab, lev, labs) in enumerate(zipped): section = labs[start:end] - if lab not in lev: + if lab not in lev and not isna(lab): if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)): - raise TypeError("Level type mismatch: %s" % lab) + raise TypeError(f"Level type mismatch: {lab}") # short circuit loc = lev.searchsorted(lab, side=side) @@ -2626,13 +2547,38 @@ def _partial_tup_index(self, tup, side="left"): loc -= 1 return start + section.searchsorted(loc, side=side) - idx = lev.get_loc(lab) + idx = self._get_loc_single_level_index(lev, lab) if k < n - 1: end = start + section.searchsorted(idx, side="right") start = start + section.searchsorted(idx, side="left") else: return start + section.searchsorted(idx, side=side) + def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: + """ + If key is NA value, location of index unify as -1. + + Parameters + ---------- + level_index: Index + key : label + + Returns + ------- + loc : int + If key is NA value, loc is -1 + Else, location of key in index. + + See Also + -------- + Index.get_loc : The get_loc method for (single-level) index. + """ + + if is_scalar(key) and isna(key): + return -1 + else: + return level_index.get_loc(key) + def get_loc(self, key, method=None): """ Get location for a label or a tuple of labels as an integer, slice or @@ -2700,8 +2646,7 @@ def _maybe_to_slice(loc): keylen = len(key) if self.nlevels < keylen: raise KeyError( - "Key length ({0}) exceeds index depth ({1})" - "".format(keylen, self.nlevels) + f"Key length ({keylen}) exceeds index depth ({self.nlevels})" ) if keylen == self.nlevels and self.is_unique: @@ -2732,7 +2677,9 @@ def _maybe_to_slice(loc): loc = np.arange(start, stop, dtype="int64") for i, k in enumerate(follow_key, len(lead_key)): - mask = self.codes[i][loc] == self.levels[i].get_loc(k) + mask = self.codes[i][loc] == self._get_loc_single_level_index( + self.levels[i], k + ) if not mask.all(): loc = loc[mask] if not len(loc): @@ -2750,7 +2697,7 @@ def get_loc_level(self, key, level=0, drop_level: bool = True): key : label or sequence of labels level : int/level name or list thereof, optional drop_level : bool, default True - if ``False``, the resulting index will not drop any level. + If ``False``, the resulting index will not drop any level. Returns ------- @@ -2960,7 +2907,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): else: - code = level_index.get_loc(key) + code = self._get_loc_single_level_index(level_index, key) if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted @@ -2979,47 +2926,46 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): def get_locs(self, seq): """ - Get location for a given label/slice/list/mask or a sequence of such as - an array of integers. + Get location for a sequence of labels. 
Parameters ---------- - seq : label/slice/list/mask or a sequence of such + seq : label, slice, list, mask or a sequence of such You should use one of the above for each level. If a level should not be used, set it to ``slice(None)``. Returns ------- - locs : array of integers suitable for passing to iloc + numpy.ndarray + NumPy array of integers suitable for passing to iloc. + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.slice_locs : Get slice location given start label(s) and + end label(s). Examples -------- >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')]) - >>> mi.get_locs('b') + >>> mi.get_locs('b') # doctest: +SKIP array([1, 2], dtype=int64) - >>> mi.get_locs([slice(None), ['e', 'f']]) + >>> mi.get_locs([slice(None), ['e', 'f']]) # doctest: +SKIP array([1, 2], dtype=int64) - >>> mi.get_locs([[True, False, True], slice('e', 'f')]) + >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP array([2], dtype=int64) - - See Also - -------- - MultiIndex.get_loc : Get location for a label or a tuple of labels. - MultiIndex.slice_locs : Get slice location given start label(s) and - end label(s). """ - from .numeric import Int64Index + from pandas.core.indexes.numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] if true_slices and true_slices[-1] >= self.lexsort_depth: raise UnsortedIndexError( - "MultiIndex slicing requires the index " - "to be lexsorted: slicing on levels {0}, " - "lexsort depth {1}".format(true_slices, self.lexsort_depth) + "MultiIndex slicing requires the index to be lexsorted: slicing " + f"on levels {true_slices}, lexsort depth {self.lexsort_depth}" ) # indexer # this is the list of all values that we want to select @@ -3133,7 +3079,7 @@ def truncate(self, before=None, after=None): return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False) - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two MultiIndex objects have the same labeling information (the levels themselves do not necessarily have to be the same) @@ -3149,6 +3095,11 @@ def equals(self, other): return False if not isinstance(other, MultiIndex): + # d-level MultiIndex can equal d-tuple Index + if not is_object_dtype(other.dtype): + if self.nlevels != other.nlevels: + return False + other_vals = com.values_from_object(ensure_index(other)) return array_equivalent(self._ndarray_values, other_vals) @@ -3356,10 +3307,10 @@ def astype(self, dtype, copy=True): msg = "> 1 ndim Categorical are not supported at this time" raise NotImplementedError(msg) elif not is_object_dtype(dtype): - msg = ( - "Setting {cls} dtype to anything other than object is not supported" - ).format(cls=self.__class__) - raise TypeError(msg) + raise TypeError( + f"Setting {type(self)} dtype to anything other " + "than object is not supported" + ) elif copy is True: return self._shallow_copy() return self @@ -3451,14 +3402,11 @@ def isin(self, values, level=None): return algos.isin(self.values, values) else: num = self._get_level_number(level) - levs = self.levels[num] - level_codes = self.codes[num] + levs = self.get_level_values(num) - sought_labels = levs.isin(values).nonzero()[0] if levs.size == 0: - return np.zeros(len(level_codes), dtype=np.bool_) - else: - return np.lib.arraysetops.in1d(level_codes, sought_labels) + return np.zeros(len(levs), dtype=np.bool_) + return levs.isin(values) 
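The `isin` rewrite just above delegates per-level membership tests to `get_level_values(...).isin(...)` instead of translating through the level codes. A small sketch of the two calling conventions; the example data is made up for illustration.

import pandas as pd

mi = pd.MultiIndex.from_arrays(
    [["a", "b", "c"], [1, 2, 3]], names=["letter", "number"]
)

# Whole-tuple membership: each label is compared against the given tuples.
print(mi.isin([("a", 1), ("c", 3)]))        # [ True False  True]

# Per-level membership: only the named level is consulted.
print(mi.isin(["a", "c"], level="letter"))  # [ True False  True]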
MultiIndex._add_numeric_methods_disabled() @@ -3466,7 +3414,7 @@ def isin(self, values, level=None): MultiIndex._add_logical_methods_disabled() -def _sparsify(label_list, start=0, sentinel=""): +def _sparsify(label_list, start: int = 0, sentinel=""): pivoted = list(zip(*label_list)) k = len(label_list) @@ -3494,7 +3442,7 @@ def _sparsify(label_list, start=0, sentinel=""): return list(zip(*result)) -def _get_na_rep(dtype): +def _get_na_rep(dtype) -> str: return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN") @@ -3527,3 +3475,26 @@ def maybe_droplevels(index, key): pass return index + + +def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray: + """ + Coerce the array_like indexer to the smallest integer dtype that can encode all + of the given categories. + + Parameters + ---------- + array_like : array-like + categories : array-like + copy : bool + + Returns + ------- + np.ndarray + Non-writeable. + """ + array_like = coerce_indexer_dtype(array_like, categories) + if copy: + array_like = array_like.copy() + array_like.flags.writeable = False + return array_like diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index e83360dc701f3..b9b44284edaa9 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,8 +1,7 @@ -import warnings - import numpy as np -from pandas._libs import index as libindex +from pandas._libs import index as libindex, lib +from pandas._typing import Dtype from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.cast import astype_nansafe @@ -15,6 +14,8 @@ is_float_dtype, is_integer_dtype, is_scalar, + is_signed_integer_dtype, + is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, ) @@ -22,13 +23,19 @@ ABCFloat64Index, ABCInt64Index, ABCRangeIndex, + ABCSeries, ABCUInt64Index, ) from pandas.core.dtypes.missing import isna from pandas.core import algorithms import pandas.core.common as com -from pandas.core.indexes.base import Index, InvalidIndexError, _index_shared_docs +from pandas.core.indexes.base import ( + Index, + InvalidIndexError, + _index_shared_docs, + maybe_extract_name, +) from pandas.core.ops import get_op_result_name _num_index_shared_docs = dict() @@ -36,28 +43,26 @@ class NumericIndex(Index): """ - Provide numeric type operations - - This is an abstract class + Provide numeric type operations. + This is an abstract class. 
""" _is_numeric_dtype = True - def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): + def __new__(cls, data=None, dtype=None, copy=False, name=None): + cls._validate_dtype(dtype) - if fastpath is not None: - warnings.warn( - "The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, - stacklevel=2, - ) - if fastpath: - return cls._simple_new(data, name=name) + # Coerce to ndarray if not already ndarray or Index + if not isinstance(data, (np.ndarray, Index)): + if is_scalar(data): + raise cls._scalar_data_error(data) - # is_scalar, generators handled in coerce_to_ndarray - data = cls._coerce_to_ndarray(data) + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + + data = np.asarray(data, dtype=dtype) if issubclass(data.dtype.type, str): cls._string_data_error(data) @@ -68,10 +73,30 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): else: subarr = data - if name is None and hasattr(data, "name"): - name = data.name + if subarr.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") + + name = maybe_extract_name(name, data, cls) return cls._simple_new(subarr, name=name) + @classmethod + def _validate_dtype(cls, dtype: Dtype) -> None: + if dtype is None: + return + validation_metadata = { + "int64index": (is_signed_integer_dtype, "signed integer"), + "uint64index": (is_unsigned_integer_dtype, "unsigned integer"), + "float64index": (is_float_dtype, "float"), + "rangeindex": (is_signed_integer_dtype, "signed integer"), + } + + validation_func, expected = validation_metadata[cls._typ] + if not validation_func(dtype): + raise ValueError( + f"Incorrect `dtype` passed: expected {expected}, received {dtype}" + ) + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): assert kind in ["ix", "loc", "getitem", None] @@ -87,8 +112,9 @@ def _shallow_copy(self, values=None, **kwargs): return super()._shallow_copy(values=values, **kwargs) def _convert_for_op(self, value): - """ Convert value to be insertable to ndarray """ - + """ + Convert value to be insertable to ndarray. + """ if is_bool(value) or is_bool_dtype(value): # force conversion to object # so we don't lose the bools @@ -103,19 +129,13 @@ def _convert_tolerance(self, tolerance, target): if not np.issubdtype(tolerance.dtype, np.number): if tolerance.ndim > 0: raise ValueError( - ( - "tolerance argument for %s must contain " - "numeric elements if it is list type" - ) - % (type(self).__name__,) + f"tolerance argument for {type(self).__name__} must contain " + "numeric elements if it is list type" ) else: raise ValueError( - ( - "tolerance argument for %s must be numeric " - "if it is a scalar: %r" - ) - % (type(self).__name__, tolerance) + f"tolerance argument for {type(self).__name__} must be numeric " + f"if it is a scalar: {repr(tolerance)}" ) return tolerance @@ -133,9 +153,9 @@ def _concat_same_dtype(self, indexes, name): return result.rename(name) @property - def is_all_dates(self): + def is_all_dates(self) -> bool: """ - Checks that all the labels are datetime objects + Checks that all the labels are datetime objects. """ return False @@ -169,16 +189,16 @@ def _union(self, other, sort): ] = """ Immutable ndarray implementing an ordered, sliceable set. The basic object storing axis labels for all pandas objects. %(klass)s is a special case - of `Index` with purely %(ltype)s labels. 
%(extra)s + of `Index` with purely %(ltype)s labels. %(extra)s. Parameters ---------- data : array-like (1-dimensional) dtype : NumPy dtype (default: %(dtype)s) copy : bool - Make a copy of input ndarray + Make a copy of input ndarray. name : object - Name to be stored in the index + Name to be stored in the index. Attributes ---------- @@ -205,7 +225,7 @@ class IntegerIndex(NumericIndex): This is an abstract class for Int64Index, UInt64Index. """ - def __contains__(self, key): + def __contains__(self, key) -> bool: """ Check if key is a float and has a decimal. If it has, return False. """ @@ -227,12 +247,14 @@ class Int64Index(IntegerIndex): _default_dtype = np.int64 @property - def inferred_type(self): - """Always 'integer' for ``Int64Index``""" + def inferred_type(self) -> str: + """ + Always 'integer' for ``Int64Index`` + """ return "integer" @property - def asi8(self): + def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self.values.view("i8") @@ -282,12 +304,14 @@ class UInt64Index(IntegerIndex): _default_dtype = np.uint64 @property - def inferred_type(self): - """Always 'integer' for ``UInt64Index``""" + def inferred_type(self) -> str: + """ + Always 'integer' for ``UInt64Index`` + """ return "integer" @property - def asi8(self): + def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak return self.values.view("u8") @@ -302,13 +326,15 @@ def _convert_scalar_indexer(self, key, kind=None): @Appender(_index_shared_docs["_convert_arr_indexer"]) def _convert_arr_indexer(self, keyarr): - # Cast the indexer to uint64 if possible so - # that the values returned from indexing are - # also uint64. - keyarr = com.asarray_tuplesafe(keyarr) - if is_integer_dtype(keyarr): - return com.asarray_tuplesafe(keyarr, dtype=np.uint64) - return keyarr + # Cast the indexer to uint64 if possible so that the values returned + # from indexing are also uint64. + dtype = None + if is_integer_dtype(keyarr) or ( + lib.infer_dtype(keyarr, skipna=False) == "integer" + ): + dtype = np.uint64 + + return com.asarray_tuplesafe(keyarr, dtype=dtype) @Appender(_index_shared_docs["_convert_index_indexer"]) def _convert_index_indexer(self, keyarr): @@ -355,19 +381,20 @@ class Float64Index(NumericIndex): _default_dtype = np.float64 @property - def inferred_type(self): - """Always 'floating' for ``Float64Index``""" + def inferred_type(self) -> str: + """ + Always 'floating' for ``Float64Index`` + """ return "floating" @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if needs_i8_conversion(dtype): - msg = ( - "Cannot convert Float64Index to dtype {dtype}; integer " + raise TypeError( + f"Cannot convert Float64Index to dtype {dtype}; integer " "values are required for conversion" - ).format(dtype=dtype) - raise TypeError(msg) + ) elif is_integer_dtype(dtype) and not is_extension_array_dtype(dtype): # TODO(jreback); this can change once we have an EA Index type # GH 13149 @@ -412,7 +439,9 @@ def _format_native_types( return formatter.get_result_as_array() def get_value(self, series, key): - """ we always want to get an index value, never a value """ + """ + We always want to get an index value, never a value. + """ if not is_scalar(key): raise InvalidIndexError @@ -422,7 +451,7 @@ def get_value(self, series, key): return new_values - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. 
""" @@ -444,7 +473,7 @@ def equals(self, other): except (TypeError, ValueError): return False - def __contains__(self, other): + def __contains__(self, other) -> bool: if super().__contains__(other): return True @@ -479,7 +508,7 @@ def get_loc(self, key, method=None, tolerance=None): return super().get_loc(key, method=method, tolerance=tolerance) @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: return super().is_unique and self._nan_idxs.size < 2 @Appender(Index.isin.__doc__) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index a20290e77023a..6ab2e66e05d6e 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,37 +1,47 @@ from datetime import datetime, timedelta -import warnings import weakref import numpy as np from pandas._libs import index as libindex from pandas._libs.tslibs import NaT, frequencies as libfrequencies, iNaT, resolution -from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period +from pandas._libs.tslibs.period import Period from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, + is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype, + is_object_dtype, pandas_dtype, ) -from pandas.core import common as com from pandas.core.accessor import delegate_names -from pandas.core.algorithms import unique1d -from pandas.core.arrays.period import PeriodArray, period_array, validate_dtype_freq +from pandas.core.arrays.period import ( + PeriodArray, + period_array, + raise_on_incompatible, + validate_dtype_freq, +) from pandas.core.base import _shared_docs +import pandas.core.common as com import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import _index_shared_docs, ensure_index +from pandas.core.indexes.base import ( + _index_shared_docs, + ensure_index, + maybe_extract_name, +) from pandas.core.indexes.datetimelike import ( DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, ) -from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index +from pandas.core.indexes.datetimes import DatetimeIndex, Index +from pandas.core.indexes.numeric import Int64Index from pandas.core.missing import isna from pandas.core.ops import get_op_result_name from pandas.core.tools.datetimes import DateParseError, parse_time_string @@ -62,13 +72,11 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): Delegate from PeriodIndex to PeriodArray. """ - _delegate_class = PeriodArray - _delegated_properties = PeriodArray._datetimelike_ops - _delegated_methods = set(PeriodArray._datetimelike_methods) | { - "_addsub_int_array", - "strftime", - } - _raw_properties = {"is_leap_year"} + _raw_methods = {"_format_native_types"} + _raw_properties = {"is_leap_year", "freq"} + + _delegated_properties = PeriodArray._datetimelike_ops + list(_raw_properties) + _delegated_methods = set(PeriodArray._datetimelike_methods) | _raw_methods @delegate_names(PeriodArray, PeriodDelegateMixin._delegated_properties, typ="property") @@ -77,8 +85,7 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): ) class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): """ - Immutable ndarray holding ordinal values indicating regular periods in - time such as particular years, quarters, months, etc. + Immutable ndarray holding ordinal values indicating regular periods in time. 
Index keys are boxed to Period objects which carries the metadata (eg, frequency information). @@ -86,29 +93,11 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): Parameters ---------- data : array-like (1d int np.ndarray or PeriodArray), optional - Optional period-like data to construct index with + Optional period-like data to construct index with. copy : bool - Make a copy of input ndarray + Make a copy of input ndarray. freq : str or period object, optional One of pandas period strings or corresponding objects - start : starting value, period-like, optional - If data is None, used as the start point in generating regular - period data. - - .. deprecated:: 0.24.0 - - periods : int, optional, > 0 - Number of periods to generate, if generating index. Takes precedence - over end argument - - .. deprecated:: 0.24.0 - - end : end value, period-like, optional - If periods is none, generated index will extend to first conforming - period on or just past end argument - - .. deprecated:: 0.24.0 - year : int, array, or Series, default None month : int, array, or Series, default None quarter : int, array, or Series, default None @@ -117,7 +106,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): minute : int, array, or Series, default None second : int, array, or Series, default None tz : object, default None - Timezone for converting datetime64 data to Periods + Timezone for converting datetime64 data to Periods. dtype : str or PeriodDtype, default None Attributes @@ -157,11 +146,6 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): TimedeltaIndex : Index of timedelta64 data. period_range : Create a fixed-frequency PeriodIndex. - Notes - ----- - Creating a PeriodIndex based on `start`, `periods`, and `end` has - been deprecated in favor of :func:`period_range`. - Examples -------- >>> idx = pd.PeriodIndex(year=year_arr, quarter=q_arr) @@ -174,7 +158,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): _is_numeric_dtype = False _infer_as_myclass = True - _data = None + _data: PeriodArray _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True @@ -187,14 +171,11 @@ def __new__( data=None, ordinal=None, freq=None, - start=None, - end=None, - periods=None, tz=None, dtype=None, copy=False, name=None, - **fields + **fields, ): valid_field_set = { @@ -208,40 +189,16 @@ def __new__( } if not set(fields).issubset(valid_field_set): - raise TypeError( - "__new__() got an unexpected keyword argument {}".format( - list(set(fields) - valid_field_set)[0] - ) - ) + argument = list(set(fields) - valid_field_set)[0] + raise TypeError(f"__new__() got an unexpected keyword argument {argument}") - if name is None and hasattr(data, "name"): - name = data.name + name = maybe_extract_name(name, data, cls) if data is None and ordinal is None: # range-based. - data, freq2 = PeriodArray._generate_range(start, end, periods, freq, fields) - # PeriodArray._generate range does validate that fields is + data, freq2 = PeriodArray._generate_range(None, None, None, freq, fields) + # PeriodArray._generate range does validation that fields is # empty when really using the range-based constructor. - if not fields: - msg = ( - "Creating a PeriodIndex by passing range " - "endpoints is deprecated. Use " - "`pandas.period_range` instead." - ) - # period_range differs from PeriodIndex for cases like - # start="2000", periods=4 - # PeriodIndex interprets that as A-DEC freq. 
- # period_range interprets it as 'D' freq. - cond = freq is None and ( - (start and not isinstance(start, Period)) - or (end and not isinstance(end, Period)) - ) - if cond: - msg += ( - " Note that the default `freq` may differ. Pass " - "'freq=\"{}\"' to ensure the same output." - ).format(freq2.freqstr) - warnings.warn(msg, FutureWarning, stacklevel=2) freq = freq2 data = PeriodArray(data, freq=freq) @@ -309,44 +266,20 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): def values(self): return np.asarray(self) - @property - def freq(self): - return self._data.freq - - @freq.setter - def freq(self, value): - value = Period._maybe_convert_freq(value) - # TODO: When this deprecation is enforced, PeriodIndex.freq can - # be removed entirely, and we'll just inherit. - msg = ( - "Setting {cls}.freq has been deprecated and will be " - "removed in a future version; use {cls}.asfreq instead. " - "The {cls}.freq setter is not guaranteed to work." - ) - warnings.warn(msg.format(cls=type(self).__name__), FutureWarning, stacklevel=2) - # PeriodArray._freq isn't actually mutable. We set the private _freq - # here, but people shouldn't be doing this anyway. - self._data._freq = value - def _shallow_copy(self, values=None, **kwargs): # TODO: simplify, figure out type of values if values is None: values = self._data if isinstance(values, type(self)): - values = values._values + values = values._data if not isinstance(values, PeriodArray): - if isinstance(values, np.ndarray) and is_integer_dtype(values.dtype): + if isinstance(values, np.ndarray) and values.dtype == "i8": values = PeriodArray(values, freq=self.freq) else: - # in particular, I would like to avoid period_array here. - # Some people seem to be calling use with unexpected types - # Index.difference -> ndarray[Period] - # DatetimelikeIndexOpsMixin.repeat -> ndarray[ordinal] - # I think that once all of Datetime* are EAs, we can simplify - # this quite a bit. - values = period_array(values, freq=self.freq) + # GH#30713 this should never be reached + raise TypeError(type(values), getattr(values, "dtype", None)) # We don't allow changing `freq` in _shallow_copy. 
validate_dtype_freq(self.dtype, kwargs.get("freq")) @@ -406,10 +339,7 @@ def _maybe_convert_timedelta(self, other): if base == self.freq.rule_code: return other.n - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(self, other) elif is_integer(other): # integer is passed to .shift via # _add_datetimelike_methods basically @@ -417,18 +347,11 @@ def _maybe_convert_timedelta(self, other): return other # raise when input doesn't have freq - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, own_freq=self.freqstr, other_freq=None - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(self, None) # ------------------------------------------------------------------------ # Rendering Methods - def _format_native_types(self, na_rep="NaT", quoting=None, **kwargs): - # just dispatch, return ndarray - return self._data._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) - def _mpl_repr(self): # how to represent ourselves to matplotlib return self.astype(object).values @@ -447,7 +370,7 @@ def _engine(self): return self._engine_type(period, len(self)) @Appender(_index_shared_docs["contains"]) - def __contains__(self, key): + def __contains__(self, key) -> bool: if isinstance(key, Period): if key.freq != self.freq: return False @@ -468,17 +391,7 @@ def _int64index(self): # ------------------------------------------------------------------------ # Index Methods - def _coerce_scalar_to_index(self, item): - """ - we need to coerce a scalar to a compat for our index type - - Parameters - ---------- - item : scalar item to coerce - """ - return PeriodIndex([item], **self._get_attributes_dict()) - - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: if is_integer_dtype(dtype): return self.asi8 else: @@ -506,10 +419,10 @@ def __array_wrap__(self, result, context=None): return Index(result, name=name) elif isinstance(func, np.ufunc): if "M->M" not in func.types: - msg = "ufunc '{0}' not supported for the PeriodIndex" + msg = f"ufunc '{func.__name__}' not supported for the PeriodIndex" # This should be TypeError, but TypeError cannot be raised # from here because numpy catches. 
- raise ValueError(msg.format(func.__name__)) + raise ValueError(msg) if is_bool_dtype(result): return result @@ -556,29 +469,22 @@ def astype(self, dtype, copy=True, how="start"): @Substitution(klass="PeriodIndex") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, Period): - if value.freq != self.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=value.freqstr, - ) - raise IncompatibleFrequency(msg) - value = value.ordinal + if isinstance(value, Period) or value is NaT: + self._data._check_compatible_with(value) elif isinstance(value, str): try: - value = Period(value, freq=self.freq).ordinal + value = Period(value, freq=self.freq) except DateParseError: - raise KeyError("Cannot interpret '{}' as period".format(value)) - - return self._ndarray_values.searchsorted(value, side=side, sorter=sorter) + raise KeyError(f"Cannot interpret '{value}' as period") + elif not isinstance(value, PeriodArray): + raise TypeError( + "PeriodIndex.searchsorted requires either a Period or PeriodArray" + ) - @property - def is_all_dates(self): - return True + return self._data.searchsorted(value, side=side, sorter=sorter) @property - def is_full(self): + def is_full(self) -> bool: """ Returns True if this PeriodIndex is range-like in that all Periods between start and end are present, in order. @@ -591,7 +497,7 @@ def is_full(self): return ((values[1:] - values[:-1]) < 2).all() @property - def inferred_type(self): + def inferred_type(self) -> str: # b/c data is represented as ints make sure we can't have ambiguous # indexing return "period" @@ -603,7 +509,7 @@ def get_value(self, series, key): """ s = com.values_from_object(series) try: - return com.maybe_box(self, super().get_value(s, key), series, key) + value = super().get_value(s, key) except (KeyError, IndexError): if isinstance(key, str): asdt, parsed, reso = parse_time_string(key, self.freq) @@ -635,20 +541,19 @@ def get_value(self, series, key): period = Period(key, self.freq) key = period.value if isna(period) else period.ordinal return com.maybe_box(self, self._int64index.get_value(s, key), series, key) + else: + return com.maybe_box(self, value, series, key) @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) - if hasattr(target, "freq") and target.freq != self.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=target.freqstr, - ) - raise IncompatibleFrequency(msg) - if isinstance(target, PeriodIndex): + if target.freq != self.freq: + # No matches + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches + target = target.asi8 self_index = self._int64index else: @@ -663,39 +568,15 @@ def get_indexer_non_unique(self, target): target = ensure_index(target) if isinstance(target, PeriodIndex): + if target.freq != self.freq: + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + target = target.asi8 - if hasattr(target, "freq") and target.freq != self.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=target.freqstr, - ) - raise IncompatibleFrequency(msg) indexer, missing = self._int64index.get_indexer_non_unique(target) return ensure_platform_int(indexer), missing - def _get_unique_index(self, dropna=False): - """ - wrap Index._get_unique_index to handle NaT - """ - res = 
super()._get_unique_index(dropna=dropna) - if dropna: - res = res.dropna() - return res - - @Appender(Index.unique.__doc__) - def unique(self, level=None): - # override the Index.unique method for performance GH#23083 - if level is not None: - # this should never occur, but is retained to make the signature - # match Index.unique - self._validate_index_level(level) - - values = self._ndarray_values - result = unique1d(values) - return self._shallow_copy(result) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -717,7 +598,7 @@ def get_loc(self, key, method=None, tolerance=None): pass except DateParseError: # A string with invalid format - raise KeyError("Cannot interpret '{}' as period".format(key)) + raise KeyError(f"Cannot interpret '{key}' as period") try: key = Period(key, freq=self.freq) @@ -826,8 +707,7 @@ def _get_string_slice(self, key): t1, t2 = self._parsed_string_to_bounds(reso, parsed) return slice( - self.searchsorted(t1.ordinal, side="left"), - self.searchsorted(t2.ordinal, side="right"), + self.searchsorted(t1, side="left"), self.searchsorted(t2, side="right") ) def _convert_tolerance(self, tolerance, target): @@ -870,9 +750,8 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) return self._apply_meta(result), lidx, ridx return self._apply_meta(result) - @Appender(Index.intersection.__doc__) - def intersection(self, other, sort=False): - return Index.intersection(self, other, sort=sort) + # ------------------------------------------------------------------------ + # Set Operation Methods def _assert_can_do_setop(self, other): super()._assert_can_do_setop(other) @@ -880,107 +759,79 @@ def _assert_can_do_setop(self, other): # *Can't* use PeriodIndexes of different freqs # *Can* use PeriodIndex/DatetimeIndex if isinstance(other, PeriodIndex) and self.freq != other.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(self, other) - def _wrap_setop_result(self, other, result): - name = get_op_result_name(self, other) - result = self._apply_meta(result) - result.name = name - return result + def intersection(self, other, sort=False): + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + res_name = get_op_result_name(self, other) + other = ensure_index(other) - def _apply_meta(self, rawarr): - if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) - return rawarr + if self.equals(other): + return self._get_reconciled_name_object(other) - def __setstate__(self, state): - """Necessary for making this object picklable""" + if not is_dtype_equal(self.dtype, other.dtype): + # TODO: fastpath for if we have a different PeriodDtype + this = self.astype("O") + other = other.astype("O") + return this.intersection(other, sort=sort) - if isinstance(state, dict): - super().__setstate__(state) + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self.intersection(i8other, sort=sort) - elif isinstance(state, tuple): + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result - # < 0.15 compat - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) + def difference(self, other, sort=None): + self._validate_sort_keyword(sort) + 
self._assert_can_do_setop(other) + res_name = get_op_result_name(self, other) + other = ensure_index(other) - # backcompat - freq = Period._maybe_convert_freq(own_state[1]) + if self.equals(other): + # pass an empty PeriodArray with the appropriate dtype + return self._shallow_copy(self._data[:0]) - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(self, state) - freq = None # ? + if is_object_dtype(other): + return self.astype(object).difference(other).astype(self.dtype) - data = PeriodArray(data, freq=freq) - self._data = data + elif not is_dtype_equal(self.dtype, other.dtype): + return self - else: - raise Exception("invalid pickle state") + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self.difference(i8other, sort=sort) - _unpickle_compat = __setstate__ + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result - @property - def flags(self): - """ return the ndarray.flags for the underlying data """ - warnings.warn( - "{obj}.flags is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, - stacklevel=2, - ) - return self._ndarray_values.flags + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) - def item(self): - """ - return the first element of the underlying data as a python - scalar + # We are called by `union`, which is responsible for this validation + assert isinstance(other, type(self)) - .. deprecated:: 0.25.0 + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype("O") + other = other.astype("O") + return this._union(other, sort=sort) - """ - warnings.warn( - "`item` has been deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) - # TODO(DatetimeArray): remove - if len(self) == 1: - return self[0] - else: - # TODO: is this still necessary? 
- # copy numpy's message here because Py26 raises an IndexError - raise ValueError("can only convert an array of size 1 to a Python scalar") + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self._union(i8other, sort=sort) - @property - def data(self): - """ return the data pointer of the underlying data """ - warnings.warn( - "{obj}.data is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, - stacklevel=2, - ) - return np.asarray(self._data).data + res_name = get_op_result_name(self, other) + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result - @property - def base(self): - """ return the base object if the memory of the underlying data is - shared - """ - warnings.warn( - "{obj}.base is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, - stacklevel=2, - ) - return np.asarray(self._data) + # ------------------------------------------------------------------------ + + def _apply_meta(self, rawarr): + if not isinstance(rawarr, PeriodIndex): + rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) + return rawarr def memory_usage(self, deep=False): result = super().memory_usage(deep=deep) @@ -989,36 +840,36 @@ def memory_usage(self, deep=False): return result -PeriodIndex._add_comparison_ops() PeriodIndex._add_numeric_methods_disabled() PeriodIndex._add_logical_methods_disabled() -PeriodIndex._add_datetimelike_methods() -def period_range(start=None, end=None, periods=None, freq=None, name=None): +def period_range( + start=None, end=None, periods=None, freq=None, name=None +) -> PeriodIndex: """ - Return a fixed frequency PeriodIndex, with day (calendar) as the default - frequency. + Return a fixed frequency PeriodIndex. + + The day (calendar) is the default frequency. Parameters ---------- start : str or period-like, default None - Left bound for generating periods + Left bound for generating periods. end : str or period-like, default None - Right bound for generating periods + Right bound for generating periods. periods : int, default None - Number of periods to generate + Number of periods to generate. freq : str or DateOffset, optional Frequency alias. By default the freq is taken from `start` or `end` if those are Period objects. Otherwise, the default is ``"D"`` for daily frequency. - name : str, default None - Name of the resulting PeriodIndex + Name of the resulting PeriodIndex. Returns ------- - prng : PeriodIndex + PeriodIndex Notes ----- @@ -1026,7 +877,7 @@ def period_range(start=None, end=None, periods=None, freq=None, name=None): must be specified. To learn more about the frequency strings, please see `this link - `__. + `__. 
Examples -------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 6e2d500f4c5ab..b4cc71a25792f 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Union +from typing import Optional, Union import warnings import numpy as np @@ -14,21 +14,21 @@ from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, - is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_scalar, is_timedelta64_dtype, ) -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCTimedeltaIndex +from pandas.core.dtypes.generic import ABCTimedeltaIndex from pandas.core import ops import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.io.formats.printing import pprint_thing @@ -51,7 +51,7 @@ class RangeIndex(Int64Index): stop : int (default: 0) step : int (default: 1) name : object, optional - Name to be stored in the index + Name to be stored in the index. copy : bool, default False Unused, accepted for homogeneity with other index types. @@ -73,39 +73,22 @@ class RangeIndex(Int64Index): _typ = "rangeindex" _engine_type = libindex.Int64Engine - _range = None # type: range + _range: range # check whether self._data has been called - _cached_data = None # type: np.ndarray + _cached_data: Optional[np.ndarray] = None # -------------------------------------------------------------------- # Constructors def __new__( - cls, - start=None, - stop=None, - step=None, - dtype=None, - copy=False, - name=None, - fastpath=None, + cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, ): - if fastpath is not None: - warnings.warn( - "The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, - stacklevel=2, - ) - if fastpath: - return cls._simple_new(range(start, stop, step), name=name) - cls._validate_dtype(dtype) + name = maybe_extract_name(name, start, cls) # RangeIndex if isinstance(start, RangeIndex): - name = start.name if name is None else name start = start._range return cls._simple_new(start, dtype=dtype, name=name) @@ -138,15 +121,15 @@ def from_range(cls, data, name=None, dtype=None): """ if not isinstance(data, range): raise TypeError( - "{0}(...) must be called with object coercible to a " - "range, {1} was passed".format(cls.__name__, repr(data)) + f"{cls.__name__}(...) 
must be called with object coercible to a " + f"range, {repr(data)} was passed" ) cls._validate_dtype(dtype) return cls._simple_new(data, dtype=dtype, name=name) @classmethod - def _simple_new(cls, values, name=None, dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None): result = object.__new__(cls) # handle passed None, non-integers @@ -154,25 +137,16 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): # empty values = range(0, 0, 1) elif not isinstance(values, range): - return Index(values, dtype=dtype, name=name, **kwargs) + return Index(values, dtype=dtype, name=name) result._range = values - result.name = name - for k, v in kwargs.items(): - setattr(result, k, v) result._reset_identity() return result # -------------------------------------------------------------------- - @staticmethod - def _validate_dtype(dtype): - """ require dtype to be None or int64 """ - if not (dtype is None or is_int64_dtype(dtype)): - raise TypeError("Invalid to pass a non-int64 dtype to RangeIndex") - @cache_readonly def _constructor(self): """ return the class to use for construction """ @@ -205,7 +179,7 @@ def _get_data_as_items(self): def __reduce__(self): d = self._get_attributes_dict() d.update(dict(self._get_data_as_items())) - return ibase._new_Index, (self.__class__, d), None + return ibase._new_Index, (type(self), d), None # -------------------------------------------------------------------- # Rendering Methods @@ -251,7 +225,7 @@ def _start(self): """ warnings.warn( self._deprecation_message.format("_start", "start"), - DeprecationWarning, + FutureWarning, stacklevel=2, ) return self.start @@ -274,7 +248,7 @@ def _stop(self): # GH 25710 warnings.warn( self._deprecation_message.format("_stop", "stop"), - DeprecationWarning, + FutureWarning, stacklevel=2, ) return self.stop @@ -298,13 +272,13 @@ def _step(self): # GH 25710 warnings.warn( self._deprecation_message.format("_step", "step"), - DeprecationWarning, + FutureWarning, stacklevel=2, ) return self.step @cache_readonly - def nbytes(self): + def nbytes(self) -> int: """ Return the number of bytes in the underlying data. 
""" @@ -314,7 +288,7 @@ def nbytes(self): for attr_name in ["start", "stop", "step"] ) - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: """ Memory usage of my values @@ -340,24 +314,24 @@ def memory_usage(self, deep=False): return self.nbytes @property - def dtype(self): + def dtype(self) -> np.dtype: return np.dtype(np.int64) @property - def is_unique(self): + def is_unique(self) -> bool: """ return if the index has unique values """ return True @cache_readonly - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: return self._range.step > 0 or len(self) <= 1 @cache_readonly - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: return self._range.step < 0 or len(self) <= 1 @property - def has_duplicates(self): + def has_duplicates(self) -> bool: return False def __contains__(self, key: Union[int, np.integer]) -> bool: @@ -618,27 +592,27 @@ def _union(self, other, sort): and (start_s - end_o) <= step_s and (start_o - end_s) <= step_s ): - return self.__class__(start_r, end_r + step_s, step_s) + return type(self)(start_r, end_r + step_s, step_s) if ( (step_s % 2 == 0) and (abs(start_s - start_o) <= step_s / 2) and (abs(end_s - end_o) <= step_s / 2) ): - return self.__class__(start_r, end_r + step_s / 2, step_s / 2) + return type(self)(start_r, end_r + step_s / 2, step_s / 2) elif step_o % step_s == 0: if ( (start_o - start_s) % step_s == 0 and (start_o + step_s >= start_s) and (end_o - step_s <= end_s) ): - return self.__class__(start_r, end_r + step_s, step_s) + return type(self)(start_r, end_r + step_s, step_s) elif step_s % step_o == 0: if ( (start_s - start_o) % step_o == 0 and (start_s + step_o >= start_o) and (end_s - step_o <= end_o) ): - return self.__class__(start_r, end_r + step_o, step_o) + return type(self)(start_r, end_r + step_o, step_o) return self._int64index._union(other, sort=sort) @Appender(_index_shared_docs["join"]) @@ -663,7 +637,7 @@ def _concat_same_dtype(self, indexes, name): non_empty_indexes = [obj for obj in indexes if len(obj)] for obj in non_empty_indexes: - rng = obj._range # type: range + rng: range = obj._range if start is None: # This is set by the first non-empty index @@ -698,14 +672,14 @@ def _concat_same_dtype(self, indexes, name): # In this case return an empty range index. 
return RangeIndex(0, 0).rename(name) - def __len__(self): + def __len__(self) -> int: """ return the length of the RangeIndex """ return len(self._range) @property - def size(self): + def size(self) -> int: return len(self) def __getitem__(self, key): @@ -721,8 +695,7 @@ def __getitem__(self, key): return self._range[new_key] except IndexError: raise IndexError( - "index {key} is out of bounds for axis 0 " - "with size {size}".format(key=key, size=len(self)) + f"index {key} is out of bounds for axis 0 with size {len(self)}" ) elif is_scalar(key): raise IndexError( @@ -734,9 +707,8 @@ def __getitem__(self, key): # fall back to Int64Index return super().__getitem__(key) + @unpack_zerodim_and_defer("__floordiv__") def __floordiv__(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented if is_integer(other) and other != 0: if len(self) == 0 or self.start % other == 0 and self.step % other == 0: @@ -772,10 +744,9 @@ def _make_evaluate_binop(op, step=False): if False, use the existing step """ + @unpack_zerodim_and_defer(op.__name__) def _evaluate_numeric_binop(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - elif isinstance(other, ABCTimedeltaIndex): + if isinstance(other, ABCTimedeltaIndex): # Defer to TimedeltaIndex implementation return NotImplemented elif isinstance(other, (timedelta, np.timedelta64)): @@ -809,7 +780,7 @@ def _evaluate_numeric_binop(self, other): rstart = op(left.start, right) rstop = op(left.stop, right) - result = self.__class__(rstart, rstop, rstep, **attrs) + result = type(self)(rstart, rstop, rstep, **attrs) # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return @@ -824,7 +795,7 @@ def _evaluate_numeric_binop(self, other): return op(self._int64index, other) # TODO: Do attrs get handled reliably? 
- name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return compat.set_function_name(_evaluate_numeric_binop, name, cls) cls.__add__ = _make_evaluate_binop(operator.add) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 2324b8cf74c46..1f3182bc83e1d 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,15 +1,13 @@ """ implement the TimedeltaIndex """ from datetime import datetime -import warnings import numpy as np -from pandas._libs import NaT, Timedelta, index as libindex, join as libjoin, lib +from pandas._libs import NaT, Timedelta, index as libindex from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import ( _TD_DTYPE, - ensure_int64, is_float, is_integer, is_list_like, @@ -18,22 +16,20 @@ is_timedelta64_ns_dtype, pandas_dtype, ) -from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core.accessor import delegate_names from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.timedeltas import TimedeltaArray, _is_convertible_to_td from pandas.core.base import _shared_docs import pandas.core.common as com -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name from pandas.core.indexes.datetimelike import ( DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, - ea_passthrough, + DatetimeTimedeltaMixin, ) -from pandas.core.indexes.numeric import Int64Index -from pandas.core.ops import get_op_result_name +from pandas.core.indexes.extension import inherit_names from pandas.tseries.frequencies import to_offset @@ -43,18 +39,28 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): # Some are "raw" methods, the result is not re-boxed in an Index # We also have a few "extra" attrs, which may or may not be raw, # which we don't want to expose in the .dt accessor. - _delegate_class = TimedeltaArray - _delegated_properties = TimedeltaArray._datetimelike_ops + ["components"] - _delegated_methods = TimedeltaArray._datetimelike_methods + [ - "_box_values", - "__neg__", - "__pos__", - "__abs__", - ] - _raw_properties = {"components"} - _raw_methods = {"to_pytimedelta"} + _raw_properties = {"components", "_box_func"} + _raw_methods = {"to_pytimedelta", "sum", "std", "median", "_format_native_types"} + + _delegated_properties = TimedeltaArray._datetimelike_ops + list(_raw_properties) + _delegated_methods = ( + TimedeltaArray._datetimelike_methods + + list(_raw_methods) + + ["_box_values", "__neg__", "__pos__", "__abs__"] + ) +@inherit_names( + [ + "_bool_ops", + "_object_ops", + "_field_ops", + "_datetimelike_ops", + "_datetimelike_methods", + "_other_ops", + ], + TimedeltaArray, +) @delegate_names( TimedeltaArray, TimedeltaDelegateMixin._delegated_properties, typ="property" ) @@ -65,7 +71,7 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): overwrite=True, ) class TimedeltaIndex( - DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, TimedeltaDelegateMixin + DatetimeTimedeltaMixin, dtl.TimelikeOps, TimedeltaDelegateMixin, ): """ Immutable ndarray of timedelta64 data, represented internally as int64, and @@ -83,30 +89,6 @@ class TimedeltaIndex( inferred frequency upon creation. copy : bool Make a copy of input ndarray. 
- start : starting value, timedelta-like, optional - If data is None, start is used as the start point in generating regular - timedelta data. - - .. deprecated:: 0.24.0 - - periods : int, optional, > 0 - Number of periods to generate, if generating index. Takes precedence - over end argument. - - .. deprecated:: 0.24.0 - - end : end time, timedelta-like, optional - If periods is none, generated index will extend to first conforming - time on or just past end argument. - - .. deprecated:: 0.24. 0 - - closed : str or None, default None - Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None). - - .. deprecated:: 0.24. 0 - name : object Name to be stored in the index. @@ -140,24 +122,10 @@ class TimedeltaIndex( Notes ----- To learn more about the frequency strings, please see `this link - `__. - - Creating a TimedeltaIndex based on `start`, `periods`, and `end` has - been deprecated in favor of :func:`timedelta_range`. + `__. """ _typ = "timedeltaindex" - _join_precedence = 10 - - def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="m8[ns]", **kwargs) - - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) - _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique, with_indexers=False - ) _engine_type = libindex.TimedeltaEngine @@ -166,18 +134,6 @@ def _join_i8_wrapper(joinf, **kwargs): _is_numeric_dtype = True _infer_as_myclass = True - _freq = None - - _bool_ops = TimedeltaArray._bool_ops - _object_ops = TimedeltaArray._object_ops - _field_ops = TimedeltaArray._field_ops - _datetimelike_ops = TimedeltaArray._datetimelike_ops - _datetimelike_methods = TimedeltaArray._datetimelike_methods - _other_ops = TimedeltaArray._other_ops - sum = ea_passthrough(TimedeltaArray.sum) - std = ea_passthrough(TimedeltaArray.std) - median = ea_passthrough(TimedeltaArray.median) - # ------------------------------------------------------------------- # Constructors @@ -186,54 +142,23 @@ def __new__( data=None, unit=None, freq=None, - start=None, - end=None, - periods=None, closed=None, dtype=_TD_DTYPE, copy=False, name=None, - verify_integrity=None, ): - - if verify_integrity is not None: - warnings.warn( - "The 'verify_integrity' argument is deprecated, " - "will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - else: - verify_integrity = True - - if data is None: - freq, freq_infer = dtl.maybe_infer_freq(freq) - warnings.warn( - "Creating a TimedeltaIndex by passing range " - "endpoints is deprecated. Use " - "`pandas.timedelta_range` instead.", - FutureWarning, - stacklevel=2, - ) - result = TimedeltaArray._generate_range( - start, end, periods, freq, closed=closed - ) - return cls._simple_new(result._data, freq=freq, name=name) + name = maybe_extract_name(name, data, cls) if is_scalar(data): raise TypeError( - "{cls}() must be called with a " - "collection of some kind, {data} was passed".format( - cls=cls.__name__, data=repr(data) - ) + f"{cls.__name__}() must be called with a " + f"collection of some kind, {repr(data)} was passed" ) if unit in {"Y", "y", "M"}: - warnings.warn( - "M and Y units are deprecated and " - "will be removed in a future version.", - FutureWarning, - stacklevel=2, + raise ValueError( + "Units 'M' and 'Y' are no longer supported, as they do not " + "represent unambiguous timedelta values durations." 
) if isinstance(data, TimedeltaArray): @@ -270,24 +195,13 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): tdarr = TimedeltaArray._simple_new(values._data, freq=freq) result = object.__new__(cls) result._data = tdarr - result.name = name + result._name = name # For groupby perf. See note in indexes/base about _index_data result._index_data = tdarr._data result._reset_identity() return result - # ------------------------------------------------------------------- - - def __setstate__(self, state): - """Necessary for making this object picklable""" - if isinstance(state, dict): - super().__setstate__(state) - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - # ------------------------------------------------------------------- # Rendering Methods @@ -297,33 +211,6 @@ def _formatter_func(self): return _get_format_timedelta64(self, box=True) - def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import Timedelta64Formatter - - return np.asarray( - Timedelta64Formatter( - values=self, nat_rep=na_rep, justify="all" - ).get_result() - ) - - # ------------------------------------------------------------------- - # Wrapping TimedeltaArray - - # Compat for frequency inference, see GH#23789 - _is_monotonic_increasing = Index.is_monotonic_increasing - _is_monotonic_decreasing = Index.is_monotonic_decreasing - _is_unique = Index.is_unique - - @property - def _box_func(self): - return lambda x: Timedelta(x, unit="ns") - - def __getitem__(self, key): - result = self._data.__getitem__(key) - if is_scalar(result): - return result - return type(self)(result, name=self.name) - # ------------------------------------------------------------------- @Appender(_index_shared_docs["astype"]) @@ -339,137 +226,6 @@ def astype(self, dtype, copy=True): return Index(result.astype("i8"), name=self.name) return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy) - def _union(self, other, sort): - if len(other) == 0 or self.equals(other) or len(self) == 0: - return super()._union(other, sort=sort) - - if not isinstance(other, TimedeltaIndex): - try: - other = TimedeltaIndex(other) - except (TypeError, ValueError): - pass - this, other = self, other - - if this._can_fast_union(other): - return this._fast_union(other) - else: - result = Index._union(this, other, sort=sort) - if isinstance(result, TimedeltaIndex): - if result.freq is None: - result.freq = to_offset(result.inferred_freq) - return result - - def join(self, other, how="left", level=None, return_indexers=False, sort=False): - """ - See Index.join - """ - if _is_convertible_to_index(other): - try: - other = TimedeltaIndex(other) - except (TypeError, ValueError): - pass - - return Index.join( - self, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - - def intersection(self, other, sort=False): - """ - Specialized intersection for TimedeltaIndex objects. - May be much faster than Index.intersection - - Parameters - ---------- - other : TimedeltaIndex or array-like - sort : False or None, default False - Sort the resulting index if possible. - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default to ``False`` to match the behaviour - from before 0.24.0. - - .. 
versionchanged:: 0.25.0 - - The `sort` keyword is added - - Returns - ------- - y : Index or TimedeltaIndex - """ - return super().intersection(other, sort=sort) - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if ( - isinstance(other, TimedeltaIndex) - and self.freq == other.freq - and self._can_fast_union(other) - ): - joined = self._shallow_copy(joined, name=name) - return joined - else: - return self._simple_new(joined, name) - - def _can_fast_union(self, other): - if not isinstance(other, TimedeltaIndex): - return False - - freq = self.freq - - if freq is None or freq != other.freq: - return False - - if not self.is_monotonic or not other.is_monotonic: - return False - - if len(self) == 0 or len(other) == 0: - return True - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - right_start = right[0] - left_end = left[-1] - - # Only need to "adjoin", not overlap - return (right_start == left_end + freq) or right_start in left - - def _fast_union(self, other): - if len(other) == 0: - return self.view(type(self)) - - if len(self) == 0: - return other.view(type(self)) - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - left_end = left[-1] - right_end = right[-1] - - # concatenate - if left_end < right_end: - loc = right.searchsorted(left_end, side="right") - right_chunk = right.values[loc:] - dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - else: - return left - def _maybe_promote(self, other): if other.inferred_type == "timedelta": other = TimedeltaIndex(other) @@ -486,7 +242,7 @@ def get_value(self, series, key): return self.get_value_maybe_box(series, key) try: - return com.maybe_box(self, Index.get_value(self, series, key), series, key) + value = Index.get_value(self, series, key) except KeyError: try: loc = self._get_string_slice(key) @@ -498,10 +254,10 @@ def get_value(self, series, key): return self.get_value_maybe_box(series, key) except (TypeError, ValueError, KeyError): raise KeyError(key) + else: + return com.maybe_box(self, value, series, key) - def get_value_maybe_box(self, series, key): - if not isinstance(key, Timedelta): - key = Timedelta(key) + def get_value_maybe_box(self, series, key: Timedelta): values = self._engine.get_value(com.values_from_object(series), key) return com.maybe_box(self, values, series, key) @@ -592,23 +348,33 @@ def _partial_td_slice(self, key): @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): if isinstance(value, (np.ndarray, Index)): - value = np.array(value, dtype=_TD_DTYPE, copy=False) - else: - value = Timedelta(value).asm8.view(_TD_DTYPE) + if not type(self._data)._is_recognized_dtype(value): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + value = type(self._data)(value) + self._data._check_compatible_with(value) + + elif isinstance(value, self._data._recognized_scalars): + self._data._check_compatible_with(value) + value = self._data._scalar_type(value) + + elif not isinstance(value, TimedeltaArray): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) - return self.values.searchsorted(value, side=side, sorter=sorter) + return self._data.searchsorted(value, side=side, sorter=sorter) - def is_type_compatible(self, 
typ): + def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "timedelta" @property - def inferred_type(self): + def inferred_type(self) -> str: return "timedelta64" - @property - def is_all_dates(self): - return True - def insert(self, loc, item): """ Make new Index inserting new item at location @@ -625,95 +391,52 @@ def insert(self, loc, item): new_index : Index """ # try to convert if possible - if _is_convertible_to_td(item): - try: - item = Timedelta(item) - except ValueError: - # e.g. str that can't be parsed to timedelta - pass - elif is_scalar(item) and isna(item): + if isinstance(item, self._data._recognized_scalars): + item = self._data._scalar_type(item) + elif is_valid_nat_for_dtype(item, self.dtype): # GH 18295 item = self._na_value + elif is_scalar(item) and isna(item): + # i.e. datetime64("NaT") + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) freq = None - if isinstance(item, Timedelta) or (is_scalar(item) and isna(item)): + if isinstance(item, self._data._scalar_type) or item is NaT: + self._data._check_compatible_with(item, setitem=True) # check freq can be preserved on edge cases - if self.freq is not None: - if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + if self.size and self.freq is not None: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq - item = Timedelta(item).asm8.view(_TD_DTYPE) + item = item.asm8 try: - new_tds = np.concatenate( + new_i8s = np.concatenate( (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) ) - return self._shallow_copy(new_tds, freq=freq) - + return self._shallow_copy(new_i8s, freq=freq) except (AttributeError, TypeError): # fall back to object index if isinstance(item, str): return self.astype(object).insert(loc, item) - raise TypeError("cannot insert TimedeltaIndex with incompatible label") - - def delete(self, loc): - """ - Make a new TimedeltaIndex with passed location(s) deleted. - - Parameters - ---------- - loc: int, slice or array of ints - Indicate which sub-arrays to remove. - - Returns - ------- - new_index : TimedeltaIndex - """ - new_tds = np.delete(self.asi8, loc) - - freq = "infer" - if is_integer(loc): - if loc in (0, -len(self), -1, len(self) - 1): - freq = self.freq - else: - if is_list_like(loc): - loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) - if isinstance(loc, slice) and loc.step in (1, None): - if loc.start in (0, None) or loc.stop in (len(self), None): - freq = self.freq - - return TimedeltaIndex(new_tds, name=self.name, freq=freq) + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) -TimedeltaIndex._add_comparison_ops() TimedeltaIndex._add_logical_methods_disabled() -TimedeltaIndex._add_datetimelike_methods() - - -def _is_convertible_to_index(other): - """ - return a boolean whether I can attempt conversion to a TimedeltaIndex - """ - if isinstance(other, TimedeltaIndex): - return True - elif len(other) > 0 and other.inferred_type not in ( - "floating", - "mixed-integer", - "integer", - "integer-na", - "mixed-integer-float", - "mixed", - ): - return True - return False def timedelta_range( start=None, end=None, periods=None, freq=None, name=None, closed=None -): +) -> TimedeltaIndex: """ Return a fixed frequency TimedeltaIndex, with day as the default frequency. 
@@ -746,7 +469,7 @@ def timedelta_range( ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 44c786f003369..ea59a6a49e649 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,6 +1,4 @@ -import textwrap -from typing import Tuple -import warnings +from typing import Hashable, List, Tuple, Union import numpy as np @@ -10,10 +8,8 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - ensure_platform_int, is_float, is_integer, - is_integer_dtype, is_iterator, is_list_like, is_numeric_dtype, @@ -26,21 +22,12 @@ from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.index import Index, InvalidIndexError -from pandas.core.indexers import is_list_like_indexer, length_of_indexer - - -# the supported indexers -def get_indexers_list(): - - return [ - ("ix", _IXIndexer), - ("iloc", _iLocIndexer), - ("loc", _LocIndexer), - ("at", _AtIndexer), - ("iat", _iAtIndexer), - ] - +from pandas.core.indexers import ( + check_bool_array_indexer, + is_list_like_indexer, + length_of_indexer, +) +from pandas.core.indexes.api import Index, InvalidIndexError # "null slice" _NS = slice(None, None) @@ -99,23 +86,500 @@ class IndexingError(Exception): pass +class IndexingMixin: + """Mixin for adding .loc/.iloc/.at/.iat to Datafames and Series. + """ + + @property + def iloc(self) -> "_iLocIndexer": + """ + Purely integer-location based indexing for selection by position. + + ``.iloc[]`` is primarily integer position based (from ``0`` to + ``length-1`` of the axis), but may also be used with a boolean + array. + + Allowed inputs are: + + - An integer, e.g. ``5``. + - A list or array of integers, e.g. ``[4, 3, 0]``. + - A slice object with ints, e.g. ``1:7``. + - A boolean array. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above). + This is useful in method chains, when you don't have a reference to the + calling object, but would like to base your selection on some value. + + ``.iloc`` will raise ``IndexError`` if a requested indexer is + out-of-bounds, except *slice* indexers which allow out-of-bounds + indexing (this conforms with python/numpy *slice* semantics). + + See more at :ref:`Selection by Position `. + + See Also + -------- + DataFrame.iat : Fast integer location scalar accessor. + DataFrame.loc : Purely label-location based indexer for selection by label. + Series.iloc : Purely integer-location based indexing for + selection by position. + + Examples + -------- + + >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, + ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, + ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] + >>> df = pd.DataFrame(mydict) + >>> df + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + **Indexing just the rows** + + With a scalar integer. + + >>> type(df.iloc[0]) + + >>> df.iloc[0] + a 1 + b 2 + c 3 + d 4 + Name: 0, dtype: int64 + + With a list of integers. + + >>> df.iloc[[0]] + a b c d + 0 1 2 3 4 + >>> type(df.iloc[[0]]) + + + >>> df.iloc[[0, 1]] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + + With a `slice` object. + + >>> df.iloc[:3] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + With a boolean mask the same length as the index. 
+ + >>> df.iloc[[True, False, True]] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + With a callable, useful in method chains. The `x` passed + to the ``lambda`` is the DataFrame being sliced. This selects + the rows whose index label even. + + >>> df.iloc[lambda x: x.index % 2 == 0] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + **Indexing both axes** + + You can mix the indexer types for the index and columns. Use ``:`` to + select the entire axis. + + With scalar integers. + + >>> df.iloc[0, 1] + 2 + + With lists of integers. + + >>> df.iloc[[0, 2], [1, 3]] + b d + 0 2 4 + 2 2000 4000 + + With `slice` objects. + + >>> df.iloc[1:3, 0:3] + a b c + 1 100 200 300 + 2 1000 2000 3000 + + With a boolean array whose length matches the columns. + + >>> df.iloc[:, [True, False, True, False]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + + With a callable function that expects the Series or DataFrame. + + >>> df.iloc[:, lambda df: [0, 2]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + """ + return _iLocIndexer("iloc", self) + + @property + def loc(self) -> "_LocIndexer": + """ + Access a group of rows and columns by label(s) or a boolean array. + + ``.loc[]`` is primarily label based, but may also be used with a + boolean array. + + Allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is + interpreted as a *label* of the index, and **never** as an + integer position along the index). + - A list or array of labels, e.g. ``['a', 'b', 'c']``. + - A slice object with labels, e.g. ``'a':'f'``. + + .. warning:: Note that contrary to usual python slices, **both** the + start and the stop are included + + - A boolean array of the same length as the axis being sliced, + e.g. ``[True, False, True]``. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above) + + See more at :ref:`Selection by Label ` + + Raises + ------ + KeyError + If any items are not found. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.iloc : Access group of rows and columns by integer position(s). + DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the + Series/DataFrame. + Series.loc : Access group of values using labels. + + Examples + -------- + **Getting values** + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=['cobra', 'viper', 'sidewinder'], + ... columns=['max_speed', 'shield']) + >>> df + max_speed shield + cobra 1 2 + viper 4 5 + sidewinder 7 8 + + Single label. Note this returns the row as a Series. + + >>> df.loc['viper'] + max_speed 4 + shield 5 + Name: viper, dtype: int64 + + List of labels. Note using ``[[]]`` returns a DataFrame. + + >>> df.loc[['viper', 'sidewinder']] + max_speed shield + viper 4 5 + sidewinder 7 8 + + Single label for row and column + + >>> df.loc['cobra', 'shield'] + 2 + + Slice with labels for row and single label for column. As mentioned + above, note that both the start and stop of the slice are included. 
+ + >>> df.loc['cobra':'viper', 'max_speed'] + cobra 1 + viper 4 + Name: max_speed, dtype: int64 + + Boolean list with the same length as the row axis + + >>> df.loc[[False, False, True]] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series + + >>> df.loc[df['shield'] > 6] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series with column labels specified + + >>> df.loc[df['shield'] > 6, ['max_speed']] + max_speed + sidewinder 7 + + Callable that returns a boolean Series + + >>> df.loc[lambda df: df['shield'] == 8] + max_speed shield + sidewinder 7 8 + + **Setting values** + + Set value for all items matching the list of labels + + >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 + >>> df + max_speed shield + cobra 1 2 + viper 4 50 + sidewinder 7 50 + + Set value for an entire row + + >>> df.loc['cobra'] = 10 + >>> df + max_speed shield + cobra 10 10 + viper 4 50 + sidewinder 7 50 + + Set value for an entire column + + >>> df.loc[:, 'max_speed'] = 30 + >>> df + max_speed shield + cobra 30 10 + viper 30 50 + sidewinder 30 50 + + Set value for rows matching callable condition + + >>> df.loc[df['shield'] > 35] = 0 + >>> df + max_speed shield + cobra 30 10 + viper 0 0 + sidewinder 0 0 + + **Getting values on a DataFrame with an index that has integer labels** + + Another example using integers for the index + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=[7, 8, 9], columns=['max_speed', 'shield']) + >>> df + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + Slice with integer labels for rows. As mentioned above, note that both + the start and stop of the slice are included. + + >>> df.loc[7:9] + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + **Getting values with a MultiIndex** + + A number of examples using a DataFrame with a MultiIndex + + >>> tuples = [ + ... ('cobra', 'mark i'), ('cobra', 'mark ii'), + ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), + ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ] + >>> index = pd.MultiIndex.from_tuples(tuples) + >>> values = [[12, 2], [0, 4], [10, 20], + ... [1, 4], [7, 1], [16, 36]] + >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) + >>> df + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Single label. Note this returns a DataFrame with a single index. + + >>> df.loc['cobra'] + max_speed shield + mark i 12 2 + mark ii 0 4 + + Single index tuple. Note this returns a Series. + + >>> df.loc[('cobra', 'mark ii')] + max_speed 0 + shield 4 + Name: (cobra, mark ii), dtype: int64 + + Single label for row and column. Similar to passing in a tuple, this + returns a Series. + + >>> df.loc['cobra', 'mark i'] + max_speed 12 + shield 2 + Name: (cobra, mark i), dtype: int64 + + Single tuple. Note using ``[[]]`` returns a DataFrame. 
+ + >>> df.loc[[('cobra', 'mark ii')]] + max_speed shield + cobra mark ii 0 4 + + Single tuple for the index with a single label for the column + + >>> df.loc[('cobra', 'mark i'), 'shield'] + 2 + + Slice from index tuple to single label + + >>> df.loc[('cobra', 'mark i'):'viper'] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Slice from index tuple to index tuple + + >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + """ + return _LocIndexer("loc", self) + + @property + def at(self) -> "_AtIndexer": + """ + Access a single value for a row/column label pair. + + Similar to ``loc``, in that both provide label-based lookups. Use + ``at`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + KeyError + If 'label' does not exist in DataFrame. + + See Also + -------- + DataFrame.iat : Access a single value for a row/column pair by integer + position. + DataFrame.loc : Access a group of rows and columns by label(s). + Series.at : Access a single value using a label. + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + Get value at specified row/column pair + + >>> df.at[4, 'B'] + 2 + + Set value at specified row/column pair + + >>> df.at[4, 'B'] = 10 + >>> df.at[4, 'B'] + 10 + + Get value within a Series + + >>> df.loc[5].at['B'] + 4 + """ + return _AtIndexer("at", self) + + @property + def iat(self) -> "_iAtIndexer": + """ + Access a single value for a row/column pair by integer position. + + Similar to ``iloc``, in that both provide integer-based lookups. Use + ``iat`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + IndexError + When integer position is out of bounds. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.loc : Access a group of rows and columns by label(s). + DataFrame.iloc : Access a group of rows and columns by integer position(s). + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... columns=['A', 'B', 'C']) + >>> df + A B C + 0 0 2 3 + 1 0 4 1 + 2 10 20 30 + + Get value at specified row/column pair + + >>> df.iat[1, 2] + 1 + + Set value at specified row/column pair + + >>> df.iat[1, 2] = 10 + >>> df.iat[1, 2] + 10 + + Get value within a series + + >>> df.loc[0].iat[1] + 2 + """ + return _iAtIndexer("iat", self) + + class _NDFrameIndexer(_NDFrameIndexerBase): - _valid_types = None # type: str - _exception = Exception + _valid_types: str axis = None def __call__(self, axis=None): # we need to return a copy of ourselves - new_self = self.__class__(self.name, self.obj) + new_self = type(self)(self.name, self.obj) if axis is not None: axis = self.obj._get_axis_number(axis) new_self.axis = axis return new_self - def __iter__(self): - raise NotImplementedError("ix is not iterable") - + # TODO: remove once geopandas no longer needs this def __getitem__(self, key): # Used in ix and downstream in geopandas _CoordinateIndexer if type(key) is tuple: @@ -213,23 +677,25 @@ def _validate_key(self, key, axis: int): Parameters ---------- key : scalar, slice or list-like - The key requested + Key requested. 
axis : int - Dimension on which the indexing is being made + Dimension on which the indexing is being made. Raises ------ TypeError - If the key (or some element of it) has wrong type + If the key (or some element of it) has wrong type. IndexError - If the key (or some element of it) is out of bounds + If the key (or some element of it) is out of bounds. KeyError - If the key was not found + If the key was not found. """ raise AbstractMethodError(self) def _has_valid_tuple(self, key: Tuple): - """ check the key for valid keys across my indexer """ + """ + Check the key for valid keys across my indexer. + """ for i, k in enumerate(key): if i >= self.ndim: raise IndexingError("Too many indexers") @@ -238,10 +704,15 @@ def _has_valid_tuple(self, key: Tuple): except ValueError: raise ValueError( "Location based indexing can only have " - "[{types}] types".format(types=self._valid_types) + f"[{self._valid_types}] types" ) - def _is_nested_tuple_indexer(self, tup: Tuple): + def _is_nested_tuple_indexer(self, tup: Tuple) -> bool: + """ + Returns + ------- + bool + """ if any(isinstance(ax, ABCMultiIndex) for ax in self.obj.axes): return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) return False @@ -274,15 +745,20 @@ def _convert_slice_indexer(self, key: slice, axis: int): ax = self.obj._get_axis(min(axis, self.ndim - 1)) return ax._convert_slice_indexer(key, kind=self.name) - def _has_valid_setitem_indexer(self, indexer): + def _has_valid_setitem_indexer(self, indexer) -> bool: return True - def _has_valid_positional_setitem_indexer(self, indexer): - """ validate that an positional indexer cannot enlarge its target - will raise if needed, does not modify the indexer externally + def _has_valid_positional_setitem_indexer(self, indexer) -> bool: + """ + Validate that a positional indexer cannot enlarge its target + will raise if needed, does not modify the indexer externally. + + Returns + ------- + bool """ if isinstance(indexer, dict): - raise IndexError("{0} cannot enlarge its target object".format(self.name)) + raise IndexError(f"{self.name} cannot enlarge its target object") else: if not isinstance(indexer, tuple): indexer = _tuplify(self.ndim, indexer) @@ -296,13 +772,10 @@ def _has_valid_positional_setitem_indexer(self, indexer): elif is_integer(i): if i >= len(ax): raise IndexError( - "{name} cannot enlarge its target " - "object".format(name=self.name) + f"{self.name} cannot enlarge its target object" ) elif isinstance(i, dict): - raise IndexError( - "{name} cannot enlarge its target object".format(name=self.name) - ) + raise IndexError(f"{self.name} cannot enlarge its target object") return True @@ -320,7 +793,7 @@ def _setitem_with_indexer(self, indexer, value): # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value if not take_split_path and self.obj._data.blocks: - blk, = self.obj._data.blocks + (blk,) = self.obj._data.blocks if 1 < blk.ndim: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value take_split_path = not blk._can_hold_element(val) @@ -655,10 +1128,9 @@ def _align_series(self, indexer, ser: ABCSeries, multiindex_indexer: bool = Fals Parameters ---------- indexer : tuple, slice, scalar - The indexer used to get the locations that will be set to - `ser` + Indexer used to get the locations that will be set to `ser`. 
ser : pd.Series - The values to assign to the locations specified by `indexer` + Values to assign to the locations specified by `indexer`. multiindex_indexer : boolean, optional Defaults to False. Should be set to True if `indexer` was from a `pd.MultiIndex`, to avoid unnecessary broadcasting. @@ -817,20 +1289,23 @@ def _getitem_tuple(self, tup: Tuple): return retval - def _multi_take_opportunity(self, tup: Tuple): + def _multi_take_opportunity(self, tup: Tuple) -> bool: """ Check whether there is the possibility to use ``_multi_take``. - Currently the limit is that all axes being indexed must be indexed with + + Currently the limit is that all axes being indexed must be indexed with list-likes. Parameters ---------- tup : tuple - Tuple of indexers, one per axis + Tuple of indexers, one per axis. Returns ------- - boolean: Whether the current indexing can be passed through _multi_take + bool + Whether the current indexing + can be passed through `_multi_take`. """ if not all(is_list_like_indexer(x) for x in tup): return False @@ -843,14 +1318,15 @@ def _multi_take_opportunity(self, tup: Tuple): def _multi_take(self, tup: Tuple): """ - Create the indexers for the passed tuple of keys, and execute the take - operation. This allows the take operation to be executed all at once - - rather than once for each dimension - improving efficiency. + Create the indexers for the passed tuple of keys, and + execute the take operation. This allows the take operation to be + executed all at once, rather than once for each dimension, + improving efficiency. Parameters ---------- tup : tuple - Tuple of indexers, one per axis + Tuple of indexers, one per axis. Returns ------- @@ -881,14 +1357,6 @@ def _handle_lowerdim_multi_index_axis0(self, tup: Tuple): # else IndexingError will be raised if len(tup) <= self.obj.index.nlevels and len(tup) > self.ndim: raise ek - except Exception as e1: - if isinstance(tup[0], (slice, Index)): - raise IndexingError("Handle elsewhere") - - # raise the error if we are not sorted - ax0 = self.obj._get_axis(0) - if not ax0.is_lexsorted_for_tuple(tup): - raise e1 return None @@ -915,9 +1383,6 @@ def _getitem_lowerdim(self, tup: Tuple): if len(tup) > self.ndim: raise IndexingError("Too many indexers. handle elsewhere") - # to avoid wasted computation - # df.ix[d1:d2, 0] -> columns first (True) - # df.ix[0, ['C', 'B', A']] -> rows first (False) for i, key in enumerate(tup): if is_label_like(key) or isinstance(key, tuple): section = self._getitem_axis(key, axis=i) @@ -998,6 +1463,7 @@ def _getitem_nested_tuple(self, tup: Tuple): return obj + # TODO: remove once geopandas no longer needs __getitem__ def _getitem_axis(self, key, axis: int): if is_iterator(key): key = list(key) @@ -1040,13 +1506,13 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): Parameters ---------- key : list-like - Target labels + Targeted labels. axis: int - Dimension on which the indexing is being made - raise_missing: bool - Whether to raise a KeyError if some labels are not found. Will be - removed in the future, and then this method will always behave as - if raise_missing=True. + Dimension on which the indexing is being made. + raise_missing: bool, default False + Whether to raise a KeyError if some labels were not found. + Will be removed in the future, and then this method will always behave as + if ``raise_missing=True``.
Raises ------ @@ -1057,9 +1523,9 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): Returns ------- keyarr: Index - New index (coinciding with 'key' if the axis is unique) + New index (coinciding with 'key' if the axis is unique). values : array-like - An indexer for the return object; -1 denotes keys not found + Indexer for the return object, -1 denotes keys not found. """ o = self.obj ax = o._get_axis(axis) @@ -1089,15 +1555,16 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): def _getitem_iterable(self, key, axis: int): """ - Index current object with an an iterable key (which can be a boolean - indexer, or a collection of keys). + Index current object with an an iterable key. + + The iterable key can be a boolean indexer or a collection of keys. Parameters ---------- key : iterable - Target labels, or boolean indexer + Targeted labels or boolean indexer. axis: int - Dimension on which the indexing is being made + Dimension on which the indexing is being made. Raises ------ @@ -1110,7 +1577,7 @@ def _getitem_iterable(self, key, axis: int): Returns ------- - scalar, DataFrame, or Series: indexed value(s), + scalar, DataFrame, or Series: indexed value(s). """ # caller is responsible for ensuring non-None axis self._validate_key(key, axis) @@ -1120,7 +1587,7 @@ def _getitem_iterable(self, key, axis: int): if com.is_bool_indexer(key): # A boolean indexer key = check_bool_indexer(labels, key) - inds, = key.nonzero() + (inds,) = key.nonzero() return self.obj.take(inds, axis=axis) else: # A collection of keys @@ -1133,17 +1600,20 @@ def _validate_read_indexer( self, key, indexer, axis: int, raise_missing: bool = False ): """ - Check that indexer can be used to return a result (e.g. at least one - element was found, unless the list of keys was actually empty). + Check that indexer can be used to return a result. + + e.g. at least one element was found, + unless the list of keys was actually empty. Parameters ---------- key : list-like - Target labels (only used to show correct error message) + Targeted labels (only used to show correct error message). indexer: array-like of booleans - Indices corresponding to the key (with -1 indicating not found) + Indices corresponding to the key, + (with -1 indicating not found). axis: int - Dimension on which the indexing is being made + Dimension on which the indexing is being made. raise_missing: bool Whether to raise a KeyError if some labels are not found. Will be removed in the future, and then this method will always behave as @@ -1155,7 +1625,6 @@ def _validate_read_indexer( If at least one key was requested but none was found, and raise_missing=True. """ - ax = self.obj._get_axis(axis) if len(key) == 0: @@ -1166,40 +1635,31 @@ def _validate_read_indexer( if missing: if missing == len(indexer): - raise KeyError( - "None of [{key}] are in the [{axis}]".format( - key=key, axis=self.obj._get_axis_name(axis) - ) - ) + axis_name = self.obj._get_axis_name(axis) + raise KeyError(f"None of [{key}] are in the [{axis_name}]") # We (temporarily) allow for some missing keys with .loc, except in # some cases (e.g. 
setting) in which "raise_missing" will be False if not (self.name == "loc" and not raise_missing): not_found = list(set(key) - set(ax)) - raise KeyError("{} not in index".format(not_found)) + raise KeyError(f"{not_found} not in index") # we skip the warning on Categorical/Interval # as this check is actually done (check for # non-missing values), but a bit later in the # code, so we want to avoid warning & then # just raising - - _missing_key_warning = textwrap.dedent( - """ - Passing list-likes to .loc or [] with any missing label will raise - KeyError in the future, you can use .reindex() as an alternative. - - See the documentation here: - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""" # noqa: E501 - ) - if not (ax.is_categorical() or ax.is_interval()): - warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6) + raise KeyError( + "Passing list-likes to .loc or [] with any missing labels " + "is no longer supported, see " + "https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 + ) def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): """ Convert indexing key into something we can use to do actual fancy - indexing on an ndarray + indexing on a ndarray. Examples ix[:5] -> slice(0, 5) @@ -1264,7 +1724,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): if com.is_bool_indexer(obj): obj = check_bool_indexer(labels, obj) - inds, = obj.nonzero() + (inds,) = obj.nonzero() return inds else: # When setting, missing keys are not allowed, even with .loc: @@ -1289,104 +1749,7 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): return self._slice(indexer, axis=axis, kind="iloc") -class _IXIndexer(_NDFrameIndexer): - """ - A primarily label-location based indexer, with integer position fallback. - - Warning: Starting in 0.20.0, the .ix indexer is deprecated, in - favor of the more strict .iloc and .loc indexers. - - ``.ix[]`` supports mixed integer and label based access. It is - primarily label based, but will fall back to integer positional - access unless the corresponding axis is of integer type. - - ``.ix`` is the most general indexer and will support any of the - inputs in ``.loc`` and ``.iloc``. ``.ix`` also supports floating - point label schemes. ``.ix`` is exceptionally useful when dealing - with mixed positional and label based hierarchical indexes. - - However, when an axis is integer based, ONLY label based access - and not positional access is supported. Thus, in such cases, it's - usually better to be explicit and use ``.iloc`` or ``.loc``. - - See more at :ref:`Advanced Indexing `. - """ - - _ix_deprecation_warning = textwrap.dedent( - """ - .ix is deprecated. 
Please use - .loc for label based indexing or - .iloc for positional indexing - - See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""" # noqa: E501 - ) - - def __init__(self, name, obj): - warnings.warn(self._ix_deprecation_warning, FutureWarning, stacklevel=2) - super().__init__(name, obj) - - @Appender(_NDFrameIndexer._validate_key.__doc__) - def _validate_key(self, key, axis: int): - if isinstance(key, slice): - return True - - elif com.is_bool_indexer(key): - return True - - elif is_list_like_indexer(key): - return True - - else: - - self._convert_scalar_indexer(key, axis) - - return True - - def _convert_for_reindex(self, key, axis: int): - """ - Transform a list of keys into a new array ready to be used as axis of - the object we return (e.g. including NaNs). - - Parameters - ---------- - key : list-like - Target labels - axis: int - Where the indexing is being made - - Returns - ------- - list-like of labels - """ - labels = self.obj._get_axis(axis) - - if com.is_bool_indexer(key): - key = check_bool_indexer(labels, key) - return labels[key] - - if isinstance(key, Index): - keyarr = labels._convert_index_indexer(key) - else: - # asarray can be unsafe, NumPy strings are weird - keyarr = com.asarray_tuplesafe(key) - - if is_integer_dtype(keyarr): - # Cast the indexer to uint64 if possible so - # that the values returned from indexing are - # also uint64. - keyarr = labels._convert_arr_indexer(keyarr) - - if not labels.is_integer(): - keyarr = ensure_platform_int(keyarr) - return labels.take(keyarr) - - return keyarr - - class _LocationIndexer(_NDFrameIndexer): - _exception = Exception - def __getitem__(self, key): if type(key) is tuple: key = tuple(com.apply_if_callable(x, self.obj) for x in key) @@ -1417,13 +1780,12 @@ def _getbool_axis(self, key, axis: int): labels = self.obj._get_axis(axis) key = check_bool_indexer(labels, key) inds = key.nonzero()[0] - try: - return self.obj.take(inds, axis=axis) - except Exception as detail: - raise self._exception(detail) + return self.obj.take(inds, axis=axis) def _get_slice_axis(self, slice_obj: slice, axis: int): - """ this is pretty simple as we just have to deal with labels """ + """ + This is pretty simple as we just have to deal with labels. + """ # caller is responsible for ensuring non-None axis obj = self.obj if not need_slice(slice_obj): @@ -1442,250 +1804,13 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): return self.obj.take(indexer, axis=axis) +@Appender(IndexingMixin.loc.__doc__) class _LocIndexer(_LocationIndexer): - """ - Access a group of rows and columns by label(s) or a boolean array. - - ``.loc[]`` is primarily label based, but may also be used with a - boolean array. - - Allowed inputs are: - - - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is - interpreted as a *label* of the index, and **never** as an - integer position along the index). - - A list or array of labels, e.g. ``['a', 'b', 'c']``. - - A slice object with labels, e.g. ``'a':'f'``. - - .. warning:: Note that contrary to usual python slices, **both** the - start and the stop are included - - - A boolean array of the same length as the axis being sliced, - e.g. ``[True, False, True]``. 
- - A ``callable`` function with one argument (the calling Series or - DataFrame) and that returns valid output for indexing (one of the above) - - See more at :ref:`Selection by Label ` - - Raises - ------ - KeyError: - when any items are not found - - See Also - -------- - DataFrame.at : Access a single value for a row/column label pair. - DataFrame.iloc : Access group of rows and columns by integer position(s). - DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the - Series/DataFrame. - Series.loc : Access group of values using labels. - - Examples - -------- - **Getting values** - - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=['cobra', 'viper', 'sidewinder'], - ... columns=['max_speed', 'shield']) - >>> df - max_speed shield - cobra 1 2 - viper 4 5 - sidewinder 7 8 - - Single label. Note this returns the row as a Series. - - >>> df.loc['viper'] - max_speed 4 - shield 5 - Name: viper, dtype: int64 - - List of labels. Note using ``[[]]`` returns a DataFrame. - - >>> df.loc[['viper', 'sidewinder']] - max_speed shield - viper 4 5 - sidewinder 7 8 - - Single label for row and column - - >>> df.loc['cobra', 'shield'] - 2 - - Slice with labels for row and single label for column. As mentioned - above, note that both the start and stop of the slice are included. - - >>> df.loc['cobra':'viper', 'max_speed'] - cobra 1 - viper 4 - Name: max_speed, dtype: int64 - - Boolean list with the same length as the row axis - - >>> df.loc[[False, False, True]] - max_speed shield - sidewinder 7 8 - - Conditional that returns a boolean Series - - >>> df.loc[df['shield'] > 6] - max_speed shield - sidewinder 7 8 - - Conditional that returns a boolean Series with column labels specified - - >>> df.loc[df['shield'] > 6, ['max_speed']] - max_speed - sidewinder 7 - - Callable that returns a boolean Series - - >>> df.loc[lambda df: df['shield'] == 8] - max_speed shield - sidewinder 7 8 - - **Setting values** - - Set value for all items matching the list of labels - - >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 - >>> df - max_speed shield - cobra 1 2 - viper 4 50 - sidewinder 7 50 - - Set value for an entire row - - >>> df.loc['cobra'] = 10 - >>> df - max_speed shield - cobra 10 10 - viper 4 50 - sidewinder 7 50 - - Set value for an entire column - - >>> df.loc[:, 'max_speed'] = 30 - >>> df - max_speed shield - cobra 30 10 - viper 30 50 - sidewinder 30 50 - - Set value for rows matching callable condition - - >>> df.loc[df['shield'] > 35] = 0 - >>> df - max_speed shield - cobra 30 10 - viper 0 0 - sidewinder 0 0 - - **Getting values on a DataFrame with an index that has integer labels** - - Another example using integers for the index - - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=[7, 8, 9], columns=['max_speed', 'shield']) - >>> df - max_speed shield - 7 1 2 - 8 4 5 - 9 7 8 - - Slice with integer labels for rows. As mentioned above, note that both - the start and stop of the slice are included. - - >>> df.loc[7:9] - max_speed shield - 7 1 2 - 8 4 5 - 9 7 8 - - **Getting values with a MultiIndex** - - A number of examples using a DataFrame with a MultiIndex - - >>> tuples = [ - ... ('cobra', 'mark i'), ('cobra', 'mark ii'), - ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), - ... ('viper', 'mark ii'), ('viper', 'mark iii') - ... ] - >>> index = pd.MultiIndex.from_tuples(tuples) - >>> values = [[12, 2], [0, 4], [10, 20], - ... 
[1, 4], [7, 1], [16, 36]] - >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) - >>> df - max_speed shield - cobra mark i 12 2 - mark ii 0 4 - sidewinder mark i 10 20 - mark ii 1 4 - viper mark ii 7 1 - mark iii 16 36 - - Single label. Note this returns a DataFrame with a single index. - - >>> df.loc['cobra'] - max_speed shield - mark i 12 2 - mark ii 0 4 - - Single index tuple. Note this returns a Series. - - >>> df.loc[('cobra', 'mark ii')] - max_speed 0 - shield 4 - Name: (cobra, mark ii), dtype: int64 - - Single label for row and column. Similar to passing in a tuple, this - returns a Series. - - >>> df.loc['cobra', 'mark i'] - max_speed 12 - shield 2 - Name: (cobra, mark i), dtype: int64 - - Single tuple. Note using ``[[]]`` returns a DataFrame. - - >>> df.loc[[('cobra', 'mark ii')]] - max_speed shield - cobra mark ii 0 4 - - Single tuple for the index with a single label for the column - - >>> df.loc[('cobra', 'mark i'), 'shield'] - 2 - - Slice from index tuple to single label - - >>> df.loc[('cobra', 'mark i'):'viper'] - max_speed shield - cobra mark i 12 2 - mark ii 0 4 - sidewinder mark i 10 20 - mark ii 1 4 - viper mark ii 7 1 - mark iii 16 36 - - Slice from index tuple to index tuple - - >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] - max_speed shield - cobra mark i 12 2 - mark ii 0 4 - sidewinder mark i 10 20 - mark ii 1 4 - viper mark ii 7 1 - """ - _valid_types = ( "labels (MUST BE IN THE INDEX), slices of labels (BOTH " "endpoints included! Can be slices of integers if the " "index is integers), listlike of labels, boolean" ) - _exception = KeyError @Appender(_NDFrameIndexer._validate_key.__doc__) def _validate_key(self, key, axis: int): @@ -1704,7 +1829,12 @@ def _validate_key(self, key, axis: int): if not is_list_like_indexer(key): self._convert_scalar_indexer(key, axis) - def _is_scalar_access(self, key: Tuple): + def _is_scalar_access(self, key: Tuple) -> bool: + """ + Returns + ------- + bool + """ # this is a shortcut accessor to both .loc and .iloc # that provide the equivalent access of .at and .iat # a) avoid getting things via sections and (to minimize dtype changes) @@ -1737,8 +1867,12 @@ def _getitem_scalar(self, key): return values def _get_partial_string_timestamp_match_key(self, key, labels): - """Translate any partial string timestamp matches in key, returning the - new key (GH 10331)""" + """ + Translate any partial string timestamp matches in key, returning the + new key. + + (GH 10331) + """ if isinstance(labels, ABCMultiIndex): if ( isinstance(key, str) @@ -1830,147 +1964,12 @@ def _getitem_axis(self, key, axis: int): return self._get_label(key, axis=axis) +@Appender(IndexingMixin.iloc.__doc__) class _iLocIndexer(_LocationIndexer): - """ - Purely integer-location based indexing for selection by position. - - ``.iloc[]`` is primarily integer position based (from ``0`` to - ``length-1`` of the axis), but may also be used with a boolean - array. - - Allowed inputs are: - - - An integer, e.g. ``5``. - - A list or array of integers, e.g. ``[4, 3, 0]``. - - A slice object with ints, e.g. ``1:7``. - - A boolean array. - - A ``callable`` function with one argument (the calling Series or - DataFrame) and that returns valid output for indexing (one of the above). - This is useful in method chains, when you don't have a reference to the - calling object, but would like to base your selection on some value. 
- - ``.iloc`` will raise ``IndexError`` if a requested indexer is - out-of-bounds, except *slice* indexers which allow out-of-bounds - indexing (this conforms with python/numpy *slice* semantics). - - See more at :ref:`Selection by Position `. - - See Also - -------- - DataFrame.iat : Fast integer location scalar accessor. - DataFrame.loc : Purely label-location based indexer for selection by label. - Series.iloc : Purely integer-location based indexing for - selection by position. - - Examples - -------- - - >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, - ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, - ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] - >>> df = pd.DataFrame(mydict) - >>> df - a b c d - 0 1 2 3 4 - 1 100 200 300 400 - 2 1000 2000 3000 4000 - - **Indexing just the rows** - - With a scalar integer. - - >>> type(df.iloc[0]) - - >>> df.iloc[0] - a 1 - b 2 - c 3 - d 4 - Name: 0, dtype: int64 - - With a list of integers. - - >>> df.iloc[[0]] - a b c d - 0 1 2 3 4 - >>> type(df.iloc[[0]]) - - - >>> df.iloc[[0, 1]] - a b c d - 0 1 2 3 4 - 1 100 200 300 400 - - With a `slice` object. - - >>> df.iloc[:3] - a b c d - 0 1 2 3 4 - 1 100 200 300 400 - 2 1000 2000 3000 4000 - - With a boolean mask the same length as the index. - - >>> df.iloc[[True, False, True]] - a b c d - 0 1 2 3 4 - 2 1000 2000 3000 4000 - - With a callable, useful in method chains. The `x` passed - to the ``lambda`` is the DataFrame being sliced. This selects - the rows whose index label even. - - >>> df.iloc[lambda x: x.index % 2 == 0] - a b c d - 0 1 2 3 4 - 2 1000 2000 3000 4000 - - **Indexing both axes** - - You can mix the indexer types for the index and columns. Use ``:`` to - select the entire axis. - - With scalar integers. - - >>> df.iloc[0, 1] - 2 - - With lists of integers. - - >>> df.iloc[[0, 2], [1, 3]] - b d - 0 2 4 - 2 2000 4000 - - With `slice` objects. - - >>> df.iloc[1:3, 0:3] - a b c - 1 100 200 300 - 2 1000 2000 3000 - - With a boolean array whose length matches the columns. - - >>> df.iloc[:, [True, False, True, False]] - a c - 0 1 3 - 1 100 300 - 2 1000 3000 - - With a callable function that expects the Series or DataFrame. 
- - >>> df.iloc[:, lambda df: [0, 2]] - a c - 0 1 3 - 1 100 300 - 2 1000 3000 - """ - _valid_types = ( "integer, integer slice (START point is INCLUDED, END " "point is EXCLUDED), listlike of integers, boolean array" ) - _exception = IndexError _get_slice_axis = _NDFrameIndexer._get_slice_axis def _validate_key(self, key, axis: int): @@ -2002,23 +2001,23 @@ def _validate_key(self, key, axis: int): # check that the key has a numeric dtype if not is_numeric_dtype(arr.dtype): - raise IndexError( - ".iloc requires numeric indexers, got {arr}".format(arr=arr) - ) + raise IndexError(f".iloc requires numeric indexers, got {arr}") # check that the key does not exceed the maximum size of the index if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis): raise IndexError("positional indexers are out-of-bounds") else: - raise ValueError( - "Can only index by location with " - "a [{types}]".format(types=self._valid_types) - ) + raise ValueError(f"Can only index by location with a [{self._valid_types}]") def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) - def _is_scalar_access(self, key: Tuple): + def _is_scalar_access(self, key: Tuple) -> bool: + """ + Returns + ------- + bool + """ # this is a shortcut accessor to both .loc and .iloc # that provide the equivalent access of .at and .iat # a) avoid getting things via sections and (to minimize dtype changes) @@ -2042,27 +2041,22 @@ def _getitem_scalar(self, key): values = self.obj._get_value(*key, takeable=True) return values - def _validate_integer(self, key: int, axis: int): + def _validate_integer(self, key: int, axis: int) -> None: """ Check that 'key' is a valid position in the desired axis. Parameters ---------- key : int - Requested position + Requested position. axis : int - Desired axis - - Returns - ------- - None + Desired axis. Raises ------ IndexError - If 'key' is not a valid position in axis 'axis' + If 'key' is not a valid position in axis 'axis'. """ - len_axis = len(self.obj._get_axis(axis)) if key >= len_axis or key < -len_axis: raise IndexError("single positional indexer is out-of-bounds") @@ -2097,16 +2091,20 @@ def _getitem_tuple(self, tup: Tuple): def _get_list_axis(self, key, axis: int): """ - Return Series values by list or array of integers + Return Series values by list or array of integers. Parameters ---------- key : list-like positional indexer - axis : int (can only be zero) + axis : int Returns ------- Series object + + Notes + ----- + `axis` can only be zero. """ try: return self.obj.take(key, axis=axis) @@ -2142,8 +2140,9 @@ def _getitem_axis(self, key, axis: int): # raise_missing is included for compat with the parent class signature def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): - """ much simpler as we only have to deal with our valid types """ - + """ + Much simpler as we only have to deal with our valid types. + """ # make need to convert a float key if isinstance(obj, slice): return self._convert_slice_indexer(obj, axis) @@ -2155,14 +2154,13 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): self._validate_key(obj, axis) return obj except ValueError: - raise ValueError( - "Can only index by location with " - "a [{types}]".format(types=self._valid_types) - ) + raise ValueError(f"Can only index by location with a [{self._valid_types}]") class _ScalarAccessIndexer(_NDFrameIndexerBase): - """ access scalars quickly """ + """ + Access scalars quickly. 
+ """ def _convert_key(self, key, is_setter: bool = False): raise AbstractMethodError(self) @@ -2195,60 +2193,15 @@ def __setitem__(self, key, value): self.obj._set_value(*key, takeable=self._takeable) +@Appender(IndexingMixin.at.__doc__) class _AtIndexer(_ScalarAccessIndexer): - """ - Access a single value for a row/column label pair. - - Similar to ``loc``, in that both provide label-based lookups. Use - ``at`` if you only need to get or set a single value in a DataFrame - or Series. - - Raises - ------ - KeyError - When label does not exist in DataFrame - - See Also - -------- - DataFrame.iat : Access a single value for a row/column pair by integer - position. - DataFrame.loc : Access a group of rows and columns by label(s). - Series.at : Access a single value using a label. - - Examples - -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... index=[4, 5, 6], columns=['A', 'B', 'C']) - >>> df - A B C - 4 0 2 3 - 5 0 4 1 - 6 10 20 30 - - Get value at specified row/column pair - - >>> df.at[4, 'B'] - 2 - - Set value at specified row/column pair - - >>> df.at[4, 'B'] = 10 - >>> df.at[4, 'B'] - 10 - - Get value within a Series - - >>> df.loc[5].at['B'] - 4 - """ - _takeable = False def _convert_key(self, key, is_setter: bool = False): - """ require they keys to be the same type as the index (so we don't + """ + Require they keys to be the same type as the index. (so we don't fallback) """ - # allow arbitrary setting if is_setter: return list(key) @@ -2270,63 +2223,21 @@ def _convert_key(self, key, is_setter: bool = False): return key +@Appender(IndexingMixin.iat.__doc__) class _iAtIndexer(_ScalarAccessIndexer): - """ - Access a single value for a row/column pair by integer position. - - Similar to ``iloc``, in that both provide integer-based lookups. Use - ``iat`` if you only need to get or set a single value in a DataFrame - or Series. - - Raises - ------ - IndexError - When integer position is out of bounds - - See Also - -------- - DataFrame.at : Access a single value for a row/column label pair. - DataFrame.loc : Access a group of rows and columns by label(s). - DataFrame.iloc : Access a group of rows and columns by integer position(s). - - Examples - -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... columns=['A', 'B', 'C']) - >>> df - A B C - 0 0 2 3 - 1 0 4 1 - 2 10 20 30 - - Get value at specified row/column pair - - >>> df.iat[1, 2] - 1 - - Set value at specified row/column pair - - >>> df.iat[1, 2] = 10 - >>> df.iat[1, 2] - 10 - - Get value within a series - - >>> df.loc[0].iat[1] - 2 - """ - _takeable = True def _convert_key(self, key, is_setter: bool = False): - """ require integer args (and convert to label arguments) """ + """ + Require integer args. (and convert to label arguments) + """ for a, i in zip(self.obj.axes, key): if not is_integer(i): raise ValueError("iAt based indexing can only have integer indexers") return key -def _tuplify(ndim: int, loc) -> tuple: +def _tuplify(ndim: int, loc: Hashable) -> Tuple[Union[Hashable, slice], ...]: """ Given an indexer for the first dimension, create an equivalent tuple for indexing over all dimensions. 
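A minimal usage sketch of the key contracts enforced by the two _convert_key implementations above, using a made-up frame and only the public pandas API (.at routes through _AtIndexer, .iat through _iAtIndexer); not part of the patch itself:

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]}, index=["x", "y", "z"])

# label-based scalar access: the key is kept as a label and must exist in the index
assert df.at["y", "A"] == 2

# position-based scalar access: _convert_key only accepts integer positions here
assert df.iat[1, 0] == 2

# df.iat["y", 0] would raise ValueError("iAt based indexing can only have integer indexers")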
@@ -2340,14 +2251,15 @@ def _tuplify(ndim: int, loc) -> tuple: ------- tuple """ - tup = [slice(None, None) for _ in range(ndim)] - tup[0] = loc - return tuple(tup) + _tup: List[Union[Hashable, slice]] + _tup = [slice(None, None) for _ in range(ndim)] + _tup[0] = loc + return tuple(_tup) def convert_to_index_sliceable(obj, key): """ - if we are index sliceable, then return my slicer, otherwise return None + If we are index sliceable, then return my slicer, otherwise return None. """ idx = obj.index if isinstance(key, slice): @@ -2380,23 +2292,21 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: Parameters ---------- index : Index - Index of the object on which the indexing is done + Index of the object on which the indexing is done. key : list-like - Boolean indexer to check + Boolean indexer to check. Returns ------- - result: np.array - Resulting key + np.array + Resulting key. Raises ------ IndexError - If the key does not have the same length as index - + If the key does not have the same length as index. IndexingError - If the index of the key is unalignable to index - + If the index of the key is unalignable to index. """ result = key if isinstance(key, ABCSeries) and not key.index.equals(index): @@ -2412,23 +2322,16 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: else: if is_sparse(result): result = result.to_dense() - result = np.asarray(result, dtype=bool) - - # GH26658 - if len(result) != len(index): - raise IndexError( - "Item wrong length {} instead of {}.".format(len(result), len(index)) - ) + result = check_bool_array_indexer(index, result) return result def convert_missing_indexer(indexer): """ - reverse convert a missing indexer, which is a dict + Reverse convert a missing indexer, which is a dict return the scalar indexer and a boolean indicating if we converted """ - if isinstance(indexer, dict): # a missing key (but not a tuple indexer) @@ -2443,7 +2346,7 @@ def convert_missing_indexer(indexer): def convert_from_missing_indexer_tuple(indexer, axes): """ - create a filtered indexer that doesn't have any missing indexers + Create a filtered indexer that doesn't have any missing indexers. """ def get_indexer(_i, _idx): @@ -2454,9 +2357,8 @@ def get_indexer(_i, _idx): def maybe_convert_ix(*args): """ - We likely want to take the cross-product + We likely want to take the cross-product. """ - ixify = True for arg in args: if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)): @@ -2468,7 +2370,12 @@ def maybe_convert_ix(*args): return args -def is_nested_tuple(tup, labels): +def is_nested_tuple(tup, labels) -> bool: + """ + Returns + ------- + bool + """ # check for a compatible nested tuple and multiindexes among the axes if not isinstance(tup, tuple): return False @@ -2481,12 +2388,22 @@ def is_nested_tuple(tup, labels): return False -def is_label_like(key): +def is_label_like(key) -> bool: + """ + Returns + ------- + bool + """ # select a label or row return not isinstance(key, slice) and not is_list_like_indexer(key) -def need_slice(obj): +def need_slice(obj) -> bool: + """ + Returns + ------- + bool + """ return ( obj.start is not None or obj.stop is not None @@ -2507,7 +2424,14 @@ def _non_reducing_slice(slice_): if isinstance(slice_, kinds): slice_ = IndexSlice[:, slice_] - def pred(part): + def pred(part) -> bool: + """ + Returns + ------- + bool + True if slice does *not* reduce, + False if `part` is a tuple. + """ # true when slice does *not* reduce, False when part is a tuple, # i.e. 
MultiIndex slice return (isinstance(part, slice) or is_list_like(part)) and not isinstance( @@ -2528,7 +2452,7 @@ def pred(part): def _maybe_numeric_slice(df, slice_, include_bool=False): """ - want nice defaults for background_gradient that don't break + Want nice defaults for background_gradient that don't break with non-numeric data. But if slice_ is passed go with that. """ if slice_ is None: @@ -2539,8 +2463,13 @@ def _maybe_numeric_slice(df, slice_, include_bool=False): return slice_ -def _can_do_equal_len(labels, value, plane_indexer, lplane_indexer, obj): - """ return True if we have an equal len settable """ +def _can_do_equal_len(labels, value, plane_indexer, lplane_indexer, obj) -> bool: + """ + Returns + ------- + bool + True if we have an equal len settable. + """ if not len(labels) == 1 or not np.iterable(value) or is_scalar(plane_indexer[0]): return False diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 8ac0df2fa4e0a..37a3405554745 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,4 +1,4 @@ -from .blocks import ( # noqa: F401 +from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, BoolBlock, CategoricalBlock, @@ -10,19 +10,38 @@ IntBlock, ObjectBlock, TimeDeltaBlock, + _block_shape, + _safe_reshape, + make_block, ) -from .managers import ( # noqa: F401 +from pandas.core.internals.managers import ( BlockManager, SingleBlockManager, - create_block_manager_from_arrays, - create_block_manager_from_blocks, -) - -from .blocks import _safe_reshape # noqa: F401; io.packers -from .blocks import make_block # noqa: F401; io.pytables, io.packers -from .managers import ( # noqa: F401; reshape.concat, reshape.merge _transform_index, concatenate_block_managers, + create_block_manager_from_arrays, + create_block_manager_from_blocks, ) -from .blocks import _block_shape # noqa:F401; io.pytables +__all__ = [ + "Block", + "BoolBlock", + "CategoricalBlock", + "ComplexBlock", + "DatetimeBlock", + "DatetimeTZBlock", + "ExtensionBlock", + "FloatBlock", + "IntBlock", + "ObjectBlock", + "TimeDeltaBlock", + "_safe_reshape", + "make_block", + "_block_shape", + "BlockManager", + "SingleBlockManager", + "_transform_index", + "concatenate_block_managers", + "create_block_manager_from_arrays", + "create_block_manager_from_blocks", +] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 51108d9a5a573..f74033924f64e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import NaT, lib, tslib, writers +from pandas._libs import NaT, algos as libalgos, lib, tslib, writers from pandas._libs.index import convert_scalar import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion @@ -37,7 +37,6 @@ is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, - is_extension_type, is_float_dtype, is_integer, is_integer_dtype, @@ -116,8 +115,8 @@ def __init__(self, values, placement, ndim=None): if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): raise ValueError( - "Wrong number of items passed {val}, placement implies " - "{mgr}".format(val=len(self.values), mgr=len(self.mgr_locs)) + f"Wrong number of items passed {len(self.values)}, " + f"placement implies {len(self.mgr_locs)}" ) def _check_ndim(self, values, ndim): @@ -145,9 +144,10 @@ def _check_ndim(self, values, ndim): ndim = values.ndim if self._validate_ndim and values.ndim 
!= ndim: - msg = "Wrong number of dimensions. values.ndim != ndim [{} != {}]" - raise ValueError(msg.format(values.ndim, ndim)) - + raise ValueError( + "Wrong number of dimensions. " + f"values.ndim != ndim [{values.ndim} != {ndim}]" + ) return ndim @property @@ -185,7 +185,7 @@ def is_categorical_astype(self, dtype): if dtype is Categorical or dtype is CategoricalDtype: # this is a pd.Categorical, but is not # a valid type for astypeing - raise TypeError("invalid type {0} for astype".format(dtype)) + raise TypeError(f"invalid type {dtype} for astype") elif is_categorical_dtype(dtype): return True @@ -242,7 +242,7 @@ def array_dtype(self): """ return self.dtype - def make_block(self, values, placement=None): + def make_block(self, values, placement=None) -> "Block": """ Create a new block, with type inference propagate any values that are not specified @@ -252,44 +252,32 @@ def make_block(self, values, placement=None): return make_block(values, placement=placement, ndim=self.ndim) - def make_block_same_class(self, values, placement=None, ndim=None, dtype=None): + def make_block_same_class(self, values, placement=None, ndim=None): """ Wrap given values in a block of same type as self. """ - if dtype is not None: - # issue 19431 fastparquet is passing this - warnings.warn( - "dtype argument is deprecated, will be removed in a future release.", - FutureWarning, - ) if placement is None: placement = self.mgr_locs if ndim is None: ndim = self.ndim - return make_block( - values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype - ) + return make_block(values, placement=placement, ndim=ndim, klass=type(self)) - def __repr__(self): + def __repr__(self) -> str: # don't want to print out all of the items here - name = pprint_thing(self.__class__.__name__) + name = type(self).__name__ if self._is_single_block: - result = "{name}: {len} dtype: {dtype}".format( - name=name, len=len(self), dtype=self.dtype - ) + result = f"{name}: {len(self)} dtype: {self.dtype}" else: shape = " x ".join(pprint_thing(s) for s in self.shape) - result = "{name}: {index}, {shape}, dtype: {dtype}".format( - name=name, - index=pprint_thing(self.mgr_locs.indexer), - shape=shape, - dtype=self.dtype, + result = ( + f"{name}: {pprint_thing(self.mgr_locs.indexer)}, " + f"{shape}, dtype: {self.dtype}" ) return result - def __len__(self): + def __len__(self) -> int: return len(self.values) def __getstate__(self): @@ -338,7 +326,7 @@ def ftype(self): dtype = self.dtype.subtype else: dtype = self.dtype - return "{dtype}:{ftype}".format(dtype=dtype, ftype=self._ftype) + return f"{dtype}:{self._ftype}" def merge(self, other): return _merge_blocks([self, other]) @@ -380,6 +368,17 @@ def apply(self, func, **kwargs): """ with np.errstate(all="ignore"): result = func(self.values, **kwargs) + + if is_extension_array_dtype(result) and result.ndim > 1: + # if we get a 2D ExtensionArray, we need to split it into 1D pieces + nbs = [] + for i, loc in enumerate(self.mgr_locs): + vals = result[i] + nv = _block_shape(vals, ndim=self.ndim) + block = self.make_block(values=nv, placement=[loc]) + nbs.append(block) + return nbs + if not isinstance(result, Block): result = self.make_block(values=_block_shape(result, ndim=self.ndim)) @@ -393,10 +392,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): mask = isna(self.values) if limit is not None: - if not is_integer(limit): - raise ValueError("Limit must be an integer") - if limit < 1: - raise ValueError("Limit must be greater than 0") + limit = 
libalgos._validate_limit(None, limit=limit) mask[mask.cumsum(self.ndim - 1) > limit] = False if not self._can_hold_na: @@ -535,16 +531,14 @@ def f(mask, val, idx): return self.split_and_operate(None, f, False) - def astype(self, dtype, copy=False, errors="raise", **kwargs): - return self._astype(dtype, copy=copy, errors=errors, **kwargs) - - def _astype(self, dtype, copy=False, errors="raise", **kwargs): - """Coerce to the new type + def astype(self, dtype, copy: bool = False, errors: str = "raise"): + """ + Coerce to the new dtype. Parameters ---------- dtype : str, dtype convertible - copy : boolean, default False + copy : bool, default False copy if indicated errors : str, {'raise', 'ignore'}, default 'ignore' - ``raise`` : allow exceptions to be raised @@ -558,15 +552,15 @@ def _astype(self, dtype, copy=False, errors="raise", **kwargs): if errors not in errors_legal_values: invalid_arg = ( - "Expected value of kwarg 'errors' to be one of {}. " - "Supplied value is '{}'".format(list(errors_legal_values), errors) + "Expected value of kwarg 'errors' to be one of " + f"{list(errors_legal_values)}. Supplied value is '{errors}'" ) raise ValueError(invalid_arg) if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): msg = ( - "Expected an instance of {}, but got the class instead. " - "Try instantiating 'dtype'.".format(dtype.__name__) + f"Expected an instance of {dtype.__name__}, " + "but got the class instead. Try instantiating 'dtype'." ) raise TypeError(msg) @@ -627,15 +621,9 @@ def _astype(self, dtype, copy=False, errors="raise", **kwargs): if newb.is_numeric and self.is_numeric: if newb.shape != self.shape: raise TypeError( - "cannot set astype for copy = [{copy}] for dtype " - "({dtype} [{shape}]) to different shape " - "({newb_dtype} [{newb_shape}])".format( - copy=copy, - dtype=self.dtype.name, - shape=self.shape, - newb_dtype=newb.dtype.name, - newb_shape=newb.shape, - ) + f"cannot set astype for copy = [{copy}] for dtype " + f"({self.dtype.name} [{self.shape}]) to different shape " + f"({newb.dtype.name} [{newb.shape}])" ) return newb @@ -669,10 +657,10 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): if slicer is not None: values = values[:, slicer] mask = isna(values) + itemsize = writers.word_len(na_rep) - if not self.is_object and not quoting: - itemsize = writers.word_len(na_rep) - values = values.astype(" bool: tipo = maybe_infer_dtype_type(element) @@ -2326,7 +2312,7 @@ def _slice(self, slicer): if isinstance(slicer, tuple): col, loc = slicer if not com.is_null_slice(col) and col != 0: - raise IndexError("{0} only contains one item".format(self)) + raise IndexError(f"{self} only contains one item") return self.values[loc] return self.values[slicer] @@ -2456,15 +2442,11 @@ def fillna(self, value, **kwargs): # interpreted as nanoseconds if is_integer(value): # Deprecation GH#24694, GH#19233 - warnings.warn( - "Passing integers to fillna is deprecated, will " - "raise a TypeError in a future version. To retain " - "the old behavior, pass pd.Timedelta(seconds=n) " - "instead.", - FutureWarning, - stacklevel=6, + raise TypeError( + "Passing integers to fillna for timedelta64[ns] dtype is no " + "longer supported. To obtain the old behavior, pass " + "`pd.Timedelta(seconds=n)` instead." 
) - value = Timedelta(value, unit="s") return super().fillna(value, **kwargs) def should_store(self, value): @@ -2608,10 +2590,6 @@ def should_store(self, value): value.dtype.type, (np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_), ) - or - # TODO(ExtensionArray): remove is_extension_type - # when all extension arrays have been ported. - is_extension_type(value) or is_extension_array_dtype(value) ) @@ -2833,6 +2811,8 @@ def _replace_coerce( if convert: block = [b.convert(numeric=False, copy=True) for b in block] return block + if convert: + return [self.convert(numeric=False, copy=True)] return self @@ -2901,36 +2881,29 @@ def concat_same_type(self, to_concat, placement=None): values, placement=placement or slice(0, len(values), 1), ndim=self.ndim ) - def where( + def replace( self, - other, - cond, - align=True, - errors="raise", - try_cast: bool = False, - axis: int = 0, - ) -> List["Block"]: - # TODO(CategoricalBlock.where): - # This can all be deleted in favor of ExtensionBlock.where once - # we enforce the deprecation. - object_msg = ( - "Implicitly converting categorical to object-dtype ndarray. " - "One or more of the values in 'other' are not present in this " - "categorical's categories. A future version of pandas will raise " - "a ValueError when 'other' contains different categories.\n\n" - "To preserve the current behavior, add the new categories to " - "the categorical before calling 'where', or convert the " - "categorical to a different dtype." - ) - try: - # Attempt to do preserve categorical dtype. - result = super().where(other, cond, align, errors, try_cast, axis) - except (TypeError, ValueError): - warnings.warn(object_msg, FutureWarning, stacklevel=6) - result = self.astype(object).where( - other, cond, align=align, errors=errors, try_cast=try_cast, axis=axis + to_replace, + value, + inplace: bool = False, + filter=None, + regex: bool = False, + convert: bool = True, + ): + inplace = validate_bool_kwarg(inplace, "inplace") + result = self if inplace else self.copy() + if filter is None: # replace was called on a series + result.values.replace(to_replace, value, inplace=True) + if convert: + return result.convert(numeric=False, copy=not inplace) + else: + return result + else: # replace was called on a DataFrame + if not isna(value): + result.values.add_categories(value, inplace=True) + return super(CategoricalBlock, result).replace( + to_replace, value, inplace, filter, regex, convert ) - return result # ----------------------------------------------------------------- @@ -2983,7 +2956,7 @@ def get_block_type(values, dtype=None): return cls -def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=None): +def make_block(values, placement, klass=None, ndim=None, dtype=None): # Ensure that we don't allow PandasArray / PandasDtype in internals. # For now, blocks should be backed by ndarrays when possible. 
if isinstance(values, ABCPandasArray): @@ -2994,12 +2967,6 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=No if isinstance(dtype, PandasDtype): dtype = dtype.numpy_dtype - if fastpath is not None: - # GH#19265 pyarrow is passing this - warnings.warn( - "fastpath argument is deprecated, will be removed in a future release.", - FutureWarning, - ) if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) @@ -3016,7 +2983,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=No def _extend_blocks(result, blocks=None): - """ return a new extended blocks, givin the result """ + """ return a new extended blocks, given the result """ from pandas.core.internals import BlockManager if blocks is None: @@ -3057,7 +3024,6 @@ def _merge_blocks(blocks, dtype=None, _can_consolidate=True): if dtype is None: if len({b.dtype for b in blocks}) != 1: raise AssertionError("_merge_blocks are invalid!") - dtype = blocks[0].dtype # FIXME: optimization potential in case all mgrs contain slices and # combination of those slices is a slice, too. @@ -3171,7 +3137,7 @@ def _putmask_preserve(nv, n): # change the dtype if needed dtype, _ = maybe_promote(n.dtype) - if is_extension_type(v.dtype) and is_object_dtype(dtype): + if is_extension_array_dtype(v.dtype) and is_object_dtype(dtype): v = v._internal_get_values(dtype) else: v = v.astype(dtype) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 36e1b06230d7e..c6f30ef65e9d5 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -120,10 +120,8 @@ def __init__(self, block, shape, indexers=None): self.indexers = indexers self.shape = shape - def __repr__(self): - return "{name}({block!r}, {indexers})".format( - name=self.__class__.__name__, block=self.block, indexers=self.indexers - ) + def __repr__(self) -> str: + return f"{type(self).__name__}({repr(self.block)}, {self.indexers})" @cache_readonly def needs_filling(self): @@ -197,7 +195,6 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): return array( np.full(self.shape[1], fill_value.value), dtype=empty_dtype ) - pass elif getattr(self.block, "is_categorical", False): pass elif getattr(self.block, "is_extension", False): @@ -244,7 +241,7 @@ def concatenate_join_units(join_units, concat_axis, copy): # Concatenating join units along ax0 is handled in _merge_blocks. raise AssertionError("Concatenating join units along axis0") - empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) + empty_dtype, upcasted_na = _get_empty_dtype_and_na(join_units) to_concat = [ ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) @@ -268,7 +265,7 @@ def concatenate_join_units(join_units, concat_axis, copy): return concat_values -def get_empty_dtype_and_na(join_units): +def _get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. @@ -284,7 +281,7 @@ def get_empty_dtype_and_na(join_units): if blk is None: return np.float64, np.nan - if is_uniform_reindex(join_units): + if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value @@ -398,7 +395,7 @@ def is_uniform_join_units(join_units): ) -def is_uniform_reindex(join_units): +def _is_uniform_reindex(join_units) -> bool: return ( # TODO: should this be ju.block._can_hold_na? 
all(ju.block and ju.block.is_extension for ju in join_units) @@ -406,7 +403,7 @@ def is_uniform_reindex(join_units): ) -def trim_join_unit(join_unit, length): +def _trim_join_unit(join_unit, length): """ Reduce join_unit's shape along item axis to length. @@ -486,9 +483,9 @@ def _next_or_none(seq): for i, (plc, unit) in enumerate(next_items): yielded_units[i] = unit if len(plc) > min_len: - # trim_join_unit updates unit in place, so only + # _trim_join_unit updates unit in place, so only # placement needs to be sliced to skip min_len. - next_items[i] = (plc[min_len:], trim_join_unit(unit, min_len)) + next_items[i] = (plc[min_len:], _trim_join_unit(unit, min_len)) else: yielded_placement = plc next_items[i] = _next_or_none(plans[i]) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 176f4acd113fe..3a92cfd9bf16d 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -2,14 +2,12 @@ Functions for preparing various inputs passed to the DataFrame or Series constructors before passing them to a BlockManager. """ -from collections import OrderedDict, abc +from collections import abc import numpy as np import numpy.ma as ma from pandas._libs import lib -import pandas.compat as compat -from pandas.compat import PY36 from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -39,13 +37,13 @@ from pandas.core import algorithms, common as com from pandas.core.arrays import Categorical from pandas.core.construction import sanitize_array -from pandas.core.index import ( +from pandas.core.indexes import base as ibase +from pandas.core.indexes.api import ( Index, - _get_objs_combined_axis, - _union_indexes, ensure_index, + get_objs_combined_axis, + union_indexes, ) -from pandas.core.indexes import base as ibase from pandas.core.internals import ( create_block_manager_from_arrays, create_block_manager_from_blocks, @@ -97,6 +95,9 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy): # fill if needed new_arrays = [] for fv, arr, col in zip(fill_value, arrays, arr_columns): + # TODO: numpy docs suggest fv must be scalar, but could it be + # non-scalar for object dtype? + assert lib.is_scalar(fv), fv mask = ma.getmaskarray(data[col]) if mask.any(): arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) @@ -149,11 +150,19 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): index, columns = _get_axes(len(values), 1, index, columns) return arrays_to_mgr([values], columns, index, columns, dtype=dtype) - elif is_extension_array_dtype(values): + elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype): # GH#19157 + + if isinstance(values, np.ndarray) and values.ndim > 1: + # GH#12513 a EA dtype passed with a 2D array, split into + # multiple EAs that view the values + values = [values[:, n] for n in range(values.shape[1])] + else: + values = [values] + if columns is None: - columns = [0] - return arrays_to_mgr([values], columns, index, columns, dtype=dtype) + columns = list(range(len(values))) + return arrays_to_mgr(values, columns, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype @@ -164,9 +173,9 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): try: values = values.astype(dtype) except Exception as orig: + # e.g. 
ValueError when trying to cast object dtype to float64 raise ValueError( - "failed to cast to '{dtype}' (Exception " - "was: {orig})".format(dtype=dtype, orig=orig) + f"failed to cast to '{dtype}' (Exception was: {orig})" ) from orig index, columns = _get_axes(*values.shape, index=index, columns=columns) @@ -231,7 +240,7 @@ def init_dict(data, index, columns, dtype=None): arrays.loc[missing] = [val] * missing.sum() else: - keys = com.dict_keys_to_ordered_list(data) + keys = list(data.keys()) columns = data_names = Index(keys) arrays = (com.maybe_iterable_to_list(data[k]) for k in keys) # GH#24096 need copy to be deep for datetime64tz case @@ -248,10 +257,13 @@ def init_dict(data, index, columns, dtype=None): # --------------------------------------------------------------------- -def prep_ndarray(values, copy=True): +def prep_ndarray(values, copy=True) -> np.ndarray: if not isinstance(values, (np.ndarray, ABCSeries, Index)): if len(values) == 0: return np.empty((0, 0), dtype=object) + elif isinstance(values, range): + arr = np.arange(values.start, values.stop, values.step, dtype="int64") + return arr[..., np.newaxis] def convert(v): return maybe_convert_platform(v) @@ -327,7 +339,6 @@ def extract_index(data): have_raw_arrays = False have_series = False have_dicts = False - have_ordered = False for val in data: if isinstance(val, ABCSeries): @@ -335,8 +346,6 @@ def extract_index(data): indexes.append(val.index) elif isinstance(val, dict): have_dicts = True - if isinstance(val, OrderedDict): - have_ordered = True indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True @@ -346,9 +355,9 @@ def extract_index(data): raise ValueError("If using all scalar values, you must pass an index") if have_series: - index = _union_indexes(indexes) + index = union_indexes(indexes) elif have_dicts: - index = _union_indexes(indexes, sort=not (compat.PY36 or have_ordered)) + index = union_indexes(indexes, sort=False) if have_raw_arrays: lengths = list(set(raw_lengths)) @@ -363,8 +372,8 @@ def extract_index(data): if have_series: if lengths[0] != len(index): msg = ( - "array length {length} does not match index " - "length {idx_len}".format(length=lengths[0], idx_len=len(index)) + f"array length {lengths[0]} does not match index " + f"length {len(index)}" ) raise ValueError(msg) else: @@ -399,7 +408,7 @@ def get_names_from_index(data): if n is not None: index[i] = n else: - index[i] = "Unnamed {count}".format(count=count) + index[i] = f"Unnamed {count}" count += 1 return index @@ -494,7 +503,9 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: - columns = _get_objs_combined_axis(data, sort=False) + # We know pass_data is non-empty because data[0] is a Series + pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))] + columns = get_objs_combined_axis(pass_data, sort=False) indexer_cache = {} @@ -527,7 +538,7 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): """Convert list of dicts to numpy arrays if `columns` is not passed, column names are inferred from the records - - for OrderedDict and (on Python>=3.6) dicts, the column names match + - for OrderedDict and dicts, the column names match the key insertion-order from the first record to the last. - For other kinds of dict-likes, the keys are lexically sorted. 
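A quick sketch of the column-inference rule described in the docstring above, assuming a reasonably recent pandas and made-up records; for plain dicts the column names follow key insertion order from the first record to the last:

import pandas as pd

records = [{"b": 1, "a": 2}, {"b": 3, "a": 4, "c": 5}]
df = pd.DataFrame(records)
print(list(df.columns))  # ['b', 'a', 'c'] -- insertion order, not lexically sorted
print(df.loc[0, "c"])    # nan, filled in for the record that lacks 'c'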
@@ -544,10 +555,10 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): tuple arrays, columns """ + if columns is None: gen = (list(x.keys()) for x in data) - types = (dict, OrderedDict) if PY36 else OrderedDict - sort = not any(isinstance(d, types) for d in data) + sort = not any(isinstance(d, dict) for d in data) columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) # assure that they are of the base dict class and not of derived @@ -567,8 +578,8 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): if len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... raise AssertionError( - "{col:d} columns passed, passed data had " - "{con} columns".format(col=len(columns), con=len(content)) + f"{len(columns)} columns passed, passed data had " + f"{len(content)} columns" ) # provide soft conversion of object dtypes diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c47aaf7c773c4..066689b3e374e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -20,7 +20,6 @@ _NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype, - is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar, @@ -33,12 +32,9 @@ import pandas.core.algorithms as algos from pandas.core.base import PandasObject -from pandas.core.index import Index, MultiIndex, ensure_index from pandas.core.indexers import maybe_convert_indices - -from pandas.io.formats.printing import pprint_thing - -from .blocks import ( +from pandas.core.indexes.api import Index, MultiIndex, ensure_index +from pandas.core.internals.blocks import ( Block, CategoricalBlock, DatetimeTZBlock, @@ -50,13 +46,15 @@ get_block_type, make_block, ) -from .concat import ( # all for concatenate_block_managers +from pandas.core.internals.concat import ( # all for concatenate_block_managers combine_concat_plans, concatenate_join_units, get_mgr_concatenation_plan, is_uniform_join_units, ) +from pandas.io.formats.printing import pprint_thing + # TODO: flexible with index=None and/or items=None @@ -82,9 +80,7 @@ class BlockManager(PandasObject): copy(deep=True) get_dtype_counts - get_ftype_counts get_dtypes - get_ftypes apply(func, axes, block_filter_fn) @@ -130,13 +126,13 @@ def __init__( do_integrity_check: bool = True, ): self.axes = [ensure_index(ax) for ax in axes] - self.blocks = tuple(blocks) # type: Tuple[Block, ...] + self.blocks: Tuple[Block, ...] 
= tuple(blocks) for block in blocks: if self.ndim != block.ndim: raise AssertionError( - "Number of Block dimensions ({block}) must equal " - "number of axes ({self})".format(block=block.ndim, self=self.ndim) + f"Number of Block dimensions ({block.ndim}) must equal " + f"number of axes ({self.ndim})" ) if do_integrity_check: @@ -156,7 +152,7 @@ def make_empty(self, axes=None): blocks = np.array([], dtype=self.array_dtype) else: blocks = [] - return self.__class__(blocks, axes) + return type(self)(blocks, axes) def __nonzero__(self): return True @@ -169,7 +165,7 @@ def shape(self): return tuple(len(ax) for ax in self.axes) @property - def ndim(self): + def ndim(self) -> int: return len(self.axes) def set_axis(self, axis, new_labels): @@ -179,8 +175,8 @@ def set_axis(self, axis, new_labels): if new_len != old_len: raise ValueError( - "Length mismatch: Expected axis has {old} elements, new " - "values have {new} elements".format(old=old_len, new=new_len) + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" ) self.axes[axis] = new_labels @@ -249,21 +245,14 @@ def _get_counts(self, f): def get_dtype_counts(self): return self._get_counts(lambda b: b.dtype.name) - def get_ftype_counts(self): - return self._get_counts(lambda b: b.ftype) - def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) return algos.take_1d(dtypes, self._blknos, allow_fill=False) - def get_ftypes(self): - ftypes = np.array([blk.ftype for blk in self.blocks]) - return algos.take_1d(ftypes, self._blknos, allow_fill=False) - def __getstate__(self): block_values = [b.values for b in self.blocks] block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] - axes_array = [ax for ax in self.axes] + axes_array = list(self.axes) extra_state = { "0.14.1": { @@ -322,19 +311,19 @@ def _post_setstate(self): self._known_consolidated = False self._rebuild_blknos_and_blklocs() - def __len__(self): + def __len__(self) -> int: return len(self.items) - def __repr__(self): - output = pprint_thing(self.__class__.__name__) + def __repr__(self) -> str: + output = type(self).__name__ for i, ax in enumerate(self.axes): if i == 0: - output += "\nItems: {ax}".format(ax=ax) + output += f"\nItems: {ax}" else: - output += "\nAxis {i}: {ax}".format(i=i, ax=ax) + output += f"\nAxis {i}: {ax}" for block in self.blocks: - output += "\n{block}".format(block=pprint_thing(block)) + output += f"\n{pprint_thing(block)}" return output def _verify_integrity(self): @@ -346,37 +335,50 @@ def _verify_integrity(self): if len(self.items) != tot_items: raise AssertionError( "Number of manager items must equal union of " - "block items\n# manager items: {0}, # " - "tot_items: {1}".format(len(self.items), tot_items) + f"block items\n# manager items: {len(self.items)}, # " + f"tot_items: {tot_items}" ) - def apply( - self, - f, - axes=None, - filter=None, - do_integrity_check=False, - consolidate=True, - **kwargs - ): + def reduce(self, func, *args, **kwargs): + # If 2D, we assume that we're operating column-wise + if self.ndim == 1: + # we'll be returning a scalar + blk = self.blocks[0] + return func(blk.values, *args, **kwargs) + + res = {} + for blk in self.blocks: + bres = func(blk.values, *args, **kwargs) + + if np.ndim(bres) == 0: + # EA + assert blk.shape[0] == 1 + new_res = zip(blk.mgr_locs.as_array, [bres]) + else: + assert bres.ndim == 1, bres.shape + assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs) + new_res = zip(blk.mgr_locs.as_array, bres) + + nr = dict(new_res) 
+ assert not any(key in res for key in nr) + res.update(nr) + + return res + + def apply(self, f, filter=None, **kwargs): """ - iterate over the blocks, collect and create a new block manager + Iterate over the blocks, collect and create a new BlockManager. Parameters ---------- - f : the callable or function name to operate on at the block level - axes : optional (if not supplied, use self.axes) + f : str or callable + Name of the Block method to apply. filter : list, if supplied, only call the block if the filter is in the block - do_integrity_check : boolean, default False. Do the block manager - integrity check - consolidate: boolean, default True. Join together blocks having same - dtype Returns ------- - Block Manager (new object) - + BlockManager """ result_blocks = [] @@ -390,8 +392,7 @@ def apply( else: kwargs["filter"] = filter_locs - if consolidate: - self._consolidate_inplace() + self._consolidate_inplace() if f == "where": align_copy = True @@ -435,15 +436,15 @@ def apply( axis = obj._info_axis_number kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) - applied = getattr(b, f)(**kwargs) + if callable(f): + applied = b.apply(f, **kwargs) + else: + applied = getattr(b, f)(**kwargs) result_blocks = _extend_blocks(applied, result_blocks) if len(result_blocks) == 0: - return self.make_empty(axes or self.axes) - bm = self.__class__( - result_blocks, axes or self.axes, do_integrity_check=do_integrity_check - ) - bm._consolidate_inplace() + return self.make_empty(self.axes) + bm = type(self)(result_blocks, self.axes, do_integrity_check=False) return bm def quantile( @@ -529,7 +530,7 @@ def get_axe(block, qs, axes): for b in blocks ] - return self.__class__(blocks, new_axes) + return type(self)(blocks, new_axes) # single block, i.e. ndim == {1} values = concat_compat([b.values for b in blocks]) @@ -550,8 +551,8 @@ def get_axe(block, qs, axes): [make_block(values, ndim=1, placement=np.arange(len(values)))], axes[0] ) - def isna(self, func, **kwargs): - return self.apply("apply", func=func, **kwargs) + def isna(self, func): + return self.apply("apply", func=func) def where(self, **kwargs): return self.apply("where", **kwargs) @@ -577,8 +578,8 @@ def fillna(self, **kwargs): def downcast(self, **kwargs): return self.apply("downcast", **kwargs) - def astype(self, dtype, **kwargs): - return self.apply("astype", dtype=dtype, **kwargs) + def astype(self, dtype, copy: bool = False, errors: str = "raise"): + return self.apply("astype", dtype=dtype, copy=copy, errors=errors) def convert(self, **kwargs): return self.apply("convert", **kwargs) @@ -632,14 +633,14 @@ def comp(s, regex=False): convert=convert, regex=regex, ) - if m.any(): + if m.any() or convert: new_rb = _extend_blocks(result, new_rb) else: new_rb.append(b) rb = new_rb result_blocks.extend(rb) - bm = self.__class__(result_blocks, self.axes) + bm = type(self)(result_blocks, self.axes) bm._consolidate_inplace() return bm @@ -734,32 +735,32 @@ def combine(self, blocks, copy=True): axes = list(self.axes) axes[0] = self.items.take(indexer) - return self.__class__(new_blocks, axes, do_integrity_check=False) + return type(self)(new_blocks, axes, do_integrity_check=False) - def get_slice(self, slobj, axis=0): + def get_slice(self, slobj: slice, axis: int = 0): if axis >= self.ndim: raise IndexError("Requested axis not found in manager") if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) else: - slicer = [slice(None)] * (axis + 1) - slicer[axis] = slobj - slicer = tuple(slicer) + _slicer = [slice(None)] * (axis + 1) + 
_slicer[axis] = slobj + slicer = tuple(_slicer) new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] new_axes = list(self.axes) new_axes[axis] = new_axes[axis][slobj] - bm = self.__class__(new_blocks, new_axes, do_integrity_check=False) + bm = type(self)(new_blocks, new_axes, do_integrity_check=False) bm._consolidate_inplace() return bm - def __contains__(self, item): + def __contains__(self, item) -> bool: return item in self.items @property - def nblocks(self): + def nblocks(self) -> int: return len(self.blocks) def copy(self, deep=True): @@ -768,24 +769,31 @@ def copy(self, deep=True): Parameters ---------- - deep : boolean o rstring, default True + deep : bool or string, default True If False, return shallow copy (do not copy data) If 'all', copy data and a deep copy of the index Returns ------- - copy : BlockManager + BlockManager """ # this preserves the notion of view copying of axes if deep: - if deep == "all": - copy = lambda ax: ax.copy(deep=True) - else: - copy = lambda ax: ax.view() - new_axes = [copy(ax) for ax in self.axes] + # hit in e.g. tests.io.json.test_pandas + + def copy_func(ax): + if deep == "all": + return ax.copy(deep=True) + else: + return ax.view() + + new_axes = [copy_func(ax) for ax in self.axes] else: new_axes = list(self.axes) - return self.apply("copy", axes=new_axes, deep=deep, do_integrity_check=False) + + res = self.apply("copy", deep=deep) + res.axes = new_axes + return res def as_array(self, transpose=False, items=None): """Convert the blockmanager data into an numpy array. @@ -927,7 +935,7 @@ def consolidate(self): if self.is_consolidated(): return self - bm = self.__class__(self.blocks, self.axes) + bm = type(self)(self.blocks, self.axes) bm._is_consolidated = False bm._consolidate_inplace() return bm @@ -1034,11 +1042,7 @@ def set(self, item, value): # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical - # TODO(EA): Remove an is_extension_ when all extension types satisfy - # the interface - value_is_extension_type = is_extension_type(value) or is_extension_array_dtype( - value - ) + value_is_extension_type = is_extension_array_dtype(value) # categorical/sparse/datetimetz if value_is_extension_type: @@ -1166,7 +1170,7 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False): """ if not allow_duplicates and item in self.items: # Should this be a different kind of error?? 
- raise ValueError("cannot insert {}, already exists".format(item)) + raise ValueError(f"cannot insert {item}, already exists") if not isinstance(loc, int): raise TypeError("loc must be int") @@ -1265,7 +1269,7 @@ def reindex_indexer( new_axes = list(self.axes) new_axes[axis] = new_axis - return self.__class__(new_blocks, new_axes) + return type(self)(new_blocks, new_axes) def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): """ @@ -1276,7 +1280,6 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): Returns ------- new_blocks : list of Block - """ allow_fill = fill_tuple is not None @@ -1402,12 +1405,12 @@ def equals(self, other): if len(self.blocks) != len(other.blocks): return False - # canonicalize block order, using a tuple combining the type - # name and then mgr_locs because there might be unconsolidated + # canonicalize block order, using a tuple combining the mgr_locs + # then type name because there might be unconsolidated # blocks (say, Categorical) which can only be distinguished by # the iteration order def canonicalize(block): - return (block.dtype.name, block.mgr_locs.as_array.tolist()) + return (block.mgr_locs.as_array.tolist(), block.dtype.name) self_blocks = sorted(self.blocks, key=canonicalize) other_blocks = sorted(other.blocks, key=canonicalize) @@ -1536,18 +1539,12 @@ def get_slice(self, slobj, axis=0): if axis >= self.ndim: raise IndexError("Requested axis not found in manager") - return self.__class__( - self._block._slice(slobj), self.index[slobj], fastpath=True - ) + return type(self)(self._block._slice(slobj), self.index[slobj], fastpath=True) @property def index(self): return self.axes[0] - def convert(self, **kwargs): - """ convert the whole block as one """ - return self.apply("convert", **kwargs) - @property def dtype(self): return self._block.dtype @@ -1556,22 +1553,12 @@ def dtype(self): def array_dtype(self): return self._block.array_dtype - @property - def ftype(self): - return self._block.ftype - def get_dtype_counts(self): return {self.dtype.name: 1} - def get_ftype_counts(self): - return {self.ftype: 1} - def get_dtypes(self): return np.array([self._block.dtype]) - def get_ftypes(self): - return np.array([self._block.ftype]) - def external_values(self): return self._block.external_values() @@ -1704,9 +1691,7 @@ def construction_error(tot_items, block_shape, axes, e=None): raise e if block_shape[0] == 0: raise ValueError("Empty data passed with indices specified.") - raise ValueError( - "Shape of passed values is {0}, indices imply {1}".format(passed, implied) - ) + raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") # ----------------------------------------------------------------------- @@ -1868,7 +1853,7 @@ def _shape_compat(x): def _interleaved_dtype( - blocks: List[Block] + blocks: List[Block], ) -> Optional[Union[np.dtype, ExtensionDtype]]: """Find the common dtype for `blocks`. 
@@ -1932,12 +1917,8 @@ def _compare_or_regex_search(a, b, regex=False): is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) - # numpy deprecation warning to have i8 vs integer comparisons - if is_datetimelike_v_numeric(a, b): - result = False - - # numpy deprecation warning if comparing numeric vs string-like - elif is_numeric_v_string_like(a, b): + if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): + # GH#29553 avoid deprecation warnings from numpy result = False else: result = op(a) @@ -1946,15 +1927,13 @@ def _compare_or_regex_search(a, b, regex=False): type_names = [type(a).__name__, type(b).__name__] if is_a_array: - type_names[0] = "ndarray(dtype={dtype})".format(dtype=a.dtype) + type_names[0] = f"ndarray(dtype={a.dtype})" if is_b_array: - type_names[1] = "ndarray(dtype={dtype})".format(dtype=b.dtype) + type_names[1] = f"ndarray(dtype={b.dtype})" raise TypeError( - "Cannot compare types {a!r} and {b!r}".format( - a=type_names[0], b=type_names[1] - ) + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" ) return result diff --git a/pandas/core/missing.py b/pandas/core/missing.py index f2655c126b9e5..b30a7a24f3495 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,6 +1,7 @@ """ Routines for filling missing data. """ + import numpy as np from pandas._libs import algos, lib @@ -11,7 +12,6 @@ ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, - is_integer, is_integer_dtype, is_numeric_v_string_like, is_scalar, @@ -40,9 +40,8 @@ def mask_missing(arr, values_to_mask): mask = None for x in nonna: if mask is None: - - # numpy elementwise comparison warning if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings mask = False else: mask = arr == x @@ -52,9 +51,8 @@ def mask_missing(arr, values_to_mask): if is_scalar(mask): mask = np.zeros(arr.shape, dtype=bool) else: - - # numpy elementwise comparison warning if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings mask |= False else: mask |= arr == x @@ -90,10 +88,7 @@ def clean_fill_method(method, allow_nearest=False): valid_methods.append("nearest") expecting = "pad (ffill), backfill (bfill) or nearest" if method not in valid_methods: - msg = "Invalid fill method. Expecting {expecting}. Got {method}".format( - expecting=expecting, method=method - ) - raise ValueError(msg) + raise ValueError(f"Invalid fill method. Expecting {expecting}. Got {method}") return method @@ -121,14 +116,48 @@ def clean_interp_method(method, **kwargs): if method in ("spline", "polynomial") and order is None: raise ValueError("You must specify the order of the spline or polynomial.") if method not in valid: - raise ValueError( - "method must be one of {valid}. Got '{method}' " - "instead.".format(valid=valid, method=method) - ) + raise ValueError(f"method must be one of {valid}. Got '{method}' instead.") return method +def find_valid_index(values, how: str): + """ + Retrieves the index of the first valid value. + + Parameters + ---------- + values : ndarray or ExtensionArray + how : {'first', 'last'} + Use this parameter to change between the first or last valid index. 
+ + Returns + ------- + int or None + """ + assert how in ["first", "last"] + + if len(values) == 0: # early stop + return None + + is_valid = ~isna(values) + + if values.ndim == 2: + is_valid = is_valid.any(1) # reduce axis 1 + + if how == "first": + idxpos = is_valid[::].argmax() + + if how == "last": + idxpos = len(values) - 1 - is_valid[::-1].argmax() + + chk_notna = is_valid[idxpos] + + if not chk_notna: + return None + return idxpos + + def interpolate_1d( xvalues, yvalues, @@ -139,7 +168,7 @@ def interpolate_1d( fill_value=None, bounds_error=False, order=None, - **kwargs + **kwargs, ): """ Logic for the 1-d interpolation. The result should be 1-d, inputs @@ -176,9 +205,9 @@ def interpolate_1d( valid_limit_directions = ["forward", "backward", "both"] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: - msg = "Invalid limit_direction: expecting one of {valid!r}, got {invalid!r}." raise ValueError( - msg.format(valid=valid_limit_directions, invalid=limit_direction) + "Invalid limit_direction: expecting one of " + f"{valid_limit_directions}, got '{limit_direction}'." ) if limit_area is not None: @@ -186,27 +215,17 @@ def interpolate_1d( limit_area = limit_area.lower() if limit_area not in valid_limit_areas: raise ValueError( - "Invalid limit_area: expecting one of {}, got " - "{}.".format(valid_limit_areas, limit_area) + f"Invalid limit_area: expecting one of {valid_limit_areas}, got " + f"{limit_area}." ) # default limit is unlimited GH #16282 - if limit is None: - # limit = len(xvalues) - pass - elif not is_integer(limit): - raise ValueError("Limit must be an integer") - elif limit < 1: - raise ValueError("Limit must be greater than 0") - - from pandas import Series - - ys = Series(yvalues) + limit = algos._validate_limit(nobs=None, limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) - start_nans = set(range(ys.first_valid_index())) - end_nans = set(range(1 + ys.last_valid_index(), len(valid))) + start_nans = set(range(find_valid_index(yvalues, "first"))) + end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) mid_nans = all_nans - start_nans - end_nans # Like the sets above, preserve_nans contains indices of invalid values, @@ -252,7 +271,11 @@ def interpolate_1d( inds = lib.maybe_convert_objects(inds) else: inds = xvalues - result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) + # np.interp requires sorted X values, #21037 + indexer = np.argsort(inds[valid]) + result[invalid] = np.interp( + inds[invalid], inds[valid][indexer], yvalues[valid][indexer] + ) result[preserve_nans] = np.nan return result @@ -285,7 +308,7 @@ def interpolate_1d( fill_value=fill_value, bounds_error=bounds_error, order=order, - **kwargs + **kwargs, ) result[preserve_nans] = np.nan return result @@ -299,7 +322,7 @@ def _interpolate_scipy_wrapper( Returns an array interpolated at new_x. Add any new methods to the list in _clean_interp_method. """ - extra = "{method} interpolation requires SciPy.".format(method=method) + extra = f"{method} interpolation requires SciPy." 
import_optional_dependency("scipy", extra=extra) from scipy import interpolate @@ -314,7 +337,7 @@ def _interpolate_scipy_wrapper( } if getattr(x, "is_all_dates", False): - # GH 5975, scipy.interp1d can't hande datetime64s + # GH 5975, scipy.interp1d can't handle datetime64s x, new_x = x._values.astype("i8"), new_x.astype("i8") if method == "pchip": @@ -346,8 +369,7 @@ def _interpolate_scipy_wrapper( # GH #10633, #24014 if isna(order) or (order <= 0): raise ValueError( - "order needs to be specified and greater than 0; " - "got order: {}".format(order) + f"order needs to be specified and greater than 0; got order: {order}" ) terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs) new_y = terp(new_x) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b9267db76e1a8..6b03e76a1d691 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -16,7 +16,6 @@ is_any_int_dtype, is_bool_dtype, is_complex, - is_complex_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, @@ -61,8 +60,10 @@ def __call__(self, f): def _f(*args, **kwargs): obj_iter = itertools.chain(args, kwargs.values()) if any(self.check(obj) for obj in obj_iter): - msg = "reduction operation {name!r} not allowed for this dtype" - raise TypeError(msg.format(name=f.__name__.replace("nan", ""))) + f_name = f.__name__.replace("nan", "") + raise TypeError( + f"reduction operation '{f_name}' not allowed for this dtype" + ) try: with np.errstate(invalid="ignore"): return f(*args, **kwargs) @@ -273,6 +274,12 @@ def _get_values( fill_value : Any fill value used """ + + # In _get_values is only called from within nanops, and in all cases + # with scalar fill_value. This guarantee is important for the + # maybe_upcast_putmask call below + assert is_scalar(fill_value) + mask = _maybe_get_mask(values, skipna, mask) if is_datetime64tz_dtype(values): @@ -307,7 +314,7 @@ def _get_values( # promote if needed else: - values, changed = maybe_upcast_putmask(values, mask, fill_value) + values, _ = maybe_upcast_putmask(values, mask, fill_value) # return a platform independent precision dtype dtype_max = dtype @@ -319,19 +326,6 @@ def _get_values( return values, mask, dtype, dtype_max, fill_value -def _isfinite(values): - if is_datetime_or_timedelta_dtype(values): - return isna(values) - if ( - is_complex_dtype(values) - or is_float_dtype(values) - or is_integer_dtype(values) - or is_bool_dtype(values) - ): - return ~np.isfinite(values) - return ~np.isfinite(values.astype("float64")) - - def _na_ok_dtype(dtype): # TODO: what about datetime64tz? PeriodDtype? 
return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) @@ -668,7 +662,7 @@ def _get_counts_nanvar( count = np.nan d = np.nan else: - mask2 = count <= ddof # type: np.ndarray + mask2: np.ndarray = count <= ddof if mask2.any(): np.putmask(d, mask2, np.nan) np.putmask(count, mask2, np.nan) @@ -705,11 +699,14 @@ def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): >>> nanops.nanstd(s) 1.0 """ + orig_dtype = values.dtype + values, mask, dtype, dtype_max, fill_value = _get_values(values, skipna, mask=mask) + result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)) - return _wrap_results(result, values.dtype) + return _wrap_results(result, orig_dtype) -@disallow("M8") +@disallow("M8", "m8") @bottleneck_switch(ddof=1) def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): """ @@ -834,7 +831,7 @@ def reduction(values, axis=None, skipna=True, mask=None): try: result = getattr(values, meth)(axis, dtype=dtype_max) result.fill(np.nan) - except (AttributeError, TypeError, ValueError, np.core._internal.AxisError): + except (AttributeError, TypeError, ValueError): result = np.nan else: result = getattr(values, meth)(axis) @@ -1246,17 +1243,22 @@ def nancorr(a, b, method="pearson", min_periods=None): def get_corr_func(method): if method in ["kendall", "spearman"]: from scipy.stats import kendalltau, spearmanr + elif method in ["pearson"]: + pass elif callable(method): return method + else: + raise ValueError( + f"Unknown method '{method}', expected one of 'kendall', 'spearman'" + ) def _pearson(a, b): return np.corrcoef(a, b)[0, 1] def _kendall(a, b): + # kendalltau returns a tuple of the tau statistic and pvalue rs = kendalltau(a, b) - if isinstance(rs, tuple): - return rs[0] - return rs + return rs[0] def _spearman(a, b): return spearmanr(a, b)[0] @@ -1305,9 +1307,7 @@ def _ensure_numeric(x): x = complex(x) except ValueError: # e.g. "foo" - raise TypeError( - "Could not convert {value!s} to numeric".format(value=x) - ) + raise TypeError(f"Could not convert {x} to numeric") return x @@ -1343,7 +1343,7 @@ def f(x, y): def _nanpercentile_1d(values, mask, q, na_value, interpolation): """ - Wraper for np.percentile that skips missing values, specialized to + Wrapper for np.percentile that skips missing values, specialized to 1-dimensional case. Parameters @@ -1374,7 +1374,7 @@ def _nanpercentile_1d(values, mask, q, na_value, interpolation): def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): """ - Wraper for np.percentile that skips missing values. + Wrapper for np.percentile that skips missing values.
Parameters ---------- diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 398fa9b0c1fc0..f51d71d5507a0 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -5,11 +5,12 @@ """ import datetime import operator -from typing import Tuple, Union +from typing import Set, Tuple, Union import numpy as np from pandas._libs import Timedelta, Timestamp, lib +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_list_like, is_timedelta64_dtype @@ -26,10 +27,11 @@ arithmetic_op, comparison_op, define_na_arithmetic_op, + get_array_op, logical_op, ) from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 -from pandas.core.ops.dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.dispatch import should_series_dispatch from pandas.core.ops.docstrings import ( _arith_doc_FRAME, @@ -38,6 +40,7 @@ _op_descriptions, ) from pandas.core.ops.invalid import invalid_comparison # noqa:F401 +from pandas.core.ops.mask_ops import kleene_and, kleene_or, kleene_xor # noqa: F401 from pandas.core.ops.methods import ( # noqa:F401 add_flex_arithmetic_methods, add_special_arithmetic_methods, @@ -57,6 +60,37 @@ rxor, ) +# ----------------------------------------------------------------------------- +# constants +ARITHMETIC_BINOPS: Set[str] = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "radd", + "rsub", + "rmul", + "rpow", + "rmod", + "rfloordiv", + "rtruediv", + "rdivmod", +} + + +COMPARISON_BINOPS: Set[str] = { + "eq", + "ne", + "lt", + "gt", + "le", + "ge", +} + # ----------------------------------------------------------------------------- # Ops Wrapping Utilities @@ -181,41 +215,6 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): # ----------------------------------------------------------------------------- -def _gen_eval_kwargs(name): - """ - Find the keyword arguments to pass to numexpr for the given operation. - - Parameters - ---------- - name : str - - Returns - ------- - eval_kwargs : dict - - Examples - -------- - >>> _gen_eval_kwargs("__add__") - {} - - >>> _gen_eval_kwargs("rtruediv") - {'reversed': True, 'truediv': True} - """ - kwargs = {} - - # Series appear to only pass __add__, __radd__, ... - # but DataFrame gets both these dunder names _and_ non-dunder names - # add, radd, ... - name = name.replace("__", "") - - if name.startswith("r"): - if name not in ["radd", "rand", "ror", "rxor"]: - # Exclude commutative operations - kwargs["reversed"] = True - - return kwargs - - def _get_frame_op_default_axis(name): """ Only DataFrame cares about default_axis, specifically: @@ -303,7 +302,7 @@ def _get_op_name(op, special): """ opname = op.__name__.strip("_") if special: - opname = "__{opname}__".format(opname=opname) + opname = f"__{opname}__" return opname @@ -374,8 +373,10 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): right = lib.item_from_zerodim(right) if lib.is_scalar(right) or np.ndim(right) == 0: - def column_op(a, b): - return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))} + # Get the appropriate array-op to apply to each block's values. 
+ array_op = get_array_op(func, str_rep=str_rep) + bm = left._data.apply(array_op, right=right) + return type(left)(bm) elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) @@ -384,7 +385,7 @@ def column_op(a, b): return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} elif isinstance(right, ABCSeries) and axis == "columns": - # We only get here if called via _combine_frame_series, + # We only get here if called via _combine_series_frame, # in which case we specifically want to operate row-by-row assert right.index.equals(left.columns) @@ -487,17 +488,16 @@ def _arith_method_SERIES(cls, op, special): """ str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - eval_kwargs = _gen_eval_kwargs(op_name) + @unpack_zerodim_and_defer(op_name) def wrapper(left, right): - if isinstance(right, ABCDataFrame): - return NotImplemented left, right = _align_method_SERIES(left, right) res_name = get_op_result_name(left, right) lvalues = extract_array(left, extract_numpy=True) - result = arithmetic_op(lvalues, right, op, str_rep, eval_kwargs) + rvalues = extract_array(right, extract_numpy=True) + result = arithmetic_op(lvalues, rvalues, op, str_rep) return _construct_result(left, result, index=left.index, name=res_name) @@ -512,14 +512,11 @@ def _comp_method_SERIES(cls, op, special): """ op_name = _get_op_name(op, special) + @unpack_zerodim_and_defer(op_name) def wrapper(self, other): res_name = get_op_result_name(self, other) - if isinstance(other, ABCDataFrame): # pragma: no cover - # Defer to DataFrame implementation; fail early - return NotImplemented - if isinstance(other, ABCSeries) and not self._indexed_same(other): raise ValueError("Can only compare identically-labeled Series objects") @@ -541,14 +538,11 @@ def _bool_method_SERIES(cls, op, special): """ op_name = _get_op_name(op, special) + @unpack_zerodim_and_defer(op_name) def wrapper(self, other): self, other = _align_method_SERIES(self, other, align_asobject=True) res_name = get_op_result_name(self, other) - if isinstance(other, ABCDataFrame): - # Defer to DataFrame implementation; fail early - return NotImplemented - lvalues = extract_array(self, extract_numpy=True) rvalues = extract_array(other, extract_numpy=True) @@ -609,9 +603,7 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, level=N result : DataFrame """ if fill_value is not None: - raise NotImplementedError( - "fill_value {fill} not supported.".format(fill=fill_value) - ) + raise NotImplementedError(f"fill_value {fill_value} not supported.") if axis is None: # default axis is columns @@ -667,15 +659,13 @@ def to_series(right): else: raise ValueError( "Unable to coerce to DataFrame, shape " - "must be {req_shape}: given {given_shape}".format( - req_shape=left.shape, given_shape=right.shape - ) + f"must be {left.shape}: given {right.shape}" ) elif right.ndim > 2: raise ValueError( "Unable to coerce to Series/DataFrame, dim " - "must be <= 2: {dim}".format(dim=right.shape) + f"must be <= 2: {right.shape}" ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): @@ -688,10 +678,9 @@ def to_series(right): def _arith_method_FRAME(cls, op, special): str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - eval_kwargs = _gen_eval_kwargs(op_name) default_axis = _get_frame_op_default_axis(op_name) - na_op = define_na_arithmetic_op(op, str_rep, eval_kwargs) + na_op = define_na_arithmetic_op(op, str_rep) is_logical = str_rep in ["&", "|", "^"] if op_name in _op_descriptions: @@ -709,7 +698,11 @@ 
def f(self, other, axis=default_axis, level=None, fill_value=None): # Another DataFrame pass_op = op if should_series_dispatch(self, other, op) else na_op pass_op = pass_op if not is_logical else op - return self._combine_frame(other, pass_op, fill_value, level) + + left, right = self.align(other, join="outer", level=level, copy=False) + new_data = left._combine_frame(right, pass_op, fill_value) + return left._construct_result(new_data) + elif isinstance(other, ABCSeries): # For these values of `axis`, we end up dispatching to Series op, # so do not want the masked op. @@ -723,7 +716,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): if fill_value is not None: self = self.fillna(fill_value) - new_data = dispatch_to_series(self, other, op) + new_data = dispatch_to_series(self, other, op, str_rep) return self._construct_result(new_data) f.__name__ = op_name @@ -770,7 +763,7 @@ def _comp_method_FRAME(cls, op, special): str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - @Appender("Wrapper for comparison method {name}".format(name=op_name)) + @Appender(f"Wrapper for comparison method {op_name}") def f(self, other): other = _align_method_FRAME(self, other, axis=None) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 46c3b8b575af9..b84d468fff736 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -2,8 +2,9 @@ Functions for arithmetic and comparison operations on NumPy arrays and ExtensionArrays. """ +from functools import partial import operator -from typing import Any, Mapping, Union +from typing import Any, Optional, Union import numpy as np @@ -24,17 +25,14 @@ ) from pandas.core.dtypes.generic import ( ABCDatetimeArray, - ABCDatetimeIndex, ABCExtensionArray, ABCIndex, ABCIndexClass, ABCSeries, ABCTimedeltaArray, - ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import isna, notna -from pandas.core.construction import extract_array from pandas.core.ops import missing from pandas.core.ops.dispatch import dispatch_to_extension_op, should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -54,10 +52,10 @@ def comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, (ABCSeries, ABCIndex)): y = y.values - result = libops.vec_compare(x, y, op) + result = libops.vec_compare(x.ravel(), y, op) else: - result = libops.scalar_compare(x, y, op) - return result + result = libops.scalar_compare(x.ravel(), y, op) + return result.reshape(x.shape) def masked_arith_op(x, y, op): @@ -113,19 +111,19 @@ def masked_arith_op(x, y, op): with np.errstate(all="ignore"): result[mask] = op(xrav[mask], y) - result, changed = maybe_upcast_putmask(result, ~mask, np.nan) + result, _ = maybe_upcast_putmask(result, ~mask, np.nan) result = result.reshape(x.shape) # 2D compat return result -def define_na_arithmetic_op(op, str_rep: str, eval_kwargs): +def define_na_arithmetic_op(op, str_rep: str): def na_op(x, y): - return na_arithmetic_op(x, y, op, str_rep, eval_kwargs) + return na_arithmetic_op(x, y, op, str_rep) return na_op -def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): +def na_arithmetic_op(left, right, op, str_rep: str): """ Return the result of evaluating op on the passed in values. 
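Note: the `_arith_method_SERIES` / `_comp_method_SERIES` / `_bool_method_SERIES` hunks above replace hand-written "defer to DataFrame" checks with the `unpack_zerodim_and_defer` decorator added in `pandas/core/ops/common.py` later in this diff. A self-contained sketch of that pattern (assumed names, simplified; the real helper also defers to Index and uses `lib.item_from_zerodim`):

```python
from functools import wraps

import numpy as np

def unpack_and_defer(senior_types):
    """Defer to 'senior' classes and unwrap 0-d ndarrays before the real op."""
    def decorator(method):
        @wraps(method)
        def new_method(self, other):
            if isinstance(other, senior_types):
                return NotImplemented            # let the senior class drive the op
            if isinstance(other, np.ndarray) and other.ndim == 0:
                other = other.item()             # e.g. np.array(2.0) -> 2.0
            return method(self, other)
        return new_method
    return decorator

class ToySeries:
    def __init__(self, values):
        self.values = np.asarray(values)

    @unpack_and_defer(senior_types=())           # a real Series defers to DataFrame here
    def __add__(self, other):
        return ToySeries(self.values + other)
```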
@@ -136,7 +134,6 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): left : np.ndarray right : np.ndarray or scalar str_rep : str or None - eval_kwargs : kwargs to pass to expressions Returns ------- @@ -149,7 +146,7 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): import pandas.core.computation.expressions as expressions try: - result = expressions.evaluate(op, str_rep, left, right, **eval_kwargs) + result = expressions.evaluate(op, str_rep, left, right) except TypeError: result = masked_arith_op(left, right, op) @@ -157,11 +154,7 @@ def na_arithmetic_op(left, right, op, str_rep: str, eval_kwargs): def arithmetic_op( - left: Union[np.ndarray, ABCExtensionArray], - right: Any, - op, - str_rep: str, - eval_kwargs: Mapping[str, bool], + left: Union[np.ndarray, ABCExtensionArray], right: Any, op, str_rep: str ): """ Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ... @@ -183,22 +176,10 @@ def arithmetic_op( from pandas.core.ops import maybe_upcast_for_op - keep_null_freq = isinstance( - right, - ( - ABCDatetimeIndex, - ABCDatetimeArray, - ABCTimedeltaIndex, - ABCTimedeltaArray, - Timestamp, - ), - ) - - # NB: We assume that extract_array has already been called on `left`, but - # cannot make the same assumption about `right`. This is because we need - # to define `keep_null_freq` before calling extract_array on it. + # NB: We assume that extract_array has already been called + # on `left` and `right`. lvalues = left - rvalues = extract_array(right, extract_numpy=True) + rvalues = right rvalues = maybe_upcast_for_op(rvalues, lvalues.shape) @@ -208,11 +189,11 @@ def arithmetic_op( # TimedeltaArray, DatetimeArray, and Timestamp are included here # because they have `freq` attribute which is handled correctly # by dispatch_to_extension_op. 
- res_values = dispatch_to_extension_op(op, lvalues, rvalues, keep_null_freq) + res_values = dispatch_to_extension_op(op, lvalues, rvalues) else: with np.errstate(all="ignore"): - res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep, eval_kwargs) + res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep) return res_values @@ -257,15 +238,15 @@ def comparison_op( elif is_scalar(rvalues) and isna(rvalues): # numpy does not like comparisons vs None if op is operator.ne: - res_values = np.ones(len(lvalues), dtype=bool) + res_values = np.ones(lvalues.shape, dtype=bool) else: - res_values = np.zeros(len(lvalues), dtype=bool) + res_values = np.zeros(lvalues.shape, dtype=bool) elif is_object_dtype(lvalues.dtype): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - op_name = "__{op}__".format(op=op.__name__) + op_name = f"__{op.__name__}__" method = getattr(lvalues, op_name) with np.errstate(all="ignore"): res_values = method(rvalues) @@ -273,9 +254,8 @@ def comparison_op( if res_values is NotImplemented: res_values = invalid_comparison(lvalues, rvalues, op) if is_scalar(res_values): - raise TypeError( - "Could not compare {typ} type with Series".format(typ=type(rvalues)) - ) + typ = type(rvalues) + raise TypeError(f"Could not compare {typ} type with Series") return res_values @@ -312,11 +292,10 @@ def na_logical_op(x: np.ndarray, y, op): OverflowError, NotImplementedError, ): + typ = type(y).__name__ raise TypeError( - "Cannot perform '{op}' with a dtyped [{dtype}] array " - "and scalar of type [{typ}]".format( - op=op.__name__, dtype=x.dtype, typ=type(y).__name__ - ) + f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array " + f"and scalar of type [{typ}]" ) return result @@ -387,3 +366,27 @@ def fill_bool(x, left=None): res_values = filler(res_values) # type: ignore return res_values + + +def get_array_op(op, str_rep: Optional[str] = None): + """ + Return a binary array operation corresponding to the given operator op. + + Parameters + ---------- + op : function + Binary operator from operator or roperator module. + str_rep : str or None, default None + str_rep to pass to arithmetic_op + + Returns + ------- + function + """ + op_name = op.__name__.strip("_") + if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}: + return partial(comparison_op, op=op) + elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}: + return partial(logical_op, op=op) + else: + return partial(arithmetic_op, op=op, str_rep=str_rep) diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py new file mode 100644 index 0000000000000..f4b16cf4a0cf2 --- /dev/null +++ b/pandas/core/ops/common.py @@ -0,0 +1,66 @@ +""" +Boilerplate functions used in defining binary operations. +""" +from functools import wraps + +from pandas._libs.lib import item_from_zerodim + +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries + + +def unpack_zerodim_and_defer(name: str): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Parameters + ---------- + name : str + + Returns + ------- + decorator + """ + + def wrapper(method): + return _unpack_zerodim_and_defer(method, name) + + return wrapper + + +def _unpack_zerodim_and_defer(method, name: str): + """ + Boilerplate for pandas conventions in arithmetic and comparison methods. + + Ensure method returns NotImplemented when operating against "senior" + classes. Ensure zero-dimensional ndarrays are always unpacked. 
+ + Parameters + ---------- + method : binary method + name : str + + Returns + ------- + method + """ + + is_cmp = name.strip("__") in {"eq", "ne", "lt", "le", "gt", "ge"} + + @wraps(method) + def new_method(self, other): + + if is_cmp and isinstance(self, ABCIndexClass) and isinstance(other, ABCSeries): + # For comparison ops, Index does *not* defer to Series + pass + else: + for cls in [ABCDataFrame, ABCSeries, ABCIndexClass]: + if isinstance(self, cls): + break + if isinstance(other, cls): + return NotImplemented + + other = item_from_zerodim(other) + + return method(self, other) + + return new_method diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index c39f4d6d9698d..61a3032c7a02c 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -1,12 +1,10 @@ """ Functions for defining unary operations. """ -from typing import Any, Callable, Union +from typing import Any, Union import numpy as np -from pandas.errors import NullFrequencyError - from pandas.core.dtypes.common import ( is_datetime64_dtype, is_extension_array_dtype, @@ -17,7 +15,6 @@ ) from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries -from pandas._typing import ArrayLike from pandas.core.construction import array @@ -97,10 +94,7 @@ def should_series_dispatch(left, right, op): def dispatch_to_extension_op( - op, - left: Union[ABCExtensionArray, np.ndarray], - right: Any, - keep_null_freq: bool = False, + op, left: Union[ABCExtensionArray, np.ndarray], right: Any, ): """ Assume that left or right is a Series backed by an ExtensionArray, @@ -111,9 +105,6 @@ def dispatch_to_extension_op( op : binary operator left : ExtensionArray or np.ndarray right : object - keep_null_freq : bool, default False - Whether to re-raise a NullFrequencyError unchanged, as opposed to - catching and raising TypeError. Returns ------- @@ -131,103 +122,5 @@ def dispatch_to_extension_op( # The op calls will raise TypeError if the op is not defined # on the ExtensionArray - - try: - res_values = op(left, right) - except NullFrequencyError: - # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError - # on add/sub of integers (or int-like). We re-raise as a TypeError. - if keep_null_freq: - # TODO: remove keep_null_freq after Timestamp+int deprecation - # GH#22535 is enforced - raise - raise TypeError( - "incompatible type for a datetime/timedelta " - "operation [{name}]".format(name=op.__name__) - ) + res_values = op(left, right) return res_values - - -def maybe_dispatch_ufunc_to_dunder_op( - self: ArrayLike, ufunc: Callable, method: str, *inputs: ArrayLike, **kwargs: Any -): - """ - Dispatch a ufunc to the equivalent dunder method. - - Parameters - ---------- - self : ArrayLike - The array whose dunder method we dispatch to - ufunc : Callable - A NumPy ufunc - method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'} - inputs : ArrayLike - The input arrays. - kwargs : Any - The additional keyword arguments, e.g. ``out``. 
- - Returns - ------- - result : Any - The result of applying the ufunc - """ - # special has the ufuncs we dispatch to the dunder op on - special = { - "add", - "sub", - "mul", - "pow", - "mod", - "floordiv", - "truediv", - "divmod", - "eq", - "ne", - "lt", - "gt", - "le", - "ge", - "remainder", - "matmul", - } - aliases = { - "subtract": "sub", - "multiply": "mul", - "floor_divide": "floordiv", - "true_divide": "truediv", - "power": "pow", - "remainder": "mod", - "divide": "div", - "equal": "eq", - "not_equal": "ne", - "less": "lt", - "less_equal": "le", - "greater": "gt", - "greater_equal": "ge", - } - - # For op(., Array) -> Array.__r{op}__ - flipped = { - "lt": "__gt__", - "le": "__ge__", - "gt": "__lt__", - "ge": "__le__", - "eq": "__eq__", - "ne": "__ne__", - } - - op_name = ufunc.__name__ - op_name = aliases.get(op_name, op_name) - - def not_implemented(*args, **kwargs): - return NotImplemented - - if method == "__call__" and op_name in special and kwargs.get("out") is None: - if isinstance(inputs[0], type(self)): - name = "__{}__".format(op_name) - return getattr(self, name, not_implemented)(inputs[1]) - else: - name = flipped.get(op_name, "__r{}__".format(op_name)) - return getattr(self, name, not_implemented)(inputs[0]) - else: - return NotImplemented diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 5d3f9cd92aa1a..e3db65f11a332 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -233,7 +233,7 @@ def _make_flex_doc(op_name, typ): dtype: float64 """ -_op_descriptions = { +_op_descriptions: Dict[str, Dict[str, Optional[str]]] = { # Arithmetic Operators "add": { "op": "+", @@ -310,7 +310,7 @@ def _make_flex_doc(op_name, typ): "reverse": None, "series_examples": None, }, -} # type: Dict[str, Dict[str, Optional[str]]] +} _op_names = list(_op_descriptions.keys()) for key in _op_names: diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py index 013ff7689b221..cc4a1f11edd2b 100644 --- a/pandas/core/ops/invalid.py +++ b/pandas/core/ops/invalid.py @@ -30,11 +30,8 @@ def invalid_comparison(left, right, op): elif op is operator.ne: res_values = np.ones(left.shape, dtype=bool) else: - raise TypeError( - "Invalid comparison between dtype={dtype} and {typ}".format( - dtype=left.dtype, typ=type(right).__name__ - ) - ) + typ = type(right).__name__ + raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}") return res_values @@ -52,10 +49,8 @@ def make_invalid_op(name: str): """ def invalid_op(self, other=None): - raise TypeError( - "cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self).__name__) - ) + typ = type(self).__name__ + raise TypeError(f"cannot perform {name} with this index type: {typ}") invalid_op.__name__ = name return invalid_op diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py new file mode 100644 index 0000000000000..8fb81faf313d7 --- /dev/null +++ b/pandas/core/ops/mask_ops.py @@ -0,0 +1,178 @@ +""" +Ops for masked arrays. +""" +from typing import Optional, Union + +import numpy as np + +from pandas._libs import lib, missing as libmissing + + +def kleene_or( + left: Union[bool, np.ndarray], + right: Union[bool, np.ndarray], + left_mask: Optional[np.ndarray], + right_mask: Optional[np.ndarray], +): + """ + Boolean ``or`` using Kleene logic. + + Values are NA where we have ``NA | NA`` or ``NA | False``. + ``NA | True`` is considered True. + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. 
+ left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical or, and the new mask. + """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. This is safe, since because + # A | B == B | A + if left_mask is None: + return kleene_or(right, left, right_mask, left_mask) + + assert isinstance(left, np.ndarray) + + raise_for_nan(right, method="or") + + if right is libmissing.NA: + result = left.copy() + else: + result = left | right + + if right_mask is not None: + # output is unknown where (False & NA), (NA & False), (NA & NA) + left_false = ~(left | left_mask) + right_false = ~(right | right_mask) + mask = ( + (left_false & right_mask) + | (right_false & left_mask) + | (left_mask & right_mask) + ) + else: + if right is True: + mask = np.zeros_like(left_mask) + elif right is libmissing.NA: + mask = (~left & ~left_mask) | left_mask + else: + # False + mask = left_mask.copy() + + return result, mask + + +def kleene_xor( + left: Union[bool, np.ndarray], + right: Union[bool, np.ndarray], + left_mask: Optional[np.ndarray], + right_mask: Optional[np.ndarray], +): + """ + Boolean ``xor`` using Kleene logic. + + This is the same as ``or``, with the following adjustments + + * True, True -> False + * True, NA -> NA + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical xor, and the new mask. + """ + if left_mask is None: + return kleene_xor(right, left, right_mask, left_mask) + + raise_for_nan(right, method="xor") + if right is libmissing.NA: + result = np.zeros_like(left) + else: + result = left ^ right + + if right_mask is None: + if right is libmissing.NA: + mask = np.ones_like(left_mask) + else: + mask = left_mask.copy() + else: + mask = left_mask | right_mask + + return result, mask + + +def kleene_and( + left: Union[bool, libmissing.NAType, np.ndarray], + right: Union[bool, libmissing.NAType, np.ndarray], + left_mask: Optional[np.ndarray], + right_mask: Optional[np.ndarray], +): + """ + Boolean ``and`` using Kleene logic. + + Values are ``NA`` for ``NA & NA`` or ``True & NA``. + + Parameters + ---------- + left, right : ndarray, NA, or bool + The values of the array. + left_mask, right_mask : ndarray, optional + The masks. Only one of these may be None, which implies that + the associated `left` or `right` value is a scalar. + + Returns + ------- + result, mask: ndarray[bool] + The result of the logical xor, and the new mask. + """ + # To reduce the number of cases, we ensure that `left` & `left_mask` + # always come from an array, not a scalar. 
This is safe, since because + # A | B == B | A + if left_mask is None: + return kleene_and(right, left, right_mask, left_mask) + + assert isinstance(left, np.ndarray) + raise_for_nan(right, method="and") + + if right is libmissing.NA: + result = np.zeros_like(left) + else: + result = left & right + + if right_mask is None: + # Scalar `right` + if right is libmissing.NA: + mask = (left & ~left_mask) | left_mask + + else: + mask = left_mask.copy() + if right is False: + # unmask everything + mask[:] = False + else: + # unmask where either left or right is False + left_false = ~(left | left_mask) + right_false = ~(right | right_mask) + mask = (left_mask & ~right_false) | (right_mask & ~left_false) + + return result, mask + + +def raise_for_nan(value, method): + if lib.is_float(value) and np.isnan(value): + raise ValueError(f"Cannot perform logical '{method}' with floating NaN") diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index 477c847fb01e6..c04658565f235 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -102,7 +102,8 @@ def f(self, other): return self - f.__name__ = "__i{name}__".format(name=method.__name__.strip("__")) + name = method.__name__.strip("__") + f.__name__ = f"__i{name}__" return f new_methods.update( @@ -162,7 +163,6 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): have_divmod = issubclass(cls, ABCSeries) # divmod is available for Series - # yapf: disable new_methods = dict( add=arith_method(cls, operator.add, special), radd=arith_method(cls, radd, special), @@ -181,8 +181,8 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): rtruediv=arith_method(cls, rtruediv, special), rfloordiv=arith_method(cls, rfloordiv, special), rpow=arith_method(cls, rpow, special), - rmod=arith_method(cls, rmod, special)) - # yapf: enable + rmod=arith_method(cls, rmod, special), + ) new_methods["div"] = new_methods["truediv"] new_methods["rdiv"] = new_methods["rtruediv"] if have_divmod: @@ -215,7 +215,7 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): ) if special: - dunderize = lambda x: "__{name}__".format(name=x.strip("_")) + dunderize = lambda x: f"__{x.strip('_')}__" else: dunderize = lambda x: x new_methods = {dunderize(k): v for k, v in new_methods.items()} diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 3bb7bb022dd3a..5039ffab33fbd 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -27,7 +27,7 @@ from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype, is_scalar -from .roperator import rdivmod, rfloordiv, rmod +from pandas.core.ops.roperator import rdivmod, rfloordiv, rmod def fill_zeros(result, x, y): @@ -168,7 +168,7 @@ def dispatch_fill_zeros(op, left, right, result): # Note: no need to do this for truediv; in py3 numpy behaves the way # we want. result = mask_zero_div_zero(left, right, result) - elif op is op is rfloordiv: + elif op is rfloordiv: # Note: no need to do this for rtruediv; in py3 numpy behaves the way # we want. 
result = mask_zero_div_zero(right, left, result) diff --git a/pandas/core/ops/roperator.py b/pandas/core/ops/roperator.py index 4cb02238aea16..e6691ddf8984e 100644 --- a/pandas/core/ops/roperator.py +++ b/pandas/core/ops/roperator.py @@ -34,9 +34,8 @@ def rmod(left, right): # formatting operation; this is a TypeError # otherwise perform the op if isinstance(right, str): - raise TypeError( - "{typ} cannot perform the operation mod".format(typ=type(left).__name__) - ) + typ = type(left).__name__ + raise TypeError(f"{typ} cannot perform the operation mod") return right % left diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 13cb0f9aed303..0e43880dfda07 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2,7 +2,6 @@ from datetime import timedelta from textwrap import dedent from typing import Dict, no_type_check -import warnings import numpy as np @@ -17,11 +16,11 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries import pandas.core.algorithms as algos -from pandas.core.base import DataError +from pandas.core.base import DataError, ShallowMixin from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, groupby +from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range @@ -31,10 +30,10 @@ from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset, Day, Nano, Tick -_shared_docs_kwargs = dict() # type: Dict[str, str] +_shared_docs_kwargs: Dict[str, str] = dict() -class Resampler(_GroupBy): +class Resampler(_GroupBy, ShallowMixin): """ Class for resampling datetimelike data, a groupby-like operation. See aggregate, transform, and apply functions on this object. @@ -86,20 +85,18 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): if self.groupby is not None: self.groupby._set_grouper(self._convert_obj(obj), sort=True) - def __str__(self): + def __str__(self) -> str: """ Provide a nice str repr of our rolling object. """ attrs = ( - "{k}={v}".format(k=k, v=getattr(self.groupby, k)) + f"{k}={getattr(self.groupby, k)}" for k in self._attributes if getattr(self.groupby, k, None) is not None ) - return "{klass} [{attrs}]".format( - klass=self.__class__.__name__, attrs=", ".join(attrs) - ) + return f"{type(self).__name__} [{', '.join(attrs)}]" - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self._attributes: @@ -134,7 +131,7 @@ def ax(self): return self.groupby.ax @property - def _typ(self): + def _typ(self) -> str: """ Masquerade for compat as a Series or a DataFrame. """ @@ -143,7 +140,7 @@ def _typ(self): return "dataframe" @property - def _from_selection(self): + def _from_selection(self) -> bool: """ Is the resampling from a DataFrame column or MultiIndex level. 
""" @@ -187,6 +184,7 @@ def _get_binner(self): """ binner, bins, binlabels = self._get_binner_for_time() + assert len(bins) == len(binlabels) bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer) return binner, bin_grouper @@ -318,7 +316,7 @@ def _downsample(self, f): def _upsample(self, f, limit=None, fill_value=None): raise AbstractMethodError(self) - def _gotitem(self, key, ndim, subset=None): + def _gotitem(self, key, ndim: int, subset=None): """ Sub-classes to define. Return a sliced object. @@ -334,7 +332,7 @@ def _gotitem(self, key, ndim, subset=None): grouper = self.grouper if subset is None: subset = self.obj - grouped = groupby(subset, by=None, grouper=grouper, axis=self.axis) + grouped = get_groupby(subset, by=None, grouper=grouper, axis=self.axis) # try the key selection try: @@ -353,7 +351,7 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): obj = self._selected_obj - grouped = groupby(obj, by=None, grouper=grouper, axis=self.axis) + grouped = get_groupby(obj, by=None, grouper=grouper, axis=self.axis) try: if isinstance(obj, ABCDataFrame) and callable(how): @@ -361,8 +359,6 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped._aggregate_item_by_item(how, *args, **kwargs) else: result = grouped.aggregate(how, *args, **kwargs) - except AssertionError: - raise except DataError: # we have a non-reducing function; try to evaluate result = grouped.apply(how, *args, **kwargs) @@ -443,7 +439,7 @@ def pad(self, limit=None): Parameters ---------- limit : int, optional - limit of how many values to fill + Limit of how many values to fill. Returns ------- @@ -795,7 +791,7 @@ def interpolate( limit_direction="forward", limit_area=None, downcast=None, - **kwargs + **kwargs, ): """ Interpolate values according to different methods. @@ -809,7 +805,7 @@ def interpolate( limit_direction=limit_direction, limit_area=limit_area, downcast=downcast, - **kwargs + **kwargs, ) def asfreq(self, fill_value=None): @@ -858,7 +854,7 @@ def var(self, ddof=1, *args, **kwargs): Parameters ---------- ddof : int, default 1 - degrees of freedom + Degrees of freedom. Returns ------- @@ -870,13 +866,32 @@ def var(self, ddof=1, *args, **kwargs): @Appender(GroupBy.size.__doc__) def size(self): - # It's a special case as higher level does return - # a copy of 0-len objects. 
GH14962 result = self._downsample("size") - if not len(self.ax) and isinstance(self._selected_obj, ABCDataFrame): + if not len(self.ax): from pandas import Series - result = Series([], index=result.index, dtype="int64") + if self._selected_obj.ndim == 1: + name = self._selected_obj.name + else: + name = None + result = Series([], index=result.index, dtype="int64", name=name) + return result + + @Appender(GroupBy.count.__doc__) + def count(self): + result = self._downsample("count") + if not len(self.ax): + if self._selected_obj.ndim == 1: + result = type(self._selected_obj)( + [], index=result.index, dtype="int64", name=self._selected_obj.name + ) + else: + from pandas import DataFrame + + result = DataFrame( + [], index=result.index, columns=result.columns, dtype="int64" + ) + return result def quantile(self, q=0.5, **kwargs): @@ -924,14 +939,6 @@ def g(self, _method=method, *args, **kwargs): g.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, g) -# groupby & aggregate methods -for method in ["count"]: - - def h(self, _method=method): - return self._downsample(_method) - - h.__doc__ = getattr(GroupBy, method).__doc__ - setattr(Resampler, method, h) # series only methods for method in ["nunique"]: @@ -943,58 +950,6 @@ def h(self, _method=method): setattr(Resampler, method, h) -def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): - """ - Potentially we might have a deprecation warning, show it - but call the appropriate methods anyhow. - """ - - if how is not None: - - # .resample(..., how='sum') - if isinstance(how, str): - method = "{0}()".format(how) - - # .resample(..., how=lambda x: ....) - else: - method = ".apply()" - - # if we have both a how and fill_method, then show - # the following warning - if fill_method is None: - warnings.warn( - "how in .resample() is deprecated\n" - "the new syntax is " - ".resample(...).{method}".format(method=method), - FutureWarning, - stacklevel=3, - ) - r = r.aggregate(how) - - if fill_method is not None: - - # show the prior function call - method = "." + method if how is not None else "" - - args = "limit={0}".format(limit) if limit is not None else "" - warnings.warn( - "fill_method is deprecated to .resample()\n" - "the new syntax is .resample(...){method}" - ".{fill_method}({args})".format( - method=method, fill_method=fill_method, args=args - ), - FutureWarning, - stacklevel=3, - ) - - if how is not None: - r = getattr(r, fill_method)(limit=limit) - else: - r = r.aggregate(fill_method, limit=limit) - - return r - - class _GroupByMixin(GroupByMixin): """ Provide the groupby facilities. @@ -1070,7 +1025,7 @@ def _downsample(self, how, **kwargs): if not len(ax): # reset to the new freq obj = obj.copy() - obj.index.freq = self.freq + obj.index._set_freq(self.freq) return obj # do we have a regular frequency @@ -1121,10 +1076,9 @@ def _upsample(self, method, limit=None, fill_value=None): raise AssertionError("axis must be 0") if self._from_selection: raise ValueError( - "Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like" + "Upsampling from level= or on= selection " + "is not supported, use .set_index(...) " + "to explicitly set index to datetime-like" ) ax = self.ax @@ -1180,9 +1134,9 @@ def _convert_obj(self, obj): if self._from_selection: # see GH 14008, GH 12871 msg = ( - "Resampling from level= or on= selection" - " with a PeriodIndex is not currently supported," - " use .set_index(...) 
to explicitly set index" + "Resampling from level= or on= selection " + "with a PeriodIndex is not currently supported, " + "use .set_index(...) to explicitly set index" ) raise NotImplementedError(msg) @@ -1230,8 +1184,8 @@ def _downsample(self, how, **kwargs): return self.asfreq() raise IncompatibleFrequency( - "Frequency {} cannot be resampled to {}, as they are not " - "sub or super periods".format(ax.freq, self.freq) + f"Frequency {ax.freq} cannot be resampled to {self.freq}, " + "as they are not sub or super periods" ) def _upsample(self, method, limit=None, fill_value=None): @@ -1239,11 +1193,11 @@ def _upsample(self, method, limit=None, fill_value=None): Parameters ---------- method : string {'backfill', 'bfill', 'pad', 'ffill'} - method for upsampling + Method for upsampling. limit : int, default None - Maximum size gap to fill when reindexing + Maximum size gap to fill when reindexing. fill_value : scalar, default None - Value to use for missing values + Value to use for missing values. See Also -------- @@ -1331,8 +1285,7 @@ def get_resampler_for_grouping( tg = TimeGrouper(freq=rule, **kwargs) resampler = tg._get_resampler(groupby.obj, kind=kind) - r = resampler._get_resampler_for_grouping(groupby=groupby) - return _maybe_process_deprecations(r, how=how, fill_method=fill_method, limit=limit) + return resampler._get_resampler_for_grouping(groupby=groupby) class TimeGrouper(Grouper): @@ -1371,16 +1324,16 @@ def __init__( kind=None, convention=None, base=0, - **kwargs + **kwargs, ): # Check for correctness of the keyword arguments which would # otherwise silently use the default if misspelled if label not in {None, "left", "right"}: - raise ValueError("Unsupported value {} for `label`".format(label)) + raise ValueError(f"Unsupported value {label} for `label`") if closed not in {None, "left", "right"}: - raise ValueError("Unsupported value {} for `closed`".format(closed)) + raise ValueError(f"Unsupported value {closed} for `closed`") if convention not in {None, "start", "end", "e", "s"}: - raise ValueError("Unsupported value {} for `convention`".format(convention)) + raise ValueError(f"Unsupported value {convention} for `convention`") freq = to_offset(freq) @@ -1450,10 +1403,10 @@ def _get_resampler(self, obj, kind=None): raise TypeError( "Only valid with DatetimeIndex, " "TimedeltaIndex or PeriodIndex, " - "but got an instance of %r" % type(ax).__name__ + f"but got an instance of '{type(ax).__name__}'" ) - def _get_grouper(self, obj, validate=True): + def _get_grouper(self, obj, validate: bool = True): # create the resampler and return our binner r = self._get_resampler(obj) r._set_binner() @@ -1463,7 +1416,7 @@ def _get_time_bins(self, ax): if not isinstance(ax, DatetimeIndex): raise TypeError( "axis must be a DatetimeIndex, but got " - "an instance of %r" % type(ax).__name__ + f"an instance of {type(ax).__name__}" ) if len(ax) == 0: @@ -1539,7 +1492,7 @@ def _get_time_delta_bins(self, ax): if not isinstance(ax, TimedeltaIndex): raise TypeError( "axis must be a TimedeltaIndex, but got " - "an instance of %r" % type(ax).__name__ + f"an instance of {type(ax).__name__}" ) if not len(ax): @@ -1564,7 +1517,7 @@ def _get_time_period_bins(self, ax): if not isinstance(ax, DatetimeIndex): raise TypeError( "axis must be a DatetimeIndex, but got " - "an instance of %r" % type(ax).__name__ + f"an instance of {type(ax).__name__}" ) freq = self.freq @@ -1586,7 +1539,7 @@ def _get_period_bins(self, ax): if not isinstance(ax, PeriodIndex): raise TypeError( "axis must be a PeriodIndex, but got " - 
"an instance of %r" % type(ax).__name__ + f"an instance of {type(ax).__name__}" ) memb = ax.asfreq(self.freq, how=self.convention) @@ -1633,7 +1586,10 @@ def _get_period_bins(self, ax): rng += freq_mult # adjust bin edge indexes to account for base rng -= bin_shift - bins = memb.searchsorted(rng, side="left") + + # Wrap in PeriodArray for PeriodArray.searchsorted + prng = type(memb._data)(rng, dtype=memb.dtype) + bins = memb.searchsorted(prng, side="left") if nat_count > 0: # NaT handling as in pandas._lib.lib.generate_bins_dt64() @@ -1745,8 +1701,8 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0): # GH 23882 first = first.to_timestamp() last = last.to_timestamp() - adjust_first = not offset.onOffset(first) - adjust_last = offset.onOffset(last) + adjust_first = not offset.is_on_offset(first) + adjust_last = offset.is_on_offset(last) first, last = _get_timestamp_range_edges( first, last, offset, closed=closed, base=base diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 60bab817705e3..502b8d1941fdf 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -2,22 +2,24 @@ concat routines """ -import warnings +from typing import Hashable, Iterable, List, Mapping, Optional, Union, overload import numpy as np +from pandas._typing import FrameOrSeriesUnion + from pandas import DataFrame, Index, MultiIndex, Series -from pandas.core import common as com from pandas.core.arrays.categorical import ( - _factorize_from_iterable, - _factorize_from_iterables, + factorize_from_iterable, + factorize_from_iterables, ) +import pandas.core.common as com from pandas.core.generic import NDFrame -from pandas.core.index import ( - _all_indexes_same, - _get_consensus_names, - _get_objs_combined_axis, +from pandas.core.indexes.api import ( + all_indexes_same, ensure_index, + get_consensus_names, + get_objs_combined_axis, ) import pandas.core.indexes.base as ibase from pandas.core.internals import concatenate_block_managers @@ -26,19 +28,54 @@ # Concatenate DataFrame objects +@overload def concat( - objs, + objs: Union[Iterable["DataFrame"], Mapping[Optional[Hashable], "DataFrame"]], + axis=0, + join: str = "outer", + ignore_index: bool = False, + keys=None, + levels=None, + names=None, + verify_integrity: bool = False, + sort: bool = False, + copy: bool = True, +) -> "DataFrame": + ... + + +@overload +def concat( + objs: Union[ + Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] + ], + axis=0, + join: str = "outer", + ignore_index: bool = False, + keys=None, + levels=None, + names=None, + verify_integrity: bool = False, + sort: bool = False, + copy: bool = True, +) -> FrameOrSeriesUnion: + ... + + +def concat( + objs: Union[ + Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] + ], axis=0, join="outer", - join_axes=None, - ignore_index=False, + ignore_index: bool = False, keys=None, levels=None, names=None, - verify_integrity=False, - sort=None, - copy=True, -): + verify_integrity: bool = False, + sort: bool = False, + copy: bool = True, +) -> FrameOrSeriesUnion: """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -58,12 +95,6 @@ def concat( The axis to concatenate along. join : {'inner', 'outer'}, default 'outer' How to handle indexes on other axis (or axes). - join_axes : list of Index objects - .. deprecated:: 0.25.0 - - Specific indexes to use for the other n - 1 axes instead of performing - inner/outer set logic. 
Use .reindex() before or after concatenation - as a replacement. ignore_index : bool, default False If True, do not use the index values along the concatenation axis. The resulting axis will be labeled 0, ..., n - 1. This is useful if you are @@ -81,18 +112,16 @@ def concat( verify_integrity : bool, default False Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. - sort : bool, default None + sort : bool, default False Sort non-concatenation axis if it is not already aligned when `join` - is 'outer'. The current default of sorting is deprecated and will - change to not-sorting in a future version of pandas. - - Explicitly pass ``sort=True`` to silence the warning and sort. - Explicitly pass ``sort=False`` to silence the warning and not sort. - + is 'outer'. This has no effect when ``join='inner'``, which already preserves the order of the non-concatenation axis. .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + + Changed to not sort by default. copy : bool, default True If False, do not copy data unnecessarily. @@ -118,7 +147,7 @@ def concat( A walkthrough of how this method fits in with other tools for combining pandas objects can be found `here - `__. + `__. Examples -------- @@ -244,7 +273,6 @@ def concat( axis=axis, ignore_index=ignore_index, join=join, - join_axes=join_axes, keys=keys, levels=levels, names=names, @@ -265,14 +293,13 @@ def __init__( self, objs, axis=0, - join="outer", - join_axes=None, + join: str = "outer", keys=None, levels=None, names=None, - ignore_index=False, - verify_integrity=False, - copy=True, + ignore_index: bool = False, + verify_integrity: bool = False, + copy: bool = True, sort=False, ): if isinstance(objs, (NDFrame, str)): @@ -293,7 +320,7 @@ def __init__( if isinstance(objs, dict): if keys is None: - keys = com.dict_keys_to_ordered_list(objs) + keys = list(objs.keys()) objs = [objs[k] for k in keys] else: objs = list(objs) @@ -324,8 +351,8 @@ def __init__( for obj in objs: if not isinstance(obj, (Series, DataFrame)): msg = ( - "cannot concatenate object of type '{}';" - " only Series and DataFrame objs are valid".format(type(obj)) + "cannot concatenate object of type '{typ}'; " + "only Series and DataFrame objs are valid".format(typ=type(obj)) ) raise TypeError(msg) @@ -375,8 +402,8 @@ def __init__( self._is_series = isinstance(sample, Series) if not 0 <= axis <= sample.ndim: raise AssertionError( - "axis must be between 0 and {ndim}, input was" - " {axis}".format(ndim=sample.ndim, axis=axis) + "axis must be between 0 and {ndim}, input was " + "{axis}".format(ndim=sample.ndim, axis=axis) ) # if we have mixed ndims, then convert to highest ndim @@ -413,7 +440,6 @@ def __init__( # note: this is the BlockManager axis (since DataFrame is transposed) self.axis = axis - self.join_axes = join_axes self.keys = keys self.names = names or getattr(keys, "names", None) self.levels = levels @@ -437,13 +463,13 @@ def get_result(self): mgr = self.objs[0]._data.concat( [x._data for x in self.objs], self.new_axes ) - cons = _get_series_result_type(mgr, self.objs) + cons = self.objs[0]._constructor return cons(mgr, name=name).__finalize__(self, method="concat") # combine as columns in a frame else: data = dict(zip(range(len(self.objs)), self.objs)) - cons = _get_series_result_type(data) + cons = DataFrame index, columns = self.new_axes df = cons(data, index=index) @@ -473,64 +499,31 @@ def get_result(self): if not self.copy: new_data._consolidate_inplace() - cons = 
_get_frame_result_type(new_data, self.objs) + cons = self.objs[0]._constructor return cons._from_axes(new_data, self.new_axes).__finalize__( self, method="concat" ) - def _get_result_dim(self): + def _get_result_dim(self) -> int: if self._is_series and self.axis == 1: return 2 else: return self.objs[0].ndim - def _get_new_axes(self): + def _get_new_axes(self) -> List[Index]: ndim = self._get_result_dim() - new_axes = [None] * ndim - - if self.join_axes is None: - for i in range(ndim): - if i == self.axis: - continue - new_axes[i] = self._get_comb_axis(i) + return [ + self._get_concat_axis() if i == self.axis else self._get_comb_axis(i) + for i in range(ndim) + ] - else: - # GH 21951 - warnings.warn( - "The join_axes-keyword is deprecated. Use .reindex or " - ".reindex_like on the result to achieve the same " - "functionality.", - FutureWarning, - stacklevel=4, - ) - - if len(self.join_axes) != ndim - 1: - raise AssertionError( - "length of join_axes must be equal " - "to {length}".format(length=ndim - 1) - ) - - # ufff... - indices = list(range(ndim)) - indices.remove(self.axis) - - for i, ax in zip(indices, self.join_axes): - new_axes[i] = ax - - new_axes[self.axis] = self._get_concat_axis() - return new_axes - - def _get_comb_axis(self, i): + def _get_comb_axis(self, i: int) -> Index: data_axis = self.objs[0]._get_block_manager_axis(i) - try: - return _get_objs_combined_axis( - self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort - ) - except IndexError: - types = [type(x).__name__ for x in self.objs] - raise TypeError("Cannot concatenate list of {types}".format(types=types)) + return get_objs_combined_axis( + self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort + ) - def _get_concat_axis(self): + def _get_concat_axis(self) -> Index: """ Return index to be used along concatenation axis. 
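As a hedged sketch of the user-facing side of the `concat` changes above (the `join_axes` keyword is removed and `sort` now defaults to `False`), the old `join_axes=[df1.columns]` pattern can be reproduced with a `reindex`; the frames here are illustrative:

```python
import pandas as pd

df1 = pd.DataFrame({"b": [1, 2], "a": [3, 4]})
df2 = pd.DataFrame({"a": [5], "c": [6]})

# sort=False is now the default, so the non-concatenation axis keeps its order.
out = pd.concat([df1, df2])

# Equivalent of the removed join_axes=[df1.columns]: reindex afterwards.
out = out.reindex(columns=df1.columns)
print(out)
```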
""" @@ -541,14 +534,14 @@ def _get_concat_axis(self): idx = ibase.default_index(len(self.objs)) return idx elif self.keys is None: - names = [None] * len(self.objs) + names: List[Optional[Hashable]] = [None] * len(self.objs) num = 0 has_names = False for i, x in enumerate(self.objs): if not isinstance(x, Series): raise TypeError( - "Cannot concatenate type 'Series' " - "with object of type {type!r}".format(type=type(x).__name__) + f"Cannot concatenate type 'Series' with " + f"object of type '{type(x).__name__}'" ) if x.name is not None: names[i] = x.name @@ -580,7 +573,7 @@ def _get_concat_axis(self): return concat_axis - def _maybe_check_integrity(self, concat_index): + def _maybe_check_integrity(self, concat_index: Index): if self.verify_integrity: if not concat_index.is_unique: overlap = concat_index[concat_index.duplicated()].unique() @@ -590,11 +583,11 @@ def _maybe_check_integrity(self, concat_index): ) -def _concat_indexes(indexes): +def _concat_indexes(indexes) -> Index: return indexes[0].append(indexes[1:]) -def _make_concat_multiindex(indexes, keys, levels=None, names=None): +def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex: if (levels is None and isinstance(keys[0], tuple)) or ( levels is not None and len(levels) > 1 @@ -604,7 +597,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = [None] * len(zipped) if levels is None: - _, levels = _factorize_from_iterables(zipped) + _, levels = factorize_from_iterables(zipped) else: levels = [ensure_index(x) for x in levels] else: @@ -617,7 +610,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): else: levels = [ensure_index(x) for x in levels] - if not _all_indexes_same(indexes): + if not all_indexes_same(indexes): codes_list = [] # things are potentially different sizes, so compute the exact codes @@ -645,7 +638,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): levels.extend(concat_index.levels) codes_list.extend(concat_index.codes) else: - codes, categories = _factorize_from_iterable(concat_index) + codes, categories = factorize_from_iterable(concat_index) levels.append(categories) codes_list.append(codes) @@ -655,12 +648,12 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError( - "Cannot concat indices that do" - " not have the same number of levels" + "Cannot concat indices that do " + "not have the same number of levels" ) # also copies - names = names + _get_consensus_names(indexes) + names = names + get_consensus_names(indexes) return MultiIndex( levels=levels, codes=codes_list, names=names, verify_integrity=False @@ -706,28 +699,3 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): return MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) - - -def _get_series_result_type(result, objs=None): - """ - return appropriate class of Series concat - input is either dict or array-like - """ - # TODO: See if we can just inline with _constructor_expanddim - # now that sparse is removed. 
- from pandas import DataFrame - - # concat Series with axis 1 - if isinstance(result, dict): - return DataFrame - - # otherwise it is a SingleBlockManager (axis = 0) - return objs[0]._constructor - - -def _get_frame_result_type(result, objs): - """ - return appropriate class of DataFrame-like concat - """ - # TODO: just inline this as _constructor. - return objs[0] diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index c85050bc4232b..d4ccb19fc0dda 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -1,16 +1,18 @@ import re +from typing import List import numpy as np -from pandas.util._decorators import Appender +from pandas.util._decorators import Appender, deprecate_kwarg -from pandas.core.dtypes.common import is_extension_type, is_list_like +from pandas.core.dtypes.common import is_extension_array_dtype, is_list_like from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCMultiIndex from pandas.core.dtypes.missing import notna from pandas.core.arrays import Categorical -from pandas.core.frame import _shared_docs +import pandas.core.common as com +from pandas.core.frame import DataFrame, _shared_docs from pandas.core.indexes.base import Index from pandas.core.reshape.concat import concat from pandas.core.tools.numeric import to_numeric @@ -21,13 +23,13 @@ % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") ) def melt( - frame, + frame: DataFrame, id_vars=None, value_vars=None, var_name=None, value_name="value", col_level=None, -): +) -> DataFrame: # TODO: what about the existing index? # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` @@ -35,6 +37,7 @@ def melt( cols = [x for c in frame.columns for x in c] else: cols = list(frame.columns) + if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] @@ -45,11 +48,11 @@ def melt( else: # Check that `id_vars` are in frame id_vars = list(id_vars) - missing = Index(np.ravel(id_vars)).difference(cols) + missing = Index(com.flatten(id_vars)).difference(cols) if not missing.empty: raise KeyError( - "The following 'id_vars' are not present" - " in the DataFrame: {missing}" + "The following 'id_vars' are not present " + "in the DataFrame: {missing}" "".format(missing=list(missing)) ) else: @@ -67,11 +70,11 @@ def melt( else: value_vars = list(value_vars) # Check that `value_vars` are in frame - missing = Index(np.ravel(value_vars)).difference(cols) + missing = Index(com.flatten(value_vars)).difference(cols) if not missing.empty: raise KeyError( - "The following 'value_vars' are not present in" - " the DataFrame: {missing}" + "The following 'value_vars' are not present in " + "the DataFrame: {missing}" "".format(missing=list(missing)) ) frame = frame.loc[:, id_vars + value_vars] @@ -103,7 +106,7 @@ def melt( mdata = {} for col in id_vars: id_data = frame.pop(col) - if is_extension_type(id_data): + if is_extension_array_dtype(id_data): id_data = concat([id_data] * K, ignore_index=True) else: id_data = np.tile(id_data.values, K) @@ -119,7 +122,8 @@ def melt( return frame._constructor(mdata, columns=mcolumns) -def lreshape(data, groups, dropna=True, label=None): +@deprecate_kwarg(old_arg_name="label", new_arg_name=None) +def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame: """ Reshape long-format data to wide. 
Generalized inverse of DataFrame.pivot @@ -188,7 +192,9 @@ def lreshape(data, groups, dropna=True, label=None): return data._constructor(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): +def wide_to_long( + df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+" +) -> DataFrame: r""" Wide panel to long format. Less flexible but more user-friendly than melt. @@ -206,12 +212,12 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): Parameters ---------- df : DataFrame - The wide-format DataFrame + The wide-format DataFrame. stubnames : str or list-like The stub name(s). The wide format variables are assumed to start with the stub names. i : str or list-like - Column(s) to use as id variable(s) + Column(s) to use as id variable(s). j : str The name of the sub-observation variable. What you wish to name your suffix in the long format. @@ -219,14 +225,14 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): A character indicating the separation of the variable names in the wide format, to be stripped from the names in the long format. For example, if your column names are A-suffix1, A-suffix2, you - can strip the hyphen by specifying `sep='-'` + can strip the hyphen by specifying `sep='-'`. suffix : str, default '\\d+' A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the negated character class '\\D+'. You can also further disambiguate suffixes, for example, if your wide variables are of the form A-one, B-two,.., and you have an unrelated column A-rating, you can - ignore the last one by specifying `suffix='(!?one|two)'` + ignore the last one by specifying `suffix='(!?one|two)'`. .. versionchanged:: 0.23.0 When all suffixes are numeric, they are cast to int64/float64. @@ -360,7 +366,7 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): >>> stubnames = sorted( ... set([match[0] for match in df.columns.str.findall( - ... r'[A-B]\(.*\)').values if match != [] ]) + ... r'[A-B]\(.*\)').values if match != []]) ... 
) >>> list(stubnames) ['A(weekly)', 'B(weekly)'] @@ -412,14 +418,14 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): two 2.9 """ - def get_var_names(df, stub, sep, suffix): + def get_var_names(df, stub: str, sep: str, suffix: str) -> List[str]: regex = r"^{stub}{sep}{suffix}$".format( stub=re.escape(stub), sep=re.escape(sep), suffix=suffix ) pattern = re.compile(regex) return [col for col in df.columns if pattern.match(col)] - def melt_stub(df, stub, i, j, value_vars, sep): + def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( df, id_vars=i, @@ -456,8 +462,8 @@ def melt_stub(df, stub, i, j, value_vars, sep): value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] - melted = melted[0].join(melted[1:], how="outer") + _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] + melted = _melted[0].join(_melted[1:], how="outer") if len(i) == 1: new = df[id_vars].set_index(i).join(melted) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7e593ddb91d3a..5f92e4a88b568 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -6,12 +6,14 @@ import datetime from functools import partial import string +from typing import TYPE_CHECKING, Optional, Tuple, Union import warnings import numpy as np -from pandas._libs import hashtable as libhashtable, lib +from pandas._libs import Timedelta, hashtable as libhashtable, lib import pandas._libs.join as libjoin +from pandas._typing import FrameOrSeries from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution @@ -24,7 +26,6 @@ is_bool_dtype, is_categorical_dtype, is_datetime64tz_dtype, - is_datetimelike, is_dtype_equal, is_extension_array_dtype, is_float_dtype, @@ -36,35 +37,39 @@ is_object_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.missing import isnull, na_value_for_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import isna, na_value_for_dtype -from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta +from pandas import Categorical, Index, MultiIndex +from pandas.core import groupby import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com from pandas.core.frame import _merge_doc from pandas.core.internals import _transform_index, concatenate_block_managers -import pandas.core.sorting as sorting from pandas.core.sorting import is_int64_overflow_possible +if TYPE_CHECKING: + from pandas import DataFrame, Series # noqa:F401 + @Substitution("\nleft : DataFrame") @Appender(_merge_doc, indents=0) def merge( left, right, - how="inner", + how: str = "inner", on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, - sort=False, + left_index: bool = False, + right_index: bool = False, + sort: bool = False, suffixes=("_x", "_y"), - copy=True, - indicator=False, + copy: bool = True, + indicator: bool = False, validate=None, -): +) -> "DataFrame": op = _MergeOperation( left, right, @@ -87,7 +92,9 @@ def merge( merge.__doc__ = _merge_doc % "\nleft : DataFrame" -def _groupby_and_merge(by, on, left, right, _merge_pieces, check_duplicates=True): +def _groupby_and_merge( + by, on, left, right: "DataFrame", _merge_pieces, check_duplicates: bool = True +): """ groupby & 
merge; we are always performing a left-by type operation @@ -107,23 +114,27 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, check_duplicates=True by = [by] lby = left.groupby(by, sort=False) + rby: Optional[groupby.DataFrameGroupBy] = None # if we can groupby the rhs # then we can get vastly better perf - try: - - # we will check & remove duplicates if indicated - if check_duplicates: - if on is None: - on = [] - elif not isinstance(on, (list, tuple)): - on = [on] - if right.duplicated(by + on).any(): - right = right.drop_duplicates(by + on, keep="last") + # we will check & remove duplicates if indicated + if check_duplicates: + if on is None: + on = [] + elif not isinstance(on, (list, tuple)): + on = [on] + + if right.duplicated(by + on).any(): + _right = right.drop_duplicates(by + on, keep="last") + # TODO: use overload to refine return type of drop_duplicates + assert _right is not None # needed for mypy + right = _right + try: rby = right.groupby(by, sort=False) except KeyError: - rby = None + pass for key, lhs in lby: @@ -173,12 +184,13 @@ def merge_ordered( right_by=None, fill_method=None, suffixes=("_x", "_y"), - how="outer", -): + how: str = "outer", +) -> "DataFrame": """ - Perform merge with optional filling/interpolation designed for ordered - data like time series data. Optionally perform group-wise merge (see - examples). + Perform merge with optional filling/interpolation. + + Designed for ordered data like time series data. Optionally + perform group-wise merge (see examples). Parameters ---------- @@ -189,18 +201,18 @@ def merge_ordered( left_on : label or list, or array-like Field names to join on in left DataFrame. Can be a vector or list of vectors of the length of the DataFrame to use a particular vector as - the join key instead of columns + the join key instead of columns. right_on : label or list, or array-like Field names to join on in right DataFrame or vector/list of vectors per - left_on docs + left_on docs. left_by : column name or list of column names Group left DataFrame by group columns and merge piece by piece with - right DataFrame + right DataFrame. right_by : column name or list of column names Group right DataFrame by group columns and merge piece by piece with - left DataFrame + left DataFrame. fill_method : {'ffill', None}, default None - Interpolation method for data + Interpolation method for data. suffixes : Sequence, default is ("_x", "_y") A length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in @@ -214,13 +226,13 @@ def merge_ordered( * left: use only keys from left frame (SQL: left outer join) * right: use only keys from right frame (SQL: right outer join) * outer: use union of keys from both frames (SQL: full outer join) - * inner: use intersection of keys from both frames (SQL: inner join) + * inner: use intersection of keys from both frames (SQL: inner join). Returns ------- - merged : DataFrame - The output type will the be same as 'left', if it is a subclass - of DataFrame. + DataFrame + The merged DataFrame output type will the be same as + 'left', if it is a subclass of DataFrame. 
See Also -------- @@ -229,15 +241,21 @@ def merge_ordered( Examples -------- - >>> A >>> B - key lvalue group key rvalue - 0 a 1 a 0 b 1 - 1 c 2 a 1 c 2 - 2 e 3 a 2 d 3 + >>> A + key lvalue group + 0 a 1 a + 1 c 2 a + 2 e 3 a 3 a 1 b 4 c 2 b 5 e 3 b + >>> B + Key rvalue + 0 b 1 + 1 c 2 + 2 d 3 + >>> merge_ordered(A, B, fill_method='ffill', left_by='group') group key lvalue rvalue 0 a a 1 NaN @@ -292,16 +310,16 @@ def merge_asof( on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, + left_index: bool = False, + right_index: bool = False, by=None, left_by=None, right_by=None, suffixes=("_x", "_y"), tolerance=None, - allow_exact_matches=True, - direction="backward", -): + allow_exact_matches: bool = True, + direction: str = "backward", +) -> "DataFrame": """ Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. @@ -359,7 +377,7 @@ def merge_asof( - If True, allow matching with the same 'on' value (i.e. less-than-or-equal-to / greater-than-or-equal-to) - If False, don't match the same 'on' value - (i.e., strictly less-than / strictly greater-than) + (i.e., strictly less-than / strictly greater-than). direction : 'backward' (default), 'forward', or 'nearest' Whether to search for prior, subsequent, or closest matches. @@ -527,33 +545,33 @@ def merge_asof( # TODO: only copy DataFrames when modification necessary class _MergeOperation: """ - Perform a database (SQL) merge operation between two DataFrame objects - using either columns as keys or their row indexes + Perform a database (SQL) merge operation between two DataFrame or Series + objects using either columns as keys or their row indexes """ _merge_type = "merge" def __init__( self, - left, - right, - how="inner", + left: Union["Series", "DataFrame"], + right: Union["Series", "DataFrame"], + how: str = "inner", on=None, left_on=None, right_on=None, axis=1, - left_index=False, - right_index=False, - sort=True, + left_index: bool = False, + right_index: bool = False, + sort: bool = True, suffixes=("_x", "_y"), - copy=True, - indicator=False, + copy: bool = True, + indicator: bool = False, validate=None, ): - left = validate_operand(left) - right = validate_operand(right) - self.left = self.orig_left = left - self.right = self.orig_right = right + _left = _validate_operand(left) + _right = _validate_operand(right) + self.left = self.orig_left = _left + self.right = self.orig_right = _right self.how = how self.axis = axis @@ -570,6 +588,7 @@ def __init__( self.indicator = indicator + self.indicator_name: Optional[str] if isinstance(self.indicator, str): self.indicator_name = self.indicator elif isinstance(self.indicator, bool): @@ -591,11 +610,11 @@ def __init__( ) # warn user when merging between different levels - if left.columns.nlevels != right.columns.nlevels: + if _left.columns.nlevels != _right.columns.nlevels: msg = ( "merging between different levels can give an unintended " "result ({left} levels on the left, {right} on the right)" - ).format(left=left.columns.nlevels, right=right.columns.nlevels) + ).format(left=_left.columns.nlevels, right=_right.columns.nlevels) warnings.warn(msg, UserWarning) self._validate_specification() @@ -652,7 +671,9 @@ def get_result(self): return result - def _indicator_pre_merge(self, left, right): + def _indicator_pre_merge( + self, left: "DataFrame", right: "DataFrame" + ) -> Tuple["DataFrame", "DataFrame"]: columns = left.columns.union(right.columns) @@ -872,7 +893,12 @@ def _get_join_info(self): return join_index, 
left_indexer, right_indexer def _create_join_index( - self, index, other_index, indexer, other_indexer, how="left" + self, + index: Index, + other_index: Index, + indexer, + other_indexer, + how: str = "left", ): """ Create a join index by rearranging one index to match another @@ -1006,7 +1032,7 @@ def _get_merge_keys(self): ) ] else: - left_keys = [self.left.index.values] + left_keys = [self.left.index._values] if left_drop: self.left = self.left._drop_labels_or_levels(left_drop) @@ -1120,9 +1146,9 @@ def _maybe_coerce_merge_keys(self): raise ValueError(msg) # datetimelikes must match exactly - elif is_datetimelike(lk) and not is_datetimelike(rk): + elif needs_i8_conversion(lk) and not needs_i8_conversion(rk): raise ValueError(msg) - elif not is_datetimelike(lk) and is_datetimelike(rk): + elif not needs_i8_conversion(lk) and needs_i8_conversion(rk): raise ValueError(msg) elif is_datetime64tz_dtype(lk) and not is_datetime64tz_dtype(rk): raise ValueError(msg) @@ -1173,9 +1199,7 @@ def _validate_specification(self): ) ) if not common_cols.is_unique: - raise MergeError( - "Data columns not unique: {common!r}".format(common=common_cols) - ) + raise MergeError(f"Data columns not unique: {repr(common_cols)}") self.left_on = self.right_on = common_cols elif self.on is not None: if self.left_on is not None or self.right_on is not None: @@ -1205,7 +1229,7 @@ def _validate_specification(self): if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") - def _validate(self, validate): + def _validate(self, validate: str): # Check uniqueness of each if self.left_index: @@ -1222,32 +1246,32 @@ def _validate(self, validate): if validate in ["one_to_one", "1:1"]: if not left_unique and not right_unique: raise MergeError( - "Merge keys are not unique in either left" - " or right dataset; not a one-to-one merge" + "Merge keys are not unique in either left " + "or right dataset; not a one-to-one merge" ) elif not left_unique: raise MergeError( - "Merge keys are not unique in left dataset;" - " not a one-to-one merge" + "Merge keys are not unique in left dataset; " + "not a one-to-one merge" ) elif not right_unique: raise MergeError( - "Merge keys are not unique in right dataset;" - " not a one-to-one merge" + "Merge keys are not unique in right dataset; " + "not a one-to-one merge" ) elif validate in ["one_to_many", "1:m"]: if not left_unique: raise MergeError( - "Merge keys are not unique in left dataset;" - " not a one-to-many merge" + "Merge keys are not unique in left dataset; " + "not a one-to-many merge" ) elif validate in ["many_to_one", "m:1"]: if not right_unique: raise MergeError( - "Merge keys are not unique in right dataset;" - " not a many-to-one merge" + "Merge keys are not unique in right dataset; " + "not a many-to-one merge" ) elif validate in ["many_to_many", "m:m"]: @@ -1257,7 +1281,9 @@ def _validate(self, validate): raise ValueError("Not a valid argument for validate") -def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs): +def _get_join_indexers( + left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs +): """ Parameters @@ -1277,11 +1303,13 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs) right_keys ), "left_key and right_keys must be the same length" - # bind `sort` arg. of _factorize_keys - fkeys = partial(_factorize_keys, sort=sort) - # get left & right join labels and num. 
of levels at each location - llab, rlab, shape = map(list, zip(*map(fkeys, left_keys, right_keys))) + mapped = ( + _factorize_keys(left_keys[n], right_keys[n], sort=sort) + for n in range(len(left_keys)) + ) + zipped = zip(*mapped) + llab, rlab, shape = [list(x) for x in zipped] # get flat i8 keys from label lists lkey, rkey = _get_join_keys(llab, rlab, shape, sort) @@ -1289,7 +1317,7 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs) # factorize keys to a dense i8 space # `count` is the num. of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = fkeys(lkey, rkey) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) @@ -1301,7 +1329,12 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs) def _restore_dropped_levels_multijoin( - left, right, dropped_level_names, join_index, lindexer, rindexer + left: MultiIndex, + right: MultiIndex, + dropped_level_names, + join_index, + lindexer, + rindexer, ): """ *this is an internal non-public method* @@ -1339,7 +1372,7 @@ def _restore_dropped_levels_multijoin( """ - def _convert_to_mulitindex(index): + def _convert_to_mulitindex(index) -> MultiIndex: if isinstance(index, MultiIndex): return index else: @@ -1399,13 +1432,13 @@ def __init__( on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, + left_index: bool = False, + right_index: bool = False, axis=1, suffixes=("_x", "_y"), - copy=True, + copy: bool = True, fill_method=None, - how="outer", + how: str = "outer", ): self.fill_method = fill_method @@ -1460,12 +1493,12 @@ def get_result(self): return result -def _asof_function(direction): +def _asof_function(direction: str): name = "asof_join_{dir}".format(dir=direction) return getattr(libjoin, name, None) -def _asof_by_function(direction): +def _asof_by_function(direction: str): name = "asof_join_{dir}_on_X_by_Y".format(dir=direction) return getattr(libjoin, name, None) @@ -1497,19 +1530,19 @@ def __init__( on=None, left_on=None, right_on=None, - left_index=False, - right_index=False, + left_index: bool = False, + right_index: bool = False, by=None, left_by=None, right_by=None, axis=1, suffixes=("_x", "_y"), - copy=True, + copy: bool = True, fill_method=None, - how="asof", + how: str = "asof", tolerance=None, - allow_exact_matches=True, - direction="backward", + allow_exact_matches: bool = True, + direction: str = "backward", ): self.by = by @@ -1632,7 +1665,7 @@ def _get_merge_keys(self): ) ) - if is_datetimelike(lt): + if needs_i8_conversion(lt): if not isinstance(self.tolerance, datetime.timedelta): raise MergeError(msg) if self.tolerance < Timedelta(0): @@ -1687,13 +1720,13 @@ def flip(xs): msg_missings = "Merge keys contain null values on {side} side" if not Index(left_values).is_monotonic: - if isnull(left_values).any(): + if isna(left_values).any(): raise ValueError(msg_missings.format(side="left")) else: raise ValueError(msg_sorted.format(side="left")) if not Index(right_values).is_monotonic: - if isnull(right_values).any(): + if isna(right_values).any(): raise ValueError(msg_missings.format(side="right")) else: raise ValueError(msg_sorted.format(side="right")) @@ -1746,13 +1779,15 @@ def flip(xs): return func(left_values, right_values, self.allow_exact_matches, tolerance) -def _get_multiindex_indexer(join_keys, index, sort): - - # bind `sort` argument - fkeys = partial(_factorize_keys, sort=sort) +def _get_multiindex_indexer(join_keys, 
index: MultiIndex, sort: bool): # left & right join labels and num. of levels at each location - rcodes, lcodes, shape = map(list, zip(*map(fkeys, index.levels, join_keys))) + mapped = ( + _factorize_keys(index.levels[n], join_keys[n], sort=sort) + for n in range(index.nlevels) + ) + zipped = zip(*mapped) + rcodes, lcodes, shape = [list(x) for x in zipped] if sort: rcodes = list(map(np.take, rcodes, index.codes)) else: @@ -1775,12 +1810,12 @@ def _get_multiindex_indexer(join_keys, index, sort): lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort) # factorize keys to a dense i8 space - lkey, rkey, count = fkeys(lkey, rkey) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) return libjoin.left_outer_join(lkey, rkey, count, sort=sort) -def _get_single_indexer(join_key, index, sort=False): +def _get_single_indexer(join_key, index, sort: bool = False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = libjoin.left_outer_join( @@ -1790,7 +1825,7 @@ def _get_single_indexer(join_key, index, sort=False): return left_indexer, right_indexer -def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): +def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): if len(join_keys) > 1: if not ( (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) @@ -1904,7 +1939,7 @@ def _factorize_keys(lk, rk, sort=True): return llab, rlab, count -def _sort_labels(uniques, left, right): +def _sort_labels(uniques: np.ndarray, left, right): if not isinstance(uniques, np.ndarray): # tuplesafe uniques = Index(uniques).values @@ -1912,14 +1947,14 @@ def _sort_labels(uniques, left, right): llength = len(left) labels = np.concatenate([left, right]) - _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1) + _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) new_labels = ensure_int64(new_labels) new_left, new_right = new_labels[:llength], new_labels[llength:] return new_left, new_right -def _get_join_keys(llab, rlab, shape, sort): +def _get_join_keys(llab, rlab, shape, sort: bool): # how many levels can be done without overflow pred = lambda i: not is_int64_overflow_possible(shape[:i]) @@ -1949,20 +1984,20 @@ def _get_join_keys(llab, rlab, shape, sort): return _get_join_keys(llab, rlab, shape, sort) -def _should_fill(lname, rname): +def _should_fill(lname, rname) -> bool: if not isinstance(lname, str) or not isinstance(rname, str): return True return lname == rname -def _any(x): +def _any(x) -> bool: return x is not None and com.any_not_none(*x) -def validate_operand(obj): - if isinstance(obj, DataFrame): +def _validate_operand(obj: FrameOrSeries) -> "DataFrame": + if isinstance(obj, ABCDataFrame): return obj - elif isinstance(obj, Series): + elif isinstance(obj, ABCSeries): if obj.name is None: raise ValueError("Cannot merge a Series without a name") else: @@ -1974,7 +2009,7 @@ def validate_operand(obj): ) -def _items_overlap_with_suffix(left, lsuffix, right, rsuffix): +def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): """ If two indices overlap, add suffixes to overlapping entries. 
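The reworked `_validate` messages above are easiest to see through the public `merge` API; a minimal sketch with illustrative data:

```python
import pandas as pd

left = pd.DataFrame({"key": [1, 1], "x": [10, 20]})
right = pd.DataFrame({"key": [1, 2], "y": [30, 40]})

try:
    pd.merge(left, right, on="key", validate="one_to_one")
except pd.errors.MergeError as err:
    # "Merge keys are not unique in left dataset; not a one-to-one merge"
    print(err)
```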
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index d653dd87308cf..b443ba142369c 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING, Callable, Dict, List, Tuple, Union + import numpy as np from pandas.util._decorators import Appender, Substitution @@ -9,11 +11,14 @@ import pandas.core.common as com from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper -from pandas.core.index import Index, MultiIndex, _get_objs_combined_axis +from pandas.core.indexes.api import Index, MultiIndex, get_objs_combined_axis from pandas.core.reshape.concat import concat from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series +if TYPE_CHECKING: + from pandas import DataFrame + # Note: We need to make sure `frame` is imported before `pivot`, otherwise # _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency @@ -30,12 +35,12 @@ def pivot_table( dropna=True, margins_name="All", observed=False, -): +) -> "DataFrame": index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): - pieces = [] + pieces: List[DataFrame] = [] keys = [] for func in aggfunc: table = pivot_table( @@ -143,7 +148,7 @@ def pivot_table( table = table.sort_index(axis=1) if fill_value is not None: - table = table.fillna(value=fill_value, downcast="infer") + table = table._ensure_type(table.fillna(fill_value, downcast="infer")) if margins: if dropna: @@ -180,14 +185,14 @@ def pivot_table( def _add_margins( - table, + table: Union["Series", "DataFrame"], data, values, rows, cols, aggfunc, observed=None, - margins_name="All", + margins_name: str = "All", fill_value=None, ): if not isinstance(margins_name, str): @@ -200,12 +205,13 @@ def _add_margins( grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) - # could be passed a Series object with no 'columns' - if hasattr(table, "columns"): + if table.ndim == 2: + # i.e. DataFramae for level in table.columns.names[1:]: if margins_name in table.columns.get_level_values(level): raise ValueError(msg) + key: Union[str, Tuple[str, ...]] if len(rows) > 1: key = (margins_name,) + ("",) * (len(rows) - 1) else: @@ -216,7 +222,7 @@ def _add_margins( # one column in the data. Compute grand margin and return it. 
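Later in this hunk, `crosstab` only computes a combined index from the pandas objects among its inputs; plain sequences keep working, as in this illustrative sketch (not part of the patch):

```python
import pandas as pd

a = ["x", "x", "y", "y"]
b = [1, 2, 1, 1]

# Two plain lists: no Series/DataFrame inputs, so no combined index is needed.
print(pd.crosstab(a, b, rownames=["a"], colnames=["b"]))
```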
return table.append(Series({key: grand_margin[margins_name]})) - if values: + elif values: marginal_result_set = _generate_marginal_results( table, data, @@ -232,12 +238,15 @@ def _add_margins( return marginal_result_set result, margin_keys, row_margin = marginal_result_set else: + # no values, and table is a DataFrame + assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( table, data, rows, cols, aggfunc, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set + row_margin = row_margin.reindex(result.columns, fill_value=fill_value) # populate grand margin for k in margin_keys: @@ -252,9 +261,12 @@ def _add_margins( row_names = result.index.names try: + # check the result column and leave floats for dtype in set(result.dtypes): cols = result.select_dtypes([dtype]).columns - margin_dummy[cols] = margin_dummy[cols].astype(dtype) + margin_dummy[cols] = margin_dummy[cols].apply( + maybe_downcast_to_dtype, args=(dtype,) + ) result = result.append(margin_dummy) except TypeError: @@ -266,7 +278,7 @@ def _add_margins( return result -def _compute_grand_margin(data, values, aggfunc, margins_name="All"): +def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): if values: grand_margin = {} @@ -289,7 +301,15 @@ def _compute_grand_margin(data, values, aggfunc, margins_name="All"): def _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, grand_margin, margins_name="All" + table, + data, + values, + rows, + cols, + aggfunc, + observed, + grand_margin, + margins_name: str = "All", ): if len(cols) > 0: # need to "interleave" the margins @@ -353,7 +373,7 @@ def _all_key(key): def _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, observed, margins_name="All" + table: "DataFrame", data, rows, cols, aggfunc, observed, margins_name: str = "All" ): if len(cols) > 0: # need to "interleave" the margins @@ -406,7 +426,7 @@ def _convert_by(by): @Substitution("\ndata : DataFrame") @Appender(_shared_docs["pivot"], indents=1) -def pivot(data, index=None, columns=None, values=None): +def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame": if values is None: cols = [columns] if index is None else [index, columns] append = index is None @@ -436,10 +456,10 @@ def crosstab( colnames=None, aggfunc=None, margins=False, - margins_name="All", - dropna=True, + margins_name: str = "All", + dropna: bool = True, normalize=False, -): +) -> "DataFrame": """ Compute a simple cross tabulation of two (or more) factors. 
By default computes a frequency table of the factors unless an array of values and an @@ -541,9 +561,12 @@ def crosstab( rownames = _get_names(index, rownames, prefix="row") colnames = _get_names(columns, colnames, prefix="col") - common_idx = _get_objs_combined_axis(index + columns, intersect=True, sort=False) + common_idx = None + pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] + if pass_objs: + common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) - data = {} + data: Dict = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) @@ -570,7 +593,7 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, - **kwargs + **kwargs, ) # Post-process @@ -582,7 +605,7 @@ def crosstab( return table -def _normalize(table, normalize, margins, margins_name="All"): +def _normalize(table, normalize, margins: bool, margins_name="All"): if not isinstance(normalize, (bool, str)): axis_subs = {0: "index", 1: "columns"} @@ -594,7 +617,7 @@ def _normalize(table, normalize, margins, margins_name="All"): if margins is False: # Actual Normalizations - normalizers = { + normalizers: Dict[Union[bool, str], Callable] = { "all": lambda x: x / x.sum(axis=1).sum(axis=0), "columns": lambda x: x / x.sum(), "index": lambda x: x.div(x.sum(axis=1), axis=0), @@ -620,7 +643,9 @@ def _normalize(table, normalize, margins, margins_name="All"): if (margins_name not in table.iloc[-1, :].name) | ( margins_name != table.iloc[:, -1].name ): - raise ValueError("{} not in pivoted DataFrame".format(margins_name)) + raise ValueError( + "{mname} not in pivoted DataFrame".format(mname=margins_name) + ) column_margin = table.iloc[:-1, -1] index_margin = table.iloc[-1, :-1] @@ -663,7 +688,7 @@ def _normalize(table, normalize, margins, margins_name="All"): return table -def _get_names(arrs, names, prefix="row"): +def _get_names(arrs, names, prefix: str = "row"): if names is None: names = [] for i, arr in enumerate(arrs): diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index ad7081fb17703..97f416e32d07b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,5 +1,6 @@ from functools import partial import itertools +from typing import List import numpy as np @@ -22,10 +23,10 @@ import pandas.core.algorithms as algos from pandas.core.arrays import SparseArray -from pandas.core.arrays.categorical import _factorize_from_iterable +from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import extract_array from pandas.core.frame import DataFrame -from pandas.core.index import Index, MultiIndex +from pandas.core.indexes.api import Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -88,7 +89,7 @@ class _Unstacker: def __init__( self, - values, + values: np.ndarray, index, level=-1, value_columns=None, @@ -230,11 +231,9 @@ def get_new_values(self): if needs_i8_conversion(values): sorted_values = sorted_values.view("i8") new_values = new_values.view("i8") - name = "int64" elif is_bool_dtype(values): sorted_values = sorted_values.astype("object") new_values = new_values.astype("object") - name = "object" else: sorted_values = sorted_values.astype(name, copy=False) @@ -359,7 +358,7 @@ def _unstack_multiple(data, clocs, fill_value=None): result = data for i in range(len(clocs)): val = clocs[i] - result = result.unstack(val) + result = result.unstack(val, fill_value=fill_value) clocs = [v if i > v 
else v - 1 for v in clocs] return result @@ -504,7 +503,7 @@ def stack(frame, level=-1, dropna=True): def factorize(index): if index.is_unique: return index, np.arange(len(index)) - codes, categories = _factorize_from_iterable(index) + codes, categories = factorize_from_iterable(index) return categories, codes N, K = frame.shape @@ -725,7 +724,7 @@ def _convert_level_number(level_num, columns): new_names = list(this.index.names) new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: - old_codes, old_levels = _factorize_from_iterable(this.index) + old_codes, old_levels = factorize_from_iterable(this.index) new_levels = [old_levels] new_codes = [old_codes.repeat(levsize)] new_names = [this.index.name] # something better? @@ -757,7 +756,7 @@ def get_dummies( sparse=False, drop_first=False, dtype=None, -): +) -> "DataFrame": """ Convert categorical variable into dummy/indicator variables. @@ -901,7 +900,7 @@ def check_len(item, name): if data_to_encode.shape == data.shape: # Encoding the entire df, do not prepend any dropped columns - with_dummies = [] + with_dummies: List[DataFrame] = [] elif columns is not None: # Encoding only cols specified in columns. Get all cols not in # columns to prepend to result. @@ -949,7 +948,7 @@ def _get_dummies_1d( from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling - codes, levels = _factorize_from_iterable(Series(data)) + codes, levels = factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 @@ -958,7 +957,7 @@ def _get_dummies_1d( if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") - def get_empty_frame(data): + def get_empty_frame(data) -> DataFrame: if isinstance(data, Series): index = data.index else: @@ -985,7 +984,7 @@ def get_empty_frame(data): else: # PY2 embedded unicode, gh-22084 - def _make_col_name(prefix, prefix_sep, level): + def _make_col_name(prefix, prefix_sep, level) -> str: fstr = "{prefix}{prefix_sep}{level}" return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) @@ -1046,43 +1045,7 @@ def _make_col_name(prefix, prefix_sep, level): return DataFrame(dummy_mat, index=index, columns=dummy_cols) -def make_axis_dummies(frame, axis="minor", transform=None): - """ - Construct 1-0 dummy variables corresponding to designated axis - labels - - Parameters - ---------- - frame : DataFrame - axis : {'major', 'minor'}, default 'minor' - transform : function, default None - Function to apply to axis labels first. For example, to - get "day of week" dummies in a time series regression - you might call:: - - make_axis_dummies(panel, axis='major', - transform=lambda d: d.weekday()) - Returns - ------- - dummies : DataFrame - Column names taken from chosen axis - """ - numbers = {"major": 0, "minor": 1} - num = numbers.get(axis, axis) - - items = frame.index.levels[num] - codes = frame.index.codes[num] - if transform is not None: - mapped_items = items.map(transform) - codes, items = _factorize_from_iterable(mapped_items.take(codes)) - - values = np.eye(len(items), dtype=float) - values = values.take(codes, axis=0) - - return DataFrame(values, columns=items, index=frame.index) - - -def _reorder_for_extension_array_stack(arr, n_rows, n_columns): +def _reorder_for_extension_array_stack(arr, n_rows: int, n_columns: int): """ Re-orders the values when stacking multiple extension-arrays. 
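A hedged sketch of the one-line `_unstack_multiple` fix above (forwarding `fill_value` to each intermediate unstack) on the MultiIndex-columns path; the frame is illustrative:

```python
import numpy as np
import pandas as pd

index = pd.MultiIndex.from_tuples(
    [("a", 1, "p"), ("a", 2, "q"), ("b", 1, "p")], names=["i", "j", "k"]
)
columns = pd.MultiIndex.from_tuples([("x", "u"), ("y", "v")])
df = pd.DataFrame(np.arange(6).reshape(3, 2), index=index, columns=columns)

# With fill_value forwarded at every step, (j, k) combinations that are
# missing for "b" come back as 0 instead of NaN.
print(df.unstack(["j", "k"], fill_value=0))
```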
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 2cc9f8927effb..2e3eb9170b15c 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -1,8 +1,6 @@ """ Quantilization functions and related stuff """ -from functools import partial - import numpy as np from pandas._libs import Timedelta, Timestamp @@ -17,20 +15,14 @@ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer, + is_list_like, is_scalar, is_timedelta64_dtype, ) +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna -from pandas import ( - Categorical, - Index, - Interval, - IntervalIndex, - Series, - to_datetime, - to_timedelta, -) +from pandas import Categorical, Index, IntervalIndex, to_datetime, to_timedelta import pandas.core.algorithms as algos import pandas.core.nanops as nanops @@ -38,12 +30,12 @@ def cut( x, bins, - right=True, + right: bool = True, labels=None, - retbins=False, - precision=3, - include_lowest=False, - duplicates="raise", + retbins: bool = False, + precision: int = 3, + include_lowest: bool = False, + duplicates: str = "raise", ): """ Bin values into discrete intervals. @@ -74,11 +66,12 @@ def cut( ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` indicate (1,2], (2,3], (3,4]. This argument is ignored when `bins` is an IntervalIndex. - labels : array or bool, optional + labels : array or False, default None Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the bins. This affects the type of the output container (see below). - This argument is ignored when `bins` is an IntervalIndex. + This argument is ignored when `bins` is an IntervalIndex. If True, + raises an error. retbins : bool, default False Whether to return the bins or not. Useful when bins is provided as a scalar. @@ -208,7 +201,8 @@ def cut( # NOTE: this binning code is changed a bit from histogram for var(x) == 0 # for handling the cut for datetime and timedelta objects - x_is_series, series_index, name, x = _preprocess_for_cut(x) + original = x + x = _preprocess_for_cut(x) x, dtype = _coerce_to_type(x) if not np.iterable(bins): @@ -270,33 +264,39 @@ def cut( duplicates=duplicates, ) - return _postprocess_for_cut( - fac, bins, retbins, x_is_series, series_index, name, dtype - ) + return _postprocess_for_cut(fac, bins, retbins, dtype, original) -def qcut(x, q, labels=None, retbins=False, precision=3, duplicates="raise"): +def qcut( + x, + q, + labels=None, + retbins: bool = False, + precision: int = 3, + duplicates: str = "raise", +): """ - Quantile-based discretization function. Discretize variable into - equal-sized buckets based on rank or based on sample quantiles. For example - 1000 values for 10 quantiles would produce a Categorical object indicating - quantile membership for each data point. + Quantile-based discretization function. + + Discretize variable into equal-sized buckets based on rank or based + on sample quantiles. For example 1000 values for 10 quantiles would + produce a Categorical object indicating quantile membership for each data point. Parameters ---------- x : 1d ndarray or Series q : int or list-like of int Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately - array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles - labels : array or bool, default None + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. + labels : array or False, default None Used as labels for the resulting bins. 
Must be of the same length as the resulting bins. If False, return only integer indicators of the - bins. + bins. If True, raises an error. retbins : bool, optional Whether to return the (bins, labels) or not. Can be useful if bins is given as a scalar. precision : int, optional - The precision at which to store and display the bins labels + The precision at which to store and display the bins labels. duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. @@ -328,8 +328,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates="raise"): >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3]) """ - x_is_series, series_index, name, x = _preprocess_for_cut(x) - + original = x + x = _preprocess_for_cut(x) x, dtype = _coerce_to_type(x) if is_integer(q): @@ -347,20 +347,18 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates="raise"): duplicates=duplicates, ) - return _postprocess_for_cut( - fac, bins, retbins, x_is_series, series_index, name, dtype - ) + return _postprocess_for_cut(fac, bins, retbins, dtype, original) def _bins_to_cuts( x, bins, - right=True, + right: bool = True, labels=None, - precision=3, - include_lowest=False, + precision: int = 3, + include_lowest: bool = False, dtype=None, - duplicates="raise", + duplicates: str = "raise", ): if duplicates not in ["raise", "drop"]: @@ -379,9 +377,8 @@ def _bins_to_cuts( if len(unique_bins) < len(bins) and len(bins) != 2: if duplicates == "raise": raise ValueError( - "Bin edges must be unique: {bins!r}.\nYou " - "can drop duplicate edges by setting " - "the 'duplicates' kwarg".format(bins=bins) + f"Bin edges must be unique: {repr(bins)}.\n" + f"You can drop duplicate edges by setting the 'duplicates' kwarg" ) else: bins = unique_bins @@ -396,15 +393,23 @@ def _bins_to_cuts( has_nas = na_mask.any() if labels is not False: - if labels is None: + if not (labels is None or is_list_like(labels)): + raise ValueError( + "Bin labels must either be False, None or passed in as a " + "list-like argument" + ) + + elif labels is None: labels = _format_labels( bins, precision, right=right, include_lowest=include_lowest, dtype=dtype ) + else: if len(labels) != len(bins) - 1: raise ValueError( "Bin labels must be one fewer than the number of bin edges" ) + if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) @@ -498,13 +503,15 @@ def _convert_bin_to_datelike_type(bins, dtype): return bins -def _format_labels(bins, precision, right=True, include_lowest=False, dtype=None): +def _format_labels( + bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None +): """ based on the dtype, return our labels """ closed = "right" if right else "left" if is_datetime64tz_dtype(dtype): - formatter = partial(Timestamp, tz=dtype.tz) + formatter = lambda x: Timestamp(x, tz=dtype.tz) adjust = lambda x: x - Timedelta("1ns") elif is_datetime64_dtype(dtype): formatter = Timestamp @@ -518,17 +525,11 @@ def _format_labels(bins, precision, right=True, include_lowest=False, dtype=None adjust = lambda x: x - 10 ** (-precision) breaks = [formatter(b) for b in bins] - labels = IntervalIndex.from_breaks(breaks, closed=closed) - if right and include_lowest: - # we will adjust the left hand side by precision to - # account that we are all right closed - v = adjust(labels[0].left) + # adjust lhs of first interval by precision to account for being right closed + breaks[0] = adjust(breaks[0]) - i = IntervalIndex([Interval(v, 
labels[0].right, closed="right")]) - labels = i.append(labels[1:]) - - return labels + return IntervalIndex.from_breaks(breaks, closed=closed) def _preprocess_for_cut(x): @@ -537,13 +538,6 @@ def _preprocess_for_cut(x): input to array, strip the index information and store it separately """ - x_is_series = isinstance(x, Series) - series_index = None - name = None - - if x_is_series: - series_index = x.index - name = x.name # Check that the passed array is a Pandas or Numpy object # We don't want to strip away a Pandas data-type here (e.g. datetimetz) @@ -553,17 +547,17 @@ def _preprocess_for_cut(x): if x.ndim != 1: raise ValueError("Input array must be 1 dimensional") - return x_is_series, series_index, name, x + return x -def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name, dtype): +def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original): """ handles post processing for the cut method where we combine the index information if the originally passed datatype was a series """ - if x_is_series: - fac = Series(fac, index=series_index, name=name) + if isinstance(original, ABCSeries): + fac = original._constructor(fac, index=original.index, name=original.name) if not retbins: return fac @@ -573,7 +567,7 @@ def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name, dt return fac, bins -def _round_frac(x, precision): +def _round_frac(x, precision: int): """ Round the fractional part of the given number """ @@ -588,7 +582,7 @@ def _round_frac(x, precision): return np.around(x, digits) -def _infer_precision(base_precision, bins): +def _infer_precision(base_precision: int, bins) -> int: """Infer an appropriate precision for _round_frac """ for precision in range(base_precision, 20): diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index 044e058904dce..d8652c9b4fac9 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -2,7 +2,7 @@ from pandas.core.dtypes.common import is_list_like -from pandas.core import common as com +import pandas.core.common as com def cartesian_product(X): diff --git a/pandas/core/series.py b/pandas/core/series.py index 3e9d3d5c04559..3e1f011fde51a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1,11 +1,10 @@ """ Data structure for 1-dimensional cross-sectional and time series data """ -from collections import OrderedDict from io import StringIO from shutil import get_terminal_size from textwrap import dedent -from typing import Any, Callable, Hashable, List, Optional +from typing import IO, Any, Callable, Hashable, List, Optional import warnings import numpy as np @@ -13,33 +12,27 @@ from pandas._config import get_option from pandas._libs import index as libindex, lib, reshape, tslibs -from pandas.compat import PY36 from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, deprecate +from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile from pandas.core.dtypes.common import ( _is_unorderable_exception, ensure_platform_int, is_bool, - is_categorical, is_categorical_dtype, is_datetime64_dtype, - is_datetimelike, is_dict_like, is_extension_array_dtype, - is_extension_type, is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, - is_string_like, is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeArray, ABCDatetimeIndex, ABCSeries, ABCSparseArray, @@ -55,20 +48,26 @@ import pandas 
as pd from pandas.core import algorithms, base, generic, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ExtensionArray, try_cast_to_ea from pandas.core.arrays.categorical import Categorical, CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com -from pandas.core.construction import extract_array, sanitize_array -from pandas.core.index import ( +from pandas.core.construction import ( + create_series_with_explicit_dtype, + extract_array, + is_empty_data, + sanitize_array, +) +from pandas.core.groupby import generic as groupby_generic +from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexes.accessors import CombinedDatetimelikeProperties +from pandas.core.indexes.api import ( Float64Index, Index, InvalidIndexError, MultiIndex, ensure_index, ) -from pandas.core.indexers import maybe_convert_indices -from pandas.core.indexes.accessors import CombinedDatetimelikeProperties import pandas.core.indexes.base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex @@ -101,23 +100,6 @@ ) -# see gh-16971 -def remove_na(arr): - """ - Remove null values from array like structure. - - .. deprecated:: 0.21.0 - Use s[s.notnull()] instead. - """ - - warnings.warn( - "remove_na is deprecated and is a private function. Do not use.", - FutureWarning, - stacklevel=2, - ) - return remove_na_arraylike(arr) - - def _coerce_method(converter): """ Install the scalar coercion methods. @@ -126,9 +108,9 @@ def _coerce_method(converter): def wrapper(self): if len(self) == 1: return converter(self.iloc[0]) - raise TypeError("cannot convert the series to {0}".format(str(converter))) + raise TypeError(f"cannot convert the series to {converter}") - wrapper.__name__ = "__{name}__".format(name=converter.__name__) + wrapper.__name__ = f"__{converter.__name__}__" return wrapper @@ -169,35 +151,30 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Data type for the output Series. If not specified, this will be inferred from `data`. See the :ref:`user guide ` for more usages. + name : str, optional + The name to give to the Series. copy : bool, default False Copy input data. """ - _metadata = [] # type: List[str] + _typ = "series" + + _name: Optional[Hashable] + _metadata: List[str] = ["name"] _accessors = {"dt", "cat", "str", "sparse"} _deprecations = ( base.IndexOpsMixin._deprecations | generic.NDFrame._deprecations - | frozenset( - [ - "asobject", - "compress", - "valid", - "ftype", - "real", - "imag", - "put", - "ptp", - "nonzero", - ] - ) + | frozenset(["compress", "ptp"]) ) # Override cache_readonly bc Series is mutable hasnans = property( base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ ) - _data = None # type: SingleBlockManager + _data: SingleBlockManager + div: Callable[["Series", Any], "Series"] + rdiv: Callable[["Series", Any], "Series"] # ---------------------------------------------------------------------- # Constructors @@ -219,21 +196,26 @@ def __init__( else: + name = ibase.maybe_extract_name(name, data, type(self)) + + if is_empty_data(data) and dtype is None: + # gh-17261 + warnings.warn( + "The default dtype for empty Series will be 'object' instead " + "of 'float64' in a future version. 
Specify a dtype explicitly " + "to silence this warning.", + DeprecationWarning, + stacklevel=2, + ) + # uncomment the line below when removing the DeprecationWarning + # dtype = np.dtype(object) + if index is not None: index = ensure_index(index) if data is None: data = {} if dtype is not None: - # GH 26336: explicitly handle 'category' to avoid warning - # TODO: Remove after CategoricalDtype defaults to ordered=False - if ( - isinstance(dtype, str) - and dtype == "category" - and is_categorical(data) - ): - dtype = data.dtype - dtype = self._validate_dtype(dtype) if isinstance(data, MultiIndex): @@ -241,8 +223,6 @@ def __init__( "initializing a Series from a MultiIndex is not supported" ) elif isinstance(data, Index): - if name is None: - name = data.name if dtype is not None: # astype copies @@ -257,16 +237,21 @@ def __init__( copy = False elif isinstance(data, np.ndarray): + if len(data.dtype): + # GH#13296 we are dealing with a compound dtype, which + # should be treated as 2D + raise ValueError( + "Cannot construct a Series from an ndarray with " + "compound dtype. Use DataFrame instead." + ) pass elif isinstance(data, ABCSeries): - if name is None: - name = data.name if index is None: index = data.index else: data = data.reindex(index, copy=copy) data = data._data - elif isinstance(data, dict): + elif is_dict_like(data): data, index = self._init_dict(data, index, dtype) dtype = None copy = False @@ -279,16 +264,13 @@ def __init__( raise AssertionError( "Cannot pass both SingleBlockManager " "`data` argument and a different " - "`index` argument. `copy` must " - "be False." + "`index` argument. `copy` must be False." ) elif is_extension_array_dtype(data): pass elif isinstance(data, (set, frozenset)): - raise TypeError( - "{0!r} type is unordered".format(data.__class__.__name__) - ) + raise TypeError(f"'{type(data).__name__}' type is unordered") elif isinstance(data, ABCSparseArray): # handle sparse passed here (and force conversion) data = data.to_dense() @@ -306,8 +288,8 @@ def __init__( try: if len(index) != len(data): raise ValueError( - "Length of passed values is {val}, " - "index implies {ind}".format(val=len(data), ind=len(index)) + f"Length of passed values is {len(data)}, " + f"index implies {len(index)}." ) except TypeError: pass @@ -360,46 +342,17 @@ def _init_dict(self, data, index=None, dtype=None): keys, values = [], [] # Input is now list-like, so rely on "standard" construction: - s = Series(values, index=keys, dtype=dtype) + + # TODO: passing np.float64 to not break anything yet. See GH-17261 + s = create_series_with_explicit_dtype( + values, index=keys, dtype=dtype, dtype_if_empty=np.float64 + ) # Now we just make sure the order is respected, if any if data and index is not None: s = s.reindex(index, copy=False) - elif not PY36 and not isinstance(data, OrderedDict) and data: - # Need the `and data` to avoid sorting Series(None, index=[...]) - # since that isn't really dict-like - try: - s = s.sort_index() - except TypeError: - pass return s._data, s.index - @classmethod - def from_array( - cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False - ): - """ - Construct Series from array. - - .. deprecated:: 0.23.0 - Use pd.Series(..) constructor instead. - - Returns - ------- - Series - Constructed Series. - """ - warnings.warn( - "'from_array' is deprecated and will be removed in a " - "future version. Please use the pd.Series(..) 
" - "constructor instead.", - FutureWarning, - stacklevel=2, - ) - return cls( - arr, index=index, name=name, dtype=dtype, copy=copy, fastpath=fastpath - ) - # ---------------------------------------------------------------------- @property @@ -473,49 +426,13 @@ def dtypes(self): @property def name(self) -> Optional[Hashable]: - return self.attrs.get("name", None) + return self._name @name.setter def name(self, value: Optional[Hashable]) -> None: if not is_hashable(value): raise TypeError("Series.name must be a hashable type") - self.attrs["name"] = value - - @property - def ftype(self): - """ - Return if the data is sparse|dense. - - .. deprecated:: 0.25.0 - Use :func:`dtype` instead. - """ - warnings.warn( - "Series.ftype is deprecated and will " - "be removed in a future version. " - "Use Series.dtype instead.", - FutureWarning, - stacklevel=2, - ) - - return self._data.ftype - - @property - def ftypes(self): - """ - Return if the data is sparse|dense. - - .. deprecated:: 0.25.0 - Use :func:`dtypes` instead. - """ - warnings.warn( - "Series.ftypes is deprecated and will " - "be removed in a future version. " - "Use Series.dtype instead.", - FutureWarning, - stacklevel=2, - ) - - return self._data.ftype + object.__setattr__(self, "_name", value) @property def values(self): @@ -566,47 +483,18 @@ def _values(self): """ return self._data.internal_values() - def get_values(self): + def _internal_get_values(self): """ Same as values (but handles sparseness conversions); is a view. - .. deprecated:: 0.25.0 - Use :meth:`Series.to_numpy` or :attr:`Series.array` instead. - Returns ------- numpy.ndarray Data of the Series. """ - warnings.warn( - "The 'get_values' method is deprecated and will be removed in a " - "future version. Use '.to_numpy()' or '.array' instead.", - FutureWarning, - stacklevel=2, - ) - return self._internal_get_values() - def _internal_get_values(self): return self._data.get_values() - @property - def asobject(self): - """ - Return object Series which contains boxed values. - - .. deprecated:: 0.23.0 - - Use ``astype(object)`` instead. - - *this is an internal non-public method* - """ - warnings.warn( - "'asobject' is deprecated. Use 'astype(object)' instead", - FutureWarning, - stacklevel=2, - ) - return self.astype(object).values - # ops def ravel(self, order="C"): """ @@ -623,97 +511,7 @@ def ravel(self, order="C"): """ return self._values.ravel(order=order) - def compress(self, condition, *args, **kwargs): - """ - Return selected slices of an array along given axis as a Series. - - .. deprecated:: 0.24.0 - - Returns - ------- - Series - Series without the slices for which condition is false. - - See Also - -------- - numpy.ndarray.compress - """ - msg = ( - "Series.compress(condition) is deprecated. " - "Use 'Series[condition]' or " - "'np.asarray(series).compress(condition)' instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - nv.validate_compress(args, kwargs) - return self[condition] - - def nonzero(self): - """ - Return the *integer* indices of the elements that are non-zero. - - .. deprecated:: 0.24.0 - Please use .to_numpy().nonzero() as a replacement. - - This method is equivalent to calling `numpy.nonzero` on the - series data. For compatibility with NumPy, the return value is - the same (a tuple with an array of indices for each dimension), - but it will always be a one-item tuple because series only have - one dimension. - - Returns - ------- - numpy.ndarray - Indices of elements that are non-zero. 
- - See Also - -------- - numpy.nonzero - - Examples - -------- - >>> s = pd.Series([0, 3, 0, 4]) - >>> s.nonzero() - (array([1, 3]),) - >>> s.iloc[s.nonzero()[0]] - 1 3 - 3 4 - dtype: int64 - - # same return although index of s is different - >>> s = pd.Series([0, 3, 0, 4], index=['a', 'b', 'c', 'd']) - >>> s.nonzero() - (array([1, 3]),) - >>> s.iloc[s.nonzero()[0]] - b 3 - d 4 - dtype: int64 - """ - msg = ( - "Series.nonzero() is deprecated " - "and will be removed in a future version." - "Use Series.to_numpy().nonzero() instead" - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - return self._values.nonzero() - - def put(self, *args, **kwargs): - """ - Apply the `put` method to its `values` attribute if it has one. - - .. deprecated:: 0.25.0 - - See Also - -------- - numpy.ndarray.put - """ - warnings.warn( - "`put` has been deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - self._values.put(*args, **kwargs) - - def __len__(self): + def __len__(self) -> int: """ Return the length of the Series. """ @@ -845,9 +643,10 @@ def __array_ufunc__( inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) + + name: Optional[Hashable] if len(set(names)) == 1: - # we require names to be hashable, right? - name = names[0] # type: Any + name = names[0] else: name = None @@ -857,14 +656,8 @@ def construct_return(result): elif result.ndim > 1: # e.g. np.subtract.outer if method == "outer": - msg = ( - "outer method for ufunc {} is not implemented on " - "pandas objects. Returning an ndarray, but in the " - "future this will raise a 'NotImplementedError'. " - "Consider explicitly converting the Series " - "to an array with '.array' first." - ) - warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=3) + # GH#27198 + raise NotImplementedError return result return self._constructor(result, index=index, name=name, copy=False) @@ -877,7 +670,7 @@ def construct_return(result): else: return construct_return(result) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ Return the values as a NumPy array. @@ -917,73 +710,18 @@ def __array__(self, dtype=None): Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')], dtype=object) - Or the values may be localized to UTC and the tzinfo discared with + Or the values may be localized to UTC and the tzinfo discarded with ``dtype='datetime64[ns]'`` >>> np.asarray(tzser, dtype="datetime64[ns]") # doctest: +ELLIPSIS array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ - if ( - dtype is None - and isinstance(self.array, ABCDatetimeArray) - and getattr(self.dtype, "tz", None) - ): - msg = ( - "Converting timezone-aware DatetimeArray to timezone-naive " - "ndarray with 'datetime64[ns]' dtype. In the future, this " - "will return an ndarray with 'object' dtype where each " - "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t" - "To accept the future behavior, pass 'dtype=object'.\n\t" - "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = "M8[ns]" return np.asarray(self.array, dtype) # ---------------------------------------------------------------------- # Unary Methods - @property - def real(self): - """ - Return the real value of vector. - - .. deprecated:: 0.25.0 - """ - warnings.warn( - "`real` is deprecated and will be removed in a future version. 
" - "To eliminate this warning for a Series `ser`, use " - "`np.real(ser.to_numpy())` or `ser.to_numpy().real`.", - FutureWarning, - stacklevel=2, - ) - return self.values.real - - @real.setter - def real(self, v): - self.values.real = v - - @property - def imag(self): - """ - Return imag value of vector. - - .. deprecated:: 0.25.0 - """ - warnings.warn( - "`imag` is deprecated and will be removed in a future version. " - "To eliminate this warning for a Series `ser`, use " - "`np.imag(ser.to_numpy())` or `ser.to_numpy().imag`.", - FutureWarning, - stacklevel=2, - ) - return self.values.imag - - @imag.setter - def imag(self, v): - self.values.imag = v - # coercion __float__ = _coerce_method(float) __long__ = _coerce_method(int) @@ -1018,7 +756,7 @@ def _unpickle_series_compat(self, state): self.name = name else: - raise Exception("cannot unpickle legacy formats -> [%s]" % state) + raise Exception(f"cannot unpickle legacy formats -> [{state}]") # indexers @property @@ -1313,7 +1051,7 @@ def _set_labels(self, key, value): indexer = self.index.get_indexer(key) mask = indexer == -1 if mask.any(): - raise ValueError("%s not contained in the index" % str(key[mask])) + raise ValueError(f"{key[mask]} not contained in the index") self._set_values(indexer, value) def _set_values(self, key, value): @@ -1556,7 +1294,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): # ---------------------------------------------------------------------- # Rendering Methods - def __repr__(self): + def __repr__(self) -> str: """ Return a string representation for a particular Series. """ @@ -1651,9 +1389,8 @@ def to_string( # catch contract violations if not isinstance(result, str): raise AssertionError( - "result must be of type unicode, type" - " of result is {0!r}" - "".format(result.__class__.__name__) + "result must be of type str, type" + f" of result is {repr(type(result).__name__)}" ) if buf is None: @@ -1665,6 +1402,27 @@ def to_string( with open(buf, "w") as f: f.write(result) + @Appender( + """ + Examples + -------- + >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") + >>> print(s.to_markdown()) + | | animal | + |---:|:---------| + | 0 | elk | + | 1 | pig | + | 2 | dog | + | 3 | quetzal | + """ + ) + @Substitution(klass="Series") + @Appender(generic._shared_docs["to_markdown"]) + def to_markdown( + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + ) -> Optional[str]: + return self.to_frame().to_markdown(buf, mode, **kwargs) + # ---------------------------------------------------------------------- def items(self): @@ -1689,7 +1447,7 @@ def items(self): -------- >>> s = pd.Series(['A', 'B', 'C']) >>> for index, value in s.items(): - ... print("Index : {}, Value : {}".format(index, value)) + ... print(f"Index : {index}, Value : {value}") Index : 0, Value : A Index : 1, Value : B Index : 2, Value : C @@ -1796,6 +1554,89 @@ def _set_name(self, name, inplace=False): ser.name = name return ser + @Appender( + """ +Examples +-------- +>>> ser = pd.Series([390., 350., 30., 20.], +... 
index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed") +>>> ser +Falcon 390.0 +Falcon 350.0 +Parrot 30.0 +Parrot 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(["a", "b", "a", "b"]).mean() +a 210.0 +b 185.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(ser > 100).mean() +Max Speed +False 25.0 +True 370.0 +Name: Max Speed, dtype: float64 + +**Grouping by Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed") +>>> ser +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Animal +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level="Type").mean() +Type +Captive 210.0 +Wild 185.0 +Name: Max Speed, dtype: float64 +""" + ) + @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.SeriesGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.SeriesGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) + # ---------------------------------------------------------------------- # Statistics, overridden ndarray methods @@ -2096,8 +1937,8 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): Exclude NA/null values. If the entire Series is NA, the result will be NA. *args, **kwargs - Additional keywords have no effect but might be accepted - for compatibility with NumPy. + Additional arguments and keywords have no effect but might be + accepted for compatibility with NumPy. Returns ------- @@ -2166,8 +2007,8 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): Exclude NA/null values. If the entire Series is NA, the result will be NA. *args, **kwargs - Additional keywords have no effect but might be accepted - for compatibility with NumPy. + Additional arguments and keywords have no effect but might be + accepted for compatibility with NumPy. Returns ------- @@ -2221,36 +2062,6 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): return np.nan return self.index[i] - # ndarray compat - argmin = deprecate( - "argmin", - idxmin, - "0.21.0", - msg=dedent( - """ - The current behaviour of 'Series.argmin' is deprecated, use 'idxmin' - instead. - The behavior of 'argmin' will be corrected to return the positional - minimum in the future. For now, use 'series.values.argmin' or - 'np.argmin(np.array(values))' to get the position of the minimum - row.""" - ), - ) - argmax = deprecate( - "argmax", - idxmax, - "0.21.0", - msg=dedent( - """ - The current behaviour of 'Series.argmax' is deprecated, use 'idxmax' - instead. - The behavior of 'argmax' will be corrected to return the positional - maximum in the future. 
For now, use 'series.values.argmax' or - 'np.argmax(np.array(values))' to get the position of the maximum - row.""" - ), - ) - def round(self, decimals=0, *args, **kwargs): """ Round each value in a Series to the given number of decimals. @@ -2396,7 +2207,7 @@ def corr(self, other, method="pearson", min_periods=None): raise ValueError( "method must be either 'pearson', " "'spearman', 'kendall', or a callable, " - "'{method}' was supplied".format(method=method) + f"'{method}' was supplied" ) def cov(self, other, min_periods=None): @@ -2601,7 +2412,7 @@ def dot(self, other): rvals = np.asarray(other) if lvals.shape[0] != rvals.shape[0]: raise Exception( - "Dot product shape mismatch, %s vs %s" % (lvals.shape, rvals.shape) + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" ) if isinstance(other, ABCDataFrame): @@ -2613,7 +2424,7 @@ def dot(self, other): elif isinstance(rvals, np.ndarray): return np.dot(lvals, rvals) else: # pragma: no cover - raise TypeError("unsupported type: %s" % type(other)) + raise TypeError(f"unsupported type: {type(other)}") def __matmul__(self, other): """ @@ -2849,14 +2660,7 @@ def combine(self, other, func, fill_value=None): elif is_extension_array_dtype(self.values): # The function can return something of any type, so check # if the type is compatible with the calling EA. - try: - new_values = self._values._from_sequence(new_values) - except Exception: - # https://github.com/pandas-dev/pandas/issues/22850 - # pandas has no control over what 3rd-party ExtensionArrays - # do in _values_from_sequence. We still want ops to work - # though, so we catch any regular Exception. - pass + new_values = try_cast_to_ea(self._values, new_values) return self._constructor(new_values, index=new_index, name=new_name) def combine_first(self, other): @@ -2894,7 +2698,7 @@ def combine_first(self, other): new_index = self.index.union(other.index) this = self.reindex(new_index, copy=False) other = other.reindex(new_index, copy=False) - if is_datetimelike(this) and not is_datetimelike(other): + if this.dtype.kind == "M" and other.dtype.kind != "M": other = to_datetime(other) return this.where(notna(this), other) @@ -2961,6 +2765,7 @@ def sort_values( inplace=False, kind="quicksort", na_position="last", + ignore_index=False, ): """ Sort by the values. @@ -2983,6 +2788,10 @@ def sort_values( na_position : {'first' or 'last'}, default 'last' Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. 
versionadded:: 1.0.0 Returns ------- @@ -3088,7 +2897,7 @@ def _try_kind_sort(arr): return arr.argsort(kind="quicksort") arr = self._values - sortedIdx = np.empty(len(self), dtype=np.int32) + sorted_index = np.empty(len(self), dtype=np.int32) bad = isna(arr) @@ -3100,8 +2909,7 @@ def _try_kind_sort(arr): if is_list_like(ascending): if len(ascending) != 1: raise ValueError( - "Length of ascending (%d) must be 1 " - "for Series" % (len(ascending)) + f"Length of ascending ({len(ascending)}) must be 1 for Series" ) ascending = ascending[0] @@ -3113,16 +2921,19 @@ def _try_kind_sort(arr): if na_position == "last": n = good.sum() - sortedIdx[:n] = idx[good][argsorted] - sortedIdx[n:] = idx[bad] + sorted_index[:n] = idx[good][argsorted] + sorted_index[n:] = idx[bad] elif na_position == "first": n = bad.sum() - sortedIdx[n:] = idx[good][argsorted] - sortedIdx[:n] = idx[bad] + sorted_index[n:] = idx[good][argsorted] + sorted_index[:n] = idx[bad] else: - raise ValueError("invalid na_position: {!r}".format(na_position)) + raise ValueError(f"invalid na_position: {na_position}") - result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx]) + result = self._constructor(arr[sorted_index], index=self.index[sorted_index]) + + if ignore_index: + result.index = ibase.default_index(len(sorted_index)) if inplace: self._update_inplace(result) @@ -3138,6 +2949,7 @@ def sort_index( kind="quicksort", na_position="last", sort_remaining=True, + ignore_index: bool = False, ): """ Sort Series by index labels. @@ -3166,6 +2978,10 @@ def sort_index( sort_remaining : bool, default True If True and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 Returns ------- @@ -3293,6 +3109,9 @@ def sort_index( new_values = self._values.take(indexer) result = self._constructor(new_values, index=new_index) + if ignore_index: + result.index = ibase.default_index(len(result)) + if inplace: self._update_inplace(result) else: @@ -3305,7 +3124,7 @@ def argsort(self, axis=0, kind="quicksort", order=None): Parameters ---------- - axis : int + axis : {0 or "index"} Has no effect but is accepted for compatibility with numpy. kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See np.sort for more @@ -3539,8 +3358,8 @@ def swaplevel(self, i=-2, j=-1, copy=True): Parameters ---------- - i, j : int, str (can be mixed) - Level of index to be swapped. Can pass level name as string. + i, j : int, str + Level of the indices to be swapped. Can pass level name as string. copy : bool, default True Whether to copy underlying data. @@ -3688,7 +3507,7 @@ def map(self, arg, na_action=None): Parameters ---------- - arg : function, dict, or Series + arg : function, collections.abc.Mapping subclass or Series Mapping correspondence. na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the @@ -3805,7 +3624,7 @@ def _gotitem(self, key, ndim, subset=None): see_also=_agg_see_also_doc, examples=_agg_examples_doc, versionadded="\n.. 
versionadded:: 0.20.0\n", - **_shared_doc_kwargs + **_shared_doc_kwargs, ) @Appender(generic._shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): @@ -3966,7 +3785,8 @@ def f(x): return f(self) # row-wise access - if is_extension_type(self.dtype): + if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"): + # GH#23179 some EAs do not have `map` mapped = self._values.map(f) else: values = self.astype(object).values @@ -3994,9 +3814,7 @@ def _reduce( self._get_axis_number(axis) if isinstance(delegate, Categorical): - # TODO deprecate numeric_only argument for Categorical and use - # skipna as well, see GH25303 - return delegate._reduce(name, numeric_only=numeric_only, **kwds) + return delegate._reduce(name, skipna=skipna, **kwds) elif isinstance(delegate, ExtensionArray): # dispatch to ExtensionArray interface return delegate._reduce(name, skipna=skipna, **kwds) @@ -4012,7 +3830,7 @@ def _reduce( elif isinstance(delegate, np.ndarray): if numeric_only: raise NotImplementedError( - "Series.{0} does not implement numeric_only.".format(name) + f"Series.{name} does not implement numeric_only." ) with np.errstate(all="ignore"): return op(delegate, skipna=skipna, **kwds) @@ -4027,7 +3845,7 @@ def _reduce( skipna=skipna, numeric_only=numeric_only, filter_type=filter_type, - **kwds + **kwds, ) def _reindex_indexer(self, new_index, indexer, copy): @@ -4075,7 +3893,16 @@ def align( broadcast_axis=broadcast_axis, ) - def rename(self, index=None, **kwargs): + def rename( + self, + index=None, + *, + axis=None, + copy=True, + inplace=False, + level=None, + errors="ignore", + ): """ Alter Series index labels or name. @@ -4089,19 +3916,17 @@ def rename(self, index=None, **kwargs): Parameters ---------- + axis : {0 or "index"} + Unused. Accepted for compatability with DataFrame method only. index : scalar, hashable sequence, dict-like or function, optional Functions or dict-like are transformations to apply to the index. Scalar or hashable sequence-like will alter the ``Series.name`` attribute. - copy : bool, default True - Whether to copy underlying data. - inplace : bool, default False - Whether to return a new Series. If True then value of copy is - ignored. - level : int or level name, default None - In case of a MultiIndex, only rename labels in the specified - level. + + **kwargs + Additional keyword arguments passed to the function. Only the + "inplace" keyword is used. Returns ------- @@ -4110,6 +3935,7 @@ def rename(self, index=None, **kwargs): See Also -------- + DataFrame.rename : Corresponding DataFrame method. Series.rename_axis : Set the name of the axis. 
Examples @@ -4136,12 +3962,12 @@ def rename(self, index=None, **kwargs): 5 3 dtype: int64 """ - kwargs["inplace"] = validate_bool_kwarg(kwargs.get("inplace", False), "inplace") - if callable(index) or is_dict_like(index): - return super().rename(index=index, **kwargs) + return super().rename( + index, copy=copy, inplace=inplace, level=level, errors=errors + ) else: - return self._set_name(index, inplace=kwargs.get("inplace")) + return self._set_name(index, inplace=inplace) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.reindex.__doc__) @@ -4268,8 +4094,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs - ): + ) -> Optional["Series"]: return super().fillna( value=value, method=method, @@ -4277,7 +4102,6 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs ) @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) @@ -4490,101 +4314,6 @@ def between(self, left, right, inclusive=True): return lmask & rmask - @Appender(generic.NDFrame.to_csv.__doc__) - def to_csv(self, *args, **kwargs): - - names = [ - "path_or_buf", - "sep", - "na_rep", - "float_format", - "columns", - "header", - "index", - "index_label", - "mode", - "encoding", - "compression", - "quoting", - "quotechar", - "line_terminator", - "chunksize", - "date_format", - "doublequote", - "escapechar", - "decimal", - ] - - old_names = [ - "path_or_buf", - "index", - "sep", - "na_rep", - "float_format", - "header", - "index_label", - "mode", - "encoding", - "compression", - "date_format", - "decimal", - ] - - if "path" in kwargs: - warnings.warn( - "The signature of `Series.to_csv` was aligned " - "to that of `DataFrame.to_csv`, and argument " - "'path' will be renamed to 'path_or_buf'.", - FutureWarning, - stacklevel=2, - ) - kwargs["path_or_buf"] = kwargs.pop("path") - - if len(args) > 1: - # Either "index" (old signature) or "sep" (new signature) is being - # passed as second argument (while the first is the same) - maybe_sep = args[1] - - if not (is_string_like(maybe_sep) and len(maybe_sep) == 1): - # old signature - warnings.warn( - "The signature of `Series.to_csv` was aligned " - "to that of `DataFrame.to_csv`. Note that the " - "order of arguments changed, and the new one " - "has 'sep' in first place, for which \"{}\" is " - "not a valid value. The old order will cease to " - "be supported in a future version. Please refer " - "to the documentation for `DataFrame.to_csv` " - "when updating your function " - "calls.".format(maybe_sep), - FutureWarning, - stacklevel=2, - ) - names = old_names - - pos_args = dict(zip(names[: len(args)], args)) - - for key in pos_args: - if key in kwargs: - raise ValueError( - "Argument given by name ('{}') and position " - "({})".format(key, names.index(key)) - ) - kwargs[key] = pos_args[key] - - if kwargs.get("header", None) is None: - warnings.warn( - "The signature of `Series.to_csv` was aligned " - "to that of `DataFrame.to_csv`, and argument " - "'header' will change its default value from False " - "to True: please pass an explicit value to suppress " - "this warning.", - FutureWarning, - stacklevel=2, - ) - kwargs["header"] = False # Backwards compatibility. 
- return self.to_frame().to_csv(**kwargs) - @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return super().isna() @@ -4601,7 +4330,7 @@ def notna(self): def notnull(self): return super().notnull() - def dropna(self, axis=0, inplace=False, **kwargs): + def dropna(self, axis=0, inplace=False, how=None): """ Return a new Series with missing values removed. @@ -4614,8 +4343,8 @@ def dropna(self, axis=0, inplace=False, **kwargs): There is only one axis to drop values from. inplace : bool, default False If True, do operation inplace and return None. - **kwargs - Not in use. + how : str, optional + Not in use. Kept for compatibility. Returns ------- @@ -4673,12 +4402,6 @@ def dropna(self, axis=0, inplace=False, **kwargs): dtype: object """ inplace = validate_bool_kwarg(inplace, "inplace") - kwargs.pop("how", None) - if kwargs: - raise TypeError( - "dropna() got an unexpected keyword " - 'argument "{0}"'.format(list(kwargs.keys())[0]) - ) # Validate the axis parameter self._get_axis_number(axis or 0) @@ -4695,26 +4418,6 @@ def dropna(self, axis=0, inplace=False, **kwargs): else: return self.copy() - def valid(self, inplace=False, **kwargs): - """ - Return Series without null values. - - .. deprecated:: 0.23.0 - Use :meth:`Series.dropna` instead. - - Returns - ------- - Series - Series without null values. - """ - warnings.warn( - "Method .valid will be removed in a future version. " - "Use .dropna instead.", - FutureWarning, - stacklevel=2, - ) - return self.dropna(inplace=inplace, **kwargs) - # ---------------------------------------------------------------------- # Time series-oriented methods @@ -4781,15 +4484,8 @@ def to_period(self, freq=None, copy=True): hist = pandas.plotting.hist_series -Series._setup_axes( - ["index"], - info_axis=0, - stat_axis=0, - aliases={"rows": 0}, - docs={"index": "The index (axis labels) of the Series."}, -) +Series._setup_axes(["index"], docs={"index": "The index (axis labels) of the Series."}) Series._add_numeric_operations() -Series._add_series_only_operations() Series._add_series_or_dataframe_operations() # Add arithmetic! diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 706f6159bcafe..51c154aa47518 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -4,13 +4,11 @@ from pandas._libs import algos, hashtable, lib from pandas._libs.hashtable import unique_label_indices -from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, is_categorical_dtype, is_extension_array_dtype, - is_list_like, ) from pandas.core.dtypes.missing import isna @@ -20,7 +18,7 @@ _INT64_MAX = np.iinfo(np.int64).max -def get_group_index(labels, shape, sort, xnull): +def get_group_index(labels, shape, sort: bool, xnull: bool): """ For the particular label_list, gets the offsets into the hypothetical list representing the totally ordered cartesian product of all possible label @@ -33,22 +31,27 @@ def get_group_index(labels, shape, sort, xnull): Parameters ---------- - labels: sequence of arrays + labels : sequence of arrays Integers identifying levels at each location - shape: sequence of ints same length as labels + shape : sequence of ints Number of unique levels at each location - sort: boolean + sort : bool If the ranks of returned ids should match lexical ranks of labels - xnull: boolean + xnull : bool If true nulls are excluded. i.e. -1 values in the labels are - passed through + passed through. 
+ Returns ------- An array of type int64 where two elements are equal if their corresponding labels are equal at all location. + + Notes + ----- + The length of `labels` and `shape` must be identical. """ - def _int64_cut_off(shape): + def _int64_cut_off(shape) -> int: acc = 1 for i, mul in enumerate(shape): acc *= int(mul) @@ -106,7 +109,6 @@ def maybe_lift(lab, size): def get_compressed_ids(labels, sizes): """ - Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets (comp_ids) into the list of unique labels (obs_group_ids). @@ -119,13 +121,12 @@ def get_compressed_ids(labels, sizes): Returns ------- tuple of (comp_ids, obs_group_ids) - """ ids = get_group_index(labels, sizes, sort=True, xnull=False) return compress_group_index(ids, sort=True) -def is_int64_overflow_possible(shape): +def is_int64_overflow_possible(shape) -> bool: the_prod = 1 for x in shape: the_prod *= int(x) @@ -153,16 +154,15 @@ def decons_group_index(comp_labels, shape): return label_list[::-1] -def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): +def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull: bool): """ - reconstruct labels from observed group ids + Reconstruct labels from observed group ids. Parameters ---------- - xnull: boolean, - if nulls are excluded; i.e. -1 labels are passed through + xnull : bool + If nulls are excluded; i.e. -1 labels are passed through. """ - if not xnull: lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") shape = np.asarray(shape, dtype="i8") + lift @@ -177,7 +177,7 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): return [i8copy(lab[i]) for lab in labels] -def indexer_from_factorized(labels, shape, compress=True): +def indexer_from_factorized(labels, shape, compress: bool = True): ids = get_group_index(labels, shape, sort=True, xnull=False) if not compress: @@ -189,7 +189,12 @@ def indexer_from_factorized(labels, shape, compress=True): return get_group_index_sorter(ids, ngroups) -def lexsort_indexer(keys, orders=None, na_position="last"): +def lexsort_indexer(keys, orders=None, na_position: str = "last"): + """ + Parameters + ---------- + na_position : {'first', 'last'}, default 'last' + """ from pandas.core.arrays import Categorical labels = [] @@ -210,7 +215,7 @@ def lexsort_indexer(keys, orders=None, na_position="last"): cat = Categorical(key, ordered=True) if na_position not in ["last", "first"]: - raise ValueError("invalid na_position: {!r}".format(na_position)) + raise ValueError(f"invalid na_position: {na_position}") n = len(cat.categories) codes = cat.codes.copy() @@ -235,11 +240,21 @@ def lexsort_indexer(keys, orders=None, na_position="last"): return indexer_from_factorized(labels, shape) -def nargsort(items, kind="quicksort", ascending=True, na_position="last"): +def nargsort( + items, kind: str = "quicksort", ascending: bool = True, na_position: str = "last" +): """ - This is intended to be a drop-in replacement for np.argsort which - handles NaNs. It adds ascending and na_position parameters. - GH #6399, #5231 + Intended to be a drop-in replacement for np.argsort which handles NaNs. + + Adds ascending and na_position parameters. 
+ + (GH #6399, #5231) + + Parameters + ---------- + kind : str, default 'quicksort' + ascending : bool, default True + na_position : {'first', 'last'}, default 'last' """ items = extract_array(items) mask = np.asarray(isna(items)) @@ -266,16 +281,16 @@ def nargsort(items, kind="quicksort", ascending=True, na_position="last"): elif na_position == "first": indexer = np.concatenate([nan_idx, indexer]) else: - raise ValueError("invalid na_position: {!r}".format(na_position)) + raise ValueError(f"invalid na_position: {na_position}") return indexer class _KeyMapper: """ - Ease my suffering. Map compressed group id -> key tuple + Map compressed group id -> key tuple. """ - def __init__(self, comp_ids, ngroups, levels, labels): + def __init__(self, comp_ids, ngroups: int, levels, labels): self.levels = levels self.labels = labels self.comp_ids = comp_ids.astype(np.int64) @@ -303,7 +318,12 @@ def get_flattened_iterator(comp_ids, ngroups, levels, labels): def get_indexer_dict(label_list, keys): - """ return a dict of {labels} -> {indexers} """ + """ + Returns + ------- + dict + Labels mapped to indexers. + """ shape = [len(x) for x in keys] group_index = get_group_index(label_list, shape, sort=True, xnull=True) @@ -325,7 +345,7 @@ def get_indexer_dict(label_list, keys): # sorting levels...cleverly? -def get_group_index_sorter(group_index, ngroups): +def get_group_index_sorter(group_index, ngroups: int): """ algos.groupsort_indexer implements `counting sort` and it is at least O(ngroups), where @@ -350,7 +370,7 @@ def get_group_index_sorter(group_index, ngroups): return group_index.argsort(kind="mergesort") -def compress_group_index(group_index, sort=True): +def compress_group_index(group_index, sort: bool = True): """ Group_index is offsets into cartesian product of all possible labels. This space can be huge, so this function compresses it, by computing offsets @@ -389,126 +409,3 @@ def _reorder_by_uniques(uniques, labels): uniques = algorithms.take_nd(uniques, sorter, allow_fill=False) return uniques, labels - - -def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True): - """ - Sort ``values`` and reorder corresponding ``labels``. - ``values`` should be unique if ``labels`` is not None. - Safe for use with mixed types (int, str), orders ints before strs. - - Parameters - ---------- - values : list-like - Sequence; must be unique if ``labels`` is not None. - labels : list_like - Indices to ``values``. All out of bound indices are treated as - "not found" and will be masked with ``na_sentinel``. - na_sentinel : int, default -1 - Value in ``labels`` to mark "not found". - Ignored when ``labels`` is None. - assume_unique : bool, default False - When True, ``values`` are assumed to be unique, which can speed up - the calculation. Ignored when ``labels`` is None. - verify : bool, default True - Check if labels are out of bound for the values and put out of bound - labels equal to na_sentinel. If ``verify=False``, it is assumed there - are no out of bound labels. Ignored when ``labels`` is None. - - .. versionadded:: 0.25.0 - - Returns - ------- - ordered : ndarray - Sorted ``values`` - new_labels : ndarray - Reordered ``labels``; returned when ``labels`` is not None. - - Raises - ------ - TypeError - * If ``values`` is not list-like or if ``labels`` is neither None - nor list-like - * If ``values`` cannot be sorted - ValueError - * If ``labels`` is not None and ``values`` contain duplicates. 
- """ - if not is_list_like(values): - raise TypeError( - "Only list-like objects are allowed to be passed to safe_sort as values" - ) - - if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): - # don't convert to string types - dtype, _ = infer_dtype_from_array(values) - values = np.asarray(values, dtype=dtype) - - def sort_mixed(values): - # order ints before strings, safe in py3 - str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) - nums = np.sort(values[~str_pos]) - strs = np.sort(values[str_pos]) - return np.concatenate([nums, np.asarray(strs, dtype=object)]) - - sorter = None - if ( - not is_extension_array_dtype(values) - and lib.infer_dtype(values, skipna=False) == "mixed-integer" - ): - # unorderable in py3 if mixed str/int - ordered = sort_mixed(values) - else: - try: - sorter = values.argsort() - ordered = values.take(sorter) - except TypeError: - # try this anyway - ordered = sort_mixed(values) - - # labels: - - if labels is None: - return ordered - - if not is_list_like(labels): - raise TypeError( - "Only list-like objects or None are allowed to be" - "passed to safe_sort as labels" - ) - labels = ensure_platform_int(np.asarray(labels)) - - from pandas import Index - - if not assume_unique and not Index(values).is_unique: - raise ValueError("values should be unique if labels is not None") - - if sorter is None: - # mixed types - hash_klass, values = algorithms._get_data_algo(values) - t = hash_klass(len(values)) - t.map_locations(values) - sorter = ensure_platform_int(t.lookup(ordered)) - - if na_sentinel == -1: - # take_1d is faster, but only works for na_sentinels of -1 - order2 = sorter.argsort() - new_labels = algorithms.take_1d(order2, labels, fill_value=-1) - if verify: - mask = (labels < -len(values)) | (labels >= len(values)) - else: - mask = None - else: - reverse_indexer = np.empty(len(sorter), dtype=np.int_) - reverse_indexer.put(sorter, np.arange(len(sorter))) - # Out of bound indices will be masked with `na_sentinel` next, so we - # may deal with them here without performance loss using `mode='wrap'` - new_labels = reverse_indexer.take(labels, mode="wrap") - - mask = labels == na_sentinel - if verify: - mask = mask | (labels < -len(values)) | (labels >= len(values)) - - if mask is not None: - np.putmask(new_labels, mask, na_sentinel) - - return ordered, ensure_platform_int(new_labels) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index fcbb000acc256..f8d9eeb211a1e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,24 +2,28 @@ from functools import wraps import re import textwrap -from typing import Dict, List +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union import warnings import numpy as np import pandas._libs.lib as lib import pandas._libs.ops as libops -from pandas.util._decorators import Appender, deprecate_kwarg +from pandas._typing import ArrayLike, Dtype +from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, is_categorical_dtype, + is_extension_array_dtype, is_integer, + is_integer_dtype, is_list_like, + is_object_dtype, is_re, is_scalar, - is_string_like, + is_string_dtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -32,6 +36,10 @@ from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin import pandas.core.common as com +from pandas.core.construction import extract_array + +if TYPE_CHECKING: + from pandas.arrays import StringArray 
_cpython_optimized_encoders = ( "utf-8", @@ -44,7 +52,7 @@ ) _cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") -_shared_docs = dict() # type: Dict[str, str] +_shared_docs: Dict[str, str] = dict() def cat_core(list_of_columns: List, sep: str): @@ -66,10 +74,12 @@ def cat_core(list_of_columns: List, sep: str): """ if sep == "": # no need to interleave sep if it is empty - return np.sum(list_of_columns, axis=0) + arr_of_cols = np.asarray(list_of_columns, dtype=object) + return np.sum(arr_of_cols, axis=0) list_with_sep = [sep] * (2 * len(list_of_columns) - 1) list_with_sep[::2] = list_of_columns - return np.sum(list_with_sep, axis=0) + arr_with_sep = np.asarray(list_with_sep) + return np.sum(arr_with_sep, axis=0) def cat_safe(list_of_columns: List, sep: str): @@ -103,17 +113,91 @@ def cat_safe(list_of_columns: List, sep: str): raise TypeError( "Concatenation requires list-likes containing only " "strings (or missing values). Offending values found in " - "column {}".format(dtype) + f"column {dtype}" ) from None return result def _na_map(f, arr, na_result=np.nan, dtype=object): # should really _check_ for NA - return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) + if is_extension_array_dtype(arr.dtype): + # just StringDtype + arr = extract_array(arr) + return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) + return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) + + +def _map_stringarray( + func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype +) -> ArrayLike: + """ + Map a callable over valid elements of a StringArrray. + + Parameters + ---------- + func : Callable[[str], Any] + Apply to each valid element. + arr : StringArray + na_value : Any + The value to use for missing values. By default, this is + the original value (NA). + dtype : Dtype + The result dtype to use. Specifying this avoids an intermediate + object-dtype allocation. + + Returns + ------- + ArrayLike + An ExtensionArray for integer or string dtypes, otherwise + an ndarray. + + """ + from pandas.arrays import IntegerArray, StringArray, BooleanArray + + mask = isna(arr) + + assert isinstance(arr, StringArray) + arr = np.asarray(arr) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: Union[Type[IntegerArray], Type[BooleanArray]] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + func, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, func, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. 
+ return lib.map_infer_mask(arr, func, mask.view("uint8")) -def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): + +def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object): if not len(arr): return np.ndarray(0, dtype=dtype) @@ -123,8 +207,8 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): arr = np.asarray(arr, dtype=object) if na_mask: mask = isna(arr) + convert = not np.all(mask) try: - convert = not all(mask) result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) except (TypeError, AttributeError) as e: # Reraise the exception if callable `f` got wrong number of args. @@ -135,6 +219,7 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): ) if len(e.args) >= 1 and re.search(p_err, e.args[0]): + # FIXME: this should be totally avoidable raise e def g(x): @@ -143,7 +228,7 @@ def g(x): except (TypeError, AttributeError): return na_value - return _map(g, arr, dtype=dtype) + return _map_object(g, arr, dtype=dtype) if na_value is not np.nan: np.putmask(result, mask, na_value) if result.dtype == object: @@ -219,7 +304,7 @@ def str_count(arr, pat, flags=0): """ regex = re.compile(pat, flags=flags) f = lambda x: len(regex.findall(x)) - return _na_map(f, arr, dtype=int) + return _na_map(f, arr, dtype="int64") def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): @@ -353,8 +438,8 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): if regex.groups > 0: warnings.warn( - "This pattern has match groups. To actually get the" - " groups, use str.extract.", + "This pattern has match groups. To actually get the " + "groups, use str.extract.", UserWarning, stacklevel=3, ) @@ -600,7 +685,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): """ # Check whether repl is valid (GH 13438, GH 15055) - if not (is_string_like(repl) or callable(repl)): + if not (isinstance(repl, str) or callable(repl)): raise TypeError("repl must be a string or callable") is_compiled_re = is_re(pat) @@ -634,7 +719,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): raise ValueError("Cannot use a callable replacement when regex=False") f = lambda x: x.replace(pat, repl, n) - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_repeat(arr, repeats): @@ -685,7 +770,7 @@ def scalar_rep(x): except TypeError: return str.__mul__(x, repeats) - return _na_map(scalar_rep, arr) + return _na_map(scalar_rep, arr, dtype=str) else: def rep(x, r): @@ -1150,7 +1235,7 @@ def str_join(arr, sep): 4 NaN dtype: object """ - return _na_map(sep.join, arr) + return _na_map(sep.join, arr, dtype=str) def str_findall(arr, pat, flags=0): @@ -1270,8 +1355,8 @@ def str_find(arr, sub, start=0, end=None, side="left"): """ if not isinstance(sub, str): - msg = "expected a string object, not {0}" - raise TypeError(msg.format(type(sub).__name__)) + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) if side == "left": method = "find" @@ -1285,13 +1370,13 @@ def str_find(arr, sub, start=0, end=None, side="left"): else: f = lambda x: getattr(x, method)(sub, start, end) - return _na_map(f, arr, dtype=int) + return _na_map(f, arr, dtype="int64") def str_index(arr, sub, start=0, end=None, side="left"): if not isinstance(sub, str): - msg = "expected a string object, not {0}" - raise TypeError(msg.format(type(sub).__name__)) + msg = f"expected a string object, not {type(sub).__name__}" + raise TypeError(msg) if side == "left": method = "index" @@ -1305,7 +1390,7 @@ def str_index(arr, 
sub, start=0, end=None, side="left"): else: f = lambda x: getattr(x, method)(sub, start, end) - return _na_map(f, arr, dtype=int) + return _na_map(f, arr, dtype="int64") def str_pad(arr, width, side="left", fillchar=" "): @@ -1362,15 +1447,15 @@ def str_pad(arr, width, side="left", fillchar=" "): dtype: object """ if not isinstance(fillchar, str): - msg = "fillchar must be a character, not {0}" - raise TypeError(msg.format(type(fillchar).__name__)) + msg = f"fillchar must be a character, not {type(fillchar).__name__}" + raise TypeError(msg) if len(fillchar) != 1: raise TypeError("fillchar must be a character, not str") if not is_integer(width): - msg = "width must be of integer type, not {0}" - raise TypeError(msg.format(type(width).__name__)) + msg = f"width must be of integer type, not {type(width).__name__}" + raise TypeError(msg) if side == "left": f = lambda x: x.rjust(width, fillchar) @@ -1381,7 +1466,7 @@ def str_pad(arr, width, side="left", fillchar=" "): else: # pragma: no cover raise ValueError("Invalid side") - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_split(arr, pat=None, n=None): @@ -1487,7 +1572,7 @@ def str_slice(arr, start=None, stop=None, step=None): """ obj = slice(start, stop, step) f = lambda x: x[obj] - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_slice_replace(arr, start=None, stop=None, repl=None): @@ -1578,7 +1663,7 @@ def f(x): y += x[local_stop:] return y - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_strip(arr, to_strip=None, side="both"): @@ -1603,7 +1688,7 @@ def str_strip(arr, to_strip=None, side="both"): f = lambda x: x.rstrip(to_strip) else: # pragma: no cover raise ValueError("Invalid side") - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_wrap(arr, width, **kwargs): @@ -1667,7 +1752,7 @@ def str_wrap(arr, width, **kwargs): tw = textwrap.TextWrapper(**kwargs) - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr) + return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) def str_translate(arr, table): @@ -1687,7 +1772,7 @@ def str_translate(arr, table): ------- Series or Index """ - return _na_map(lambda x: x.translate(table), arr) + return _na_map(lambda x: x.translate(table), arr, dtype=str) def str_get(arr, i): @@ -1855,10 +1940,8 @@ def _forbid_nonstring_types(func): def wrapper(self, *args, **kwargs): if self._inferred_dtype not in allowed_types: msg = ( - "Cannot use .str.{name} with values of inferred dtype " - "{inf_type!r}.".format( - name=func_name, inf_type=self._inferred_dtype - ) + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." ) raise TypeError(msg) return func(self, *args, **kwargs) @@ -1875,7 +1958,7 @@ def _noarg_wrapper( docstring=None, forbidden_types=["bytes"], returns_string=True, - **kargs + **kargs, ): @forbid_nonstring_types(forbidden_types, name=name) def wrapper(self): @@ -1898,7 +1981,7 @@ def _pat_wrapper( name=None, forbidden_types=["bytes"], returns_string=True, - **kwargs + **kwargs, ): @forbid_nonstring_types(forbidden_types, name=name) def wrapper1(self, pat): @@ -2015,6 +2098,11 @@ def __getitem__(self, key): return self.get(key) def __iter__(self): + warnings.warn( + "Columnar iteration over characters will be deprecated in future releases.", + FutureWarning, + stacklevel=2, + ) i = 0 g = self.get(i) while g.notna().any(): @@ -2552,9 +2640,6 @@ def rsplit(self, pat=None, n=-1, expand=False): ---------- sep : str, default whitespace String to split on. 
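The `dtype="int64"`, `dtype=str` and (further below) `dtype=bool` hints threaded through the string methods above decide which extension dtype the accessor returns when the input uses the nullable string dtype. A brief usage sketch, assuming a pandas version that ships `StringDtype` (1.0 or later):

```python
import pandas as pd

s = pd.Series(["cat", "dog", None], dtype="string")

print(s.str.count("a").dtype)   # Int64    -- nullable integer, the missing value is preserved
print(s.str.isalpha().dtype)    # boolean  -- nullable boolean
print(s.str.upper().dtype)      # string   -- stays a StringArray
```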
- pat : str, default whitespace - .. deprecated:: 0.24.0 - Use ``sep`` instead. expand : bool, default True If True, return DataFrame/MultiIndex expanding dimensionality. If False, return Series/Index. @@ -2632,7 +2717,6 @@ def rsplit(self, pat=None, n=-1, expand=False): "also": "rpartition : Split the string at the last occurrence of `sep`.", } ) - @deprecate_kwarg(old_arg_name="pat", new_arg_name="sep") @forbid_nonstring_types(["bytes"]) def partition(self, sep=" ", expand=True): f = lambda x: x.partition(sep) @@ -2648,7 +2732,6 @@ def partition(self, sep=" ", expand=True): "also": "partition : Split the string at the first occurrence of `sep`.", } ) - @deprecate_kwarg(old_arg_name="pat", new_arg_name="sep") @forbid_nonstring_types(["bytes"]) def rpartition(self, sep=" ", expand=True): f = lambda x: x.rpartition(sep) @@ -3025,7 +3108,7 @@ def normalize(self, form): import unicodedata f = lambda x: unicodedata.normalize(form, x) - result = _na_map(f, self._parent) + result = _na_map(f, self._parent, dtype=str) return self._wrap_result(result) _shared_docs[ @@ -3132,7 +3215,7 @@ def rindex(self, sub, start=0, end=None): len, docstring=_shared_docs["len"], forbidden_types=None, - dtype=int, + dtype="int64", returns_string=False, ) @@ -3206,7 +3289,7 @@ def rindex(self, sub, start=0, end=None): """ # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args = {} # type: Dict[str, Dict[str, str]] + _doc_args: Dict[str, Dict[str, str]] = {} _doc_args["lower"] = dict(type="lowercase", method="lower", version="") _doc_args["upper"] = dict(type="uppercase", method="upper", version="") _doc_args["title"] = dict(type="titlecase", method="title", version="") @@ -3223,31 +3306,37 @@ def rindex(self, sub, start=0, end=None): lambda x: x.lower(), name="lower", docstring=_shared_docs["casemethods"] % _doc_args["lower"], + dtype=str, ) upper = _noarg_wrapper( lambda x: x.upper(), name="upper", docstring=_shared_docs["casemethods"] % _doc_args["upper"], + dtype=str, ) title = _noarg_wrapper( lambda x: x.title(), name="title", docstring=_shared_docs["casemethods"] % _doc_args["title"], + dtype=str, ) capitalize = _noarg_wrapper( lambda x: x.capitalize(), name="capitalize", docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], + dtype=str, ) swapcase = _noarg_wrapper( lambda x: x.swapcase(), name="swapcase", docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], + dtype=str, ) casefold = _noarg_wrapper( lambda x: x.casefold(), name="casefold", docstring=_shared_docs["casemethods"] % _doc_args["casefold"], + dtype=str, ) _shared_docs[ @@ -3401,59 +3490,69 @@ def rindex(self, sub, start=0, end=None): _doc_args["istitle"] = dict(type="titlecase", method="istitle") _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") + # force _noarg_wrapper return type with dtype=bool (GH 29624) isalnum = _noarg_wrapper( lambda x: x.isalnum(), name="isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], returns_string=False, + dtype=bool, ) isalpha = _noarg_wrapper( lambda x: x.isalpha(), name="isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], returns_string=False, + dtype=bool, ) isdigit = _noarg_wrapper( lambda x: x.isdigit(), name="isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], returns_string=False, + dtype=bool, ) isspace = _noarg_wrapper( lambda x: x.isspace(), name="isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"], 
returns_string=False, + dtype=bool, ) islower = _noarg_wrapper( lambda x: x.islower(), name="islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"], returns_string=False, + dtype=bool, ) isupper = _noarg_wrapper( lambda x: x.isupper(), name="isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"], returns_string=False, + dtype=bool, ) istitle = _noarg_wrapper( lambda x: x.istitle(), name="istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"], returns_string=False, + dtype=bool, ) isnumeric = _noarg_wrapper( lambda x: x.isnumeric(), name="isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], returns_string=False, + dtype=bool, ) isdecimal = _noarg_wrapper( lambda x: x.isdecimal(), name="isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], returns_string=False, + dtype=bool, ) @classmethod diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 70143e4603a4b..cfa42d764ee44 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1,6 +1,7 @@ from collections import abc from datetime import datetime, time from functools import partial +from itertools import islice from typing import Optional, TypeVar, Union import numpy as np @@ -14,7 +15,7 @@ parse_time_string, ) from pandas._libs.tslibs.strptime import array_strptime -from pandas.util._decorators import deprecate_kwarg +from pandas._typing import ArrayLike from pandas.core.dtypes.common import ( ensure_object, @@ -37,7 +38,7 @@ ) from pandas.core.dtypes.missing import notna -from pandas._typing import ArrayLike +from pandas.arrays import IntegerArray from pandas.core import algorithms from pandas.core.algorithms import unique @@ -45,12 +46,6 @@ # types used in annotations ArrayConvertible = Union[list, tuple, ArrayLike, ABCSeries] - -# --------------------------------------------------------------------- - -# --------------------------------------------------------------------- -# types used in annotations - Scalar = Union[int, float, str] DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime) DatetimeScalarOrArrayConvertible = Union[ @@ -118,7 +113,7 @@ def should_cache( assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)" - unique_elements = unique(arg[:check_count]) + unique_elements = set(islice(arg, check_count)) if len(unique_elements) > check_count * unique_share: do_caching = False return do_caching @@ -145,7 +140,8 @@ def _maybe_cache(arg, format, cache, convert_listlike): """ from pandas import Series - cache_array = Series() + cache_array = Series(dtype=object) + if cache: # Perform a quicker unique check if not should_cache(arg): @@ -153,7 +149,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): unique_dates = unique(arg) if len(unique_dates) < len(arg): - cache_dates = convert_listlike(unique_dates, True, format) + cache_dates = convert_listlike(unique_dates, format) cache_array = Series(cache_dates, index=unique_dates) return cache_array @@ -168,7 +164,7 @@ def _box_as_indexlike( Parameters ---------- dt_array: 1-d array - array of datetimes to be boxed + Array of datetimes to be wrapped in an Index. 
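`should_cache` above now inspects only the first `check_count` elements via `islice` before measuring how many of them are unique. A standalone sketch of the heuristic (the defaults here are illustrative, not the real ones):

```python
from itertools import islice

def should_cache(arg, unique_share=0.7, check_count=50):
    """Cache only if the sampled share of unique values is low enough to pay off."""
    unique_elements = set(islice(arg, check_count))
    return len(unique_elements) <= check_count * unique_share

dates = ["2019-01-01", "2019-01-02"] * 300
print(should_cache(dates))                           # True: heavy duplication, caching helps
print(should_cache([str(i) for i in range(600)]))    # False: everything is unique
```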
tz : object None or 'utc' name : string, default None @@ -191,37 +187,30 @@ def _box_as_indexlike( def _convert_and_box_cache( arg: DatetimeScalarOrArrayConvertible, cache_array: ABCSeries, - box: bool, name: Optional[str] = None, -) -> Union[ABCIndex, np.ndarray]: +) -> ABCIndexClass: """ - Convert array of dates with a cache and box the result + Convert array of dates with a cache and wrap the result in an Index. Parameters ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series cache_array : Series Cache of converted, unique dates - box : boolean - True boxes result as an Index-like, False returns an ndarray name : string, default None Name for a DatetimeIndex Returns ------- - result : datetime of converted dates - - Index-like if box=True - - ndarray if box=False + result : Index-like of converted dates """ from pandas import Series result = Series(arg).map(cache_array) - if box: - return _box_as_indexlike(result, utc=None, name=name) - return result.values + return _box_as_indexlike(result, utc=None, name=name) -def _return_parsed_timezone_results(result, timezones, box, tz, name): +def _return_parsed_timezone_results(result, timezones, tz, name): """ Return results from array_strptime if a %z or %Z directive was passed. @@ -231,8 +220,6 @@ def _return_parsed_timezone_results(result, timezones, box, tz, name): int64 date representations of the dates timezones : ndarray pytz timezone objects - box : boolean - True boxes result as an Index-like, False returns an ndarray tz : object None or pytz timezone object name : string, default None @@ -240,11 +227,7 @@ def _return_parsed_timezone_results(result, timezones, box, tz, name): Returns ------- - tz_result : ndarray of parsed dates with timezone - Returns: - - - Index-like if box=True - - ndarray of Timestamps if box=False + tz_result : Index-like of parsed dates with timezone """ if tz is not None: raise ValueError( @@ -255,16 +238,13 @@ def _return_parsed_timezone_results(result, timezones, box, tz, name): tz_results = np.array( [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)] ) - if box: - from pandas import Index + from pandas import Index - return Index(tz_results, name=name) - return tz_results + return Index(tz_results, name=name) def _convert_listlike_datetimes( arg, - box, format, name=None, tz=None, @@ -283,8 +263,6 @@ def _convert_listlike_datetimes( ---------- arg : list, tuple, ndarray, Series, Index date to be parced - box : boolean - True boxes result as an Index-like, False returns an ndarray name : object None or string for the Index name tz : object @@ -304,11 +282,7 @@ def _convert_listlike_datetimes( Returns ------- - ndarray of parsed dates - Returns: - - - Index-like if box=True - - ndarray of Timestamps if box=False + Index-like of parsed dates """ from pandas import DatetimeIndex from pandas.core.arrays import DatetimeArray @@ -329,7 +303,7 @@ def _convert_listlike_datetimes( return arg elif is_datetime64_ns_dtype(arg): - if box and not isinstance(arg, (DatetimeArray, DatetimeIndex)): + if not isinstance(arg, (DatetimeArray, DatetimeIndex)): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: @@ -343,28 +317,40 @@ def _convert_listlike_datetimes( elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") - arg = getattr(arg, "values", arg) - result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) - if box: - if errors == "ignore": - from pandas import Index + arg = getattr(arg, 
"_values", arg) + + # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime + # because it expects an ndarray argument + if isinstance(arg, IntegerArray): + # Explicitly pass NaT mask to array_with_unit_to_datetime + mask = arg.isna() + arg = arg._ndarray_values + else: + mask = None + + result, tz_parsed = tslib.array_with_unit_to_datetime( + arg, mask, unit, errors=errors + ) + + if errors == "ignore": + from pandas import Index - result = Index(result, name=name) + result = Index(result, name=name) + else: + result = DatetimeIndex(result, name=name) + # GH 23758: We may still need to localize the result with tz + # GH 25546: Apply tz_parsed first (from arg), then tz (from caller) + # result will be naive but in UTC + try: + result = result.tz_localize("UTC").tz_convert(tz_parsed) + except AttributeError: + # Regular Index from 'ignore' path + return result + if tz is not None: + if result.tz is None: + result = result.tz_localize(tz) else: - result = DatetimeIndex(result, name=name) - # GH 23758: We may still need to localize the result with tz - # GH 25546: Apply tz_parsed first (from arg), then tz (from caller) - # result will be naive but in UTC - try: - result = result.tz_localize("UTC").tz_convert(tz_parsed) - except AttributeError: - # Regular Index from 'ignore' path - return result - if tz is not None: - if result.tz is None: - result = result.tz_localize(tz) - else: - result = result.tz_convert(tz) + result = result.tz_convert(tz) return result elif getattr(arg, "ndim", 1) > 1: raise TypeError( @@ -415,7 +401,7 @@ def _convert_listlike_datetimes( ) if "%Z" in format or "%z" in format: return _return_parsed_timezone_results( - result, timezones, box, tz, name + result, timezones, tz, name ) except tslibs.OutOfBoundsDatetime: if errors == "raise": @@ -462,20 +448,12 @@ def _convert_listlike_datetimes( ) if tz_parsed is not None: - if box: - # We can take a shortcut since the datetime64 numpy array - # is in UTC - return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed) - else: - # Convert the datetime64 numpy array to an numpy array - # of datetime objects - result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() for ts in result] - return np.array(result, dtype=object) + # We can take a shortcut since the datetime64 numpy array + # is in UTC + return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed) - if box: - utc = tz == "utc" - return _box_as_indexlike(result, utc=utc, name=name) - return result + utc = tz == "utc" + return _box_as_indexlike(result, utc=utc, name=name) def _adjust_to_origin(arg, origin, unit): @@ -511,8 +489,7 @@ def _adjust_to_origin(arg, origin, unit): j_min = Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): raise tslibs.OutOfBoundsDatetime( - "{original} is Out of Bounds for " - "origin='julian'".format(original=original) + f"{original} is Out of Bounds for origin='julian'" ) else: # arg must be numeric @@ -521,27 +498,20 @@ def _adjust_to_origin(arg, origin, unit): or is_numeric_dtype(np.asarray(arg)) ): raise ValueError( - "'{arg}' is not compatible with origin='{origin}'; " - "it must be numeric with a unit specified ".format( - arg=arg, origin=origin - ) + f"'{arg}' is not compatible with origin='{origin}'; " + "it must be numeric with a unit specified" ) # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) except tslibs.OutOfBoundsDatetime: - raise tslibs.OutOfBoundsDatetime( - "origin {origin} is Out of Bounds".format(origin=origin) - ) + raise 
tslibs.OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") except ValueError: - raise ValueError( - "origin {origin} cannot be converted " - "to a Timestamp".format(origin=origin) - ) + raise ValueError(f"origin {origin} cannot be converted to a Timestamp") if offset.tz is not None: - raise ValueError("origin offset {} must be tz-naive".format(offset)) + raise ValueError(f"origin offset {offset} must be tz-naive") offset -= Timestamp(0) # convert the offset to the unit of the arg @@ -557,14 +527,12 @@ def _adjust_to_origin(arg, origin, unit): return arg -@deprecate_kwarg(old_arg_name="box", new_arg_name=None) def to_datetime( arg, errors="raise", dayfirst=False, yearfirst=False, utc=None, - box=True, format=None, exact=True, unit=None, @@ -577,14 +545,12 @@ def to_datetime( Parameters ---------- - arg : int, float, str, datetime, list, tuple, 1-d array, Series - or DataFrame/dict-like - + arg : int, float, str, datetime, list, tuple, 1-d array, Series DataFrame/dict-like + The object to convert to a datetime. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - - - If 'raise', then invalid parsing will raise an exception - - If 'coerce', then invalid parsing will be set as NaT - - If 'ignore', then invalid parsing will return the input + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaT. + - If 'ignore', then invalid parsing will return the input. dayfirst : bool, default False Specify a date parse order if `arg` is str or its list-likes. If True, parses dates with the day first, eg 10/11/12 is parsed as @@ -604,28 +570,18 @@ def to_datetime( utc : bool, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). - box : bool, default True - - - If True returns a DatetimeIndex or Index-like object - - If False returns ndarray of values. - - .. deprecated:: 0.25.0 - Use :meth:`Series.to_numpy` or :meth:`Timestamp.to_datetime64` - instead to get an ndarray of values or numpy.datetime64, - respectively. - format : str, default None - strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse + The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. See strftime documentation for more information on choices: - https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. exact : bool, True by default - + Behaves as: - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. unit : str, default 'ns' - unit of the arg (D,s,ms,us,ns) denote the unit, which is an + The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. Example, with unit='ms' and origin='unix' (the default), this would calculate the number of milliseconds to the unix epoch start. @@ -652,11 +608,12 @@ def to_datetime( .. versionadded:: 0.23.0 .. versionchanged:: 0.25.0 - - changed default value from False to True + - changed default value from False to True. Returns ------- - ret : datetime if parsing succeeded. + datetime + If parsing succeeded. Return type depends on input: - list-like: DatetimeIndex @@ -688,7 +645,7 @@ def to_datetime( dtype: datetime64[ns] If a date does not meet the `timestamp limitations - `_, passing errors='ignore' will return the original input instead of raising any exception. 
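The new `IntegerArray` branch above (GH 30050) forwards an explicit NA mask to `array_with_unit_to_datetime`, so nullable integers convert cleanly when a `unit` is given. A usage sketch, assuming a pandas version with the nullable `Int64` dtype:

```python
import pandas as pd

epochs = pd.array([1_262_304_000, None, 1_262_390_400], dtype="Int64")  # seconds since the unix epoch
print(pd.to_datetime(epochs, unit="s"))
# DatetimeIndex(['2010-01-01', 'NaT', '2010-01-02'], dtype='datetime64[ns]', freq=None)
```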
@@ -712,10 +669,10 @@ def to_datetime( 4 3/12/2000 dtype: object - >>> %timeit pd.to_datetime(s,infer_datetime_format=True) # doctest: +SKIP + >>> %timeit pd.to_datetime(s, infer_datetime_format=True) # doctest: +SKIP 100 loops, best of 3: 10.4 ms per loop - >>> %timeit pd.to_datetime(s,infer_datetime_format=False) # doctest: +SKIP + >>> %timeit pd.to_datetime(s, infer_datetime_format=False) # doctest: +SKIP 1 loop, best of 3: 471 ms per loop Using a unix epoch time @@ -765,25 +722,25 @@ def to_datetime( if not cache_array.empty: result = arg.map(cache_array) else: - values = convert_listlike(arg._values, True, format) + values = convert_listlike(arg._values, format) result = arg._constructor(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)): - result = _assemble_from_unit_mappings(arg, errors, box, tz) + result = _assemble_from_unit_mappings(arg, errors, tz) elif isinstance(arg, ABCIndexClass): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box, name=arg.name) + result = _convert_and_box_cache(arg, cache_array, name=arg.name) else: convert_listlike = partial(convert_listlike, name=arg.name) - result = convert_listlike(arg, box, format) + result = convert_listlike(arg, format) elif is_list_like(arg): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box) + result = _convert_and_box_cache(arg, cache_array) else: - result = convert_listlike(arg, box, format) + result = convert_listlike(arg, format) else: - result = convert_listlike(np.array([arg]), box, format)[0] + result = convert_listlike(np.array([arg]), format)[0] return result @@ -814,7 +771,7 @@ def to_datetime( } -def _assemble_from_unit_mappings(arg, errors, box, tz): +def _assemble_from_unit_mappings(arg, errors, tz): """ assemble the unit specified fields from the arg (DataFrame) Return a Series for actual parsing @@ -827,10 +784,6 @@ def _assemble_from_unit_mappings(arg, errors, box, tz): - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input - box : boolean - - - If True, return a DatetimeIndex - - If False, return an array tz : None or 'utc' Returns @@ -859,21 +812,21 @@ def f(value): # we require at least Ymd required = ["year", "month", "day"] - req = sorted(list(set(required) - set(unit_rev.keys()))) + req = sorted(set(required) - set(unit_rev.keys())) if len(req): + required = ",".join(req) raise ValueError( "to assemble mappings requires at least that " - "[year, month, day] be specified: [{required}] " - "is missing".format(required=",".join(req)) + f"[year, month, day] be specified: [{required}] " + "is missing" ) # keys we don't recognize - excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values()))) + excess = sorted(set(unit_rev.keys()) - set(_unit_map.values())) if len(excess): + excess = ",".join(excess) raise ValueError( - "extra keys have been passed " - "to the datetime assemblage: " - "[{excess}]".format(excess=",".join(excess)) + f"extra keys have been passed to the datetime assemblage: [{excess}]" ) def coerce(values): @@ -892,21 +845,16 @@ def coerce(values): ) try: values = to_datetime(values, format="%Y%m%d", errors=errors, utc=tz) - except (TypeError, ValueError) as e: - raise ValueError("cannot assemble the datetimes: {error}".format(error=e)) + 
except (TypeError, ValueError) as err: + raise ValueError(f"cannot assemble the datetimes: {err}") for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) if value is not None and value in arg: try: values += to_timedelta(coerce(arg[value]), unit=u, errors=errors) - except (TypeError, ValueError) as e: - raise ValueError( - "cannot assemble the datetimes [{value}]: " - "{error}".format(value=value, error=e) - ) - if not box: - return values.values + except (TypeError, ValueError) as err: + raise ValueError(f"cannot assemble the datetimes [{value}]: {err}") return values @@ -942,21 +890,21 @@ def calc_with_mask(carg, mask): # try intlike / strings that are ints try: return calc(arg.astype(np.int64)) - except (ValueError, OverflowError): + except (ValueError, OverflowError, TypeError): pass # a float with actual np.nan try: carg = arg.astype(np.float64) return calc_with_mask(carg, notna(carg)) - except (ValueError, OverflowError): + except (ValueError, OverflowError, TypeError): pass # string with NaN-like try: mask = ~algorithms.isin(arg, list(tslib.nat_strings)) return calc_with_mask(arg, mask) - except (ValueError, OverflowError): + except (ValueError, OverflowError, TypeError): pass return None @@ -1041,9 +989,9 @@ def _convert_listlike(arg, format): except (ValueError, TypeError): if errors == "raise": msg = ( - "Cannot convert {element} to a time with given " - "format {format}" - ).format(element=element, format=format) + f"Cannot convert {element} to a time with given " + f"format {format}" + ) raise ValueError(msg) elif errors == "ignore": return arg @@ -1069,9 +1017,7 @@ def _convert_listlike(arg, format): if time_object is not None: times.append(time_object) elif errors == "raise": - raise ValueError( - "Cannot convert arg {arg} to a time".format(arg=arg) - ) + raise ValueError(f"Cannot convert arg {arg} to a time") elif errors == "ignore": return arg else: diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 05696ffd4605d..e59ed247bd87b 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -36,9 +36,9 @@ def to_numeric(arg, errors="raise", downcast=None): ---------- arg : scalar, list, tuple, 1-d array, or Series errors : {'ignore', 'raise', 'coerce'}, default 'raise' - - If 'raise', then invalid parsing will raise an exception - - If 'coerce', then invalid parsing will be set as NaN - - If 'ignore', then invalid parsing will return the input + - If 'raise', then invalid parsing will raise an exception. + - If 'coerce', then invalid parsing will be set as NaN. + - If 'ignore', then invalid parsing will return the input. 
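`_assemble_from_unit_mappings` above builds datetimes column-wise from a DataFrame or dict: year/month/day are mandatory, recognised extra units (h, m, s, ms, us, ns) are added as timedeltas, and unrecognised keys raise. For example:

```python
import pandas as pd

df = pd.DataFrame({
    "year": [2015, 2016],
    "month": [2, 3],
    "day": [4, 5],
    "hour": [10, 11],
})
print(pd.to_datetime(df))
# 0   2015-02-04 10:00:00
# 1   2016-03-05 11:00:00
# dtype: datetime64[ns]
```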
downcast : {'integer', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index cc31317980ca8..3e185feaea38e 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -2,13 +2,10 @@ timedelta support tools """ -import warnings - import numpy as np from pandas._libs.tslibs import NaT from pandas._libs.tslibs.timedeltas import Timedelta, parse_timedelta_unit -from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -16,8 +13,7 @@ from pandas.core.arrays.timedeltas import sequence_to_td64ns -@deprecate_kwarg(old_arg_name="box", new_arg_name=None) -def to_timedelta(arg, unit="ns", box=True, errors="raise"): +def to_timedelta(arg, unit="ns", errors="raise"): """ Convert argument to timedelta. @@ -38,15 +34,6 @@ def to_timedelta(arg, unit="ns", box=True, errors="raise"): 'milli', 'millis', 'L', 'us', 'microseconds', 'microsecond', 'micro', 'micros', 'U', 'ns', 'nanoseconds', 'nano', 'nanos', 'nanosecond', 'N'). - box : bool, default True - - If True returns a Timedelta/TimedeltaIndex of the results. - - If False returns a numpy.timedelta64 or numpy.darray of - values of dtype timedelta64[ns]. - - .. deprecated:: 0.25.0 - Use :meth:`Series.to_numpy` or :meth:`Timedelta.to_timedelta64` - instead to get an ndarray of values or numpy.timedelta64, - respectively. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. @@ -88,11 +75,6 @@ def to_timedelta(arg, unit="ns", box=True, errors="raise"): >>> pd.to_timedelta(np.arange(5), unit='d') TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) - - Returning an ndarray by using the 'box' keyword argument: - - >>> pd.to_timedelta(np.arange(5), box=False) - array([0, 1, 2, 3, 4], dtype='timedelta64[ns]') """ unit = parse_timedelta_unit(unit) @@ -100,41 +82,37 @@ def to_timedelta(arg, unit="ns", box=True, errors="raise"): raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'}") if unit in {"Y", "y", "M"}: - warnings.warn( - "M and Y units are deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=2, + raise ValueError( + "Units 'M' and 'Y' are no longer supported, as they do not " + "represent unambiguous timedelta values durations." ) if arg is None: return arg elif isinstance(arg, ABCSeries): - values = _convert_listlike(arg._values, unit=unit, box=False, errors=errors) + values = _convert_listlike(arg._values, unit=unit, errors=errors) return arg._constructor(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): - return _convert_listlike(arg, unit=unit, box=box, errors=errors, name=arg.name) + return _convert_listlike(arg, unit=unit, errors=errors, name=arg.name) elif isinstance(arg, np.ndarray) and arg.ndim == 0: # extract array scalar and process below arg = arg.item() elif is_list_like(arg) and getattr(arg, "ndim", 1) == 1: - return _convert_listlike(arg, unit=unit, box=box, errors=errors) + return _convert_listlike(arg, unit=unit, errors=errors) elif getattr(arg, "ndim", 1) > 1: raise TypeError( "arg must be a string, timedelta, list, tuple, 1-d array, or Series" ) # ...so it must be a scalar value. Return scalar. 
- return _coerce_scalar_to_timedelta_type(arg, unit=unit, box=box, errors=errors) + return _coerce_scalar_to_timedelta_type(arg, unit=unit, errors=errors) -def _coerce_scalar_to_timedelta_type(r, unit="ns", box=True, errors="raise"): +def _coerce_scalar_to_timedelta_type(r, unit="ns", errors="raise"): """Convert string 'r' to a timedelta object.""" try: result = Timedelta(r, unit) - if not box: - # explicitly view as timedelta64 for case when result is pd.NaT - result = result.asm8.view("timedelta64[ns]") except ValueError: if errors == "raise": raise @@ -147,7 +125,7 @@ def _coerce_scalar_to_timedelta_type(r, unit="ns", box=True, errors="raise"): return result -def _convert_listlike(arg, unit="ns", box=True, errors="raise", name=None): +def _convert_listlike(arg, unit="ns", errors="raise", name=None): """Convert a list of objects to a timedelta index object.""" if isinstance(arg, (list, tuple)) or not hasattr(arg, "dtype"): @@ -172,8 +150,7 @@ def _convert_listlike(arg, unit="ns", box=True, errors="raise", name=None): # like to surface it. raise - if box: - from pandas import TimedeltaIndex + from pandas import TimedeltaIndex - value = TimedeltaIndex(value, unit="ns", name=name) + value = TimedeltaIndex(value, unit="ns", name=name) return value diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index fddbea8ed0d7a..43655fa3ea913 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -58,7 +58,7 @@ def hash_pandas_object( obj, index: bool = True, encoding: str = "utf8", - hash_key=None, + hash_key: str = _default_hash_key, categorize: bool = True, ): """ @@ -67,11 +67,11 @@ def hash_pandas_object( Parameters ---------- index : bool, default True - include the index in the hash (if Series/DataFrame) + Include the index in the hash (if Series/DataFrame). encoding : str, default 'utf8' - encoding for data & key when strings - hash_key : str, default '_default_hash_key' - hash_key for string key to encode + Encoding for data & key when strings. + hash_key : str, default _default_hash_key + Hash_key for string key to encode. categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. 
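With the `box` keyword removed from `to_timedelta` above, the result is always a Timedelta/TimedeltaIndex and an ndarray has to be requested explicitly; the ambiguous 'M'/'Y' units now raise instead of warning. A short usage sketch against this version of the API:

```python
import numpy as np
import pandas as pd

tdi = pd.to_timedelta(np.arange(3), unit="d")
print(tdi)             # TimedeltaIndex(['0 days', '1 days', '2 days'], dtype='timedelta64[ns]', freq=None)
print(tdi.to_numpy())  # the old box=False result, now spelled explicitly

try:
    pd.to_timedelta(1, unit="M")
except ValueError as err:
    print("raised:", err)
```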
@@ -82,17 +82,15 @@ def hash_pandas_object( """ from pandas import Series - if hash_key is None: - hash_key = _default_hash_key - if isinstance(obj, ABCMultiIndex): return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) - if isinstance(obj, ABCIndexClass): + elif isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False ) h = Series(h, index=obj, dtype="uint64", copy=False) + elif isinstance(obj, ABCSeries): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False @@ -136,11 +134,11 @@ def hash_pandas_object( h = Series(h, index=obj.index, dtype="uint64", copy=False) else: - raise TypeError("Unexpected type for hashing %s" % type(obj)) + raise TypeError(f"Unexpected type for hashing {type(obj)}") return h -def hash_tuples(vals, encoding="utf8", hash_key=None): +def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): """ Hash an MultiIndex / list-of-tuples efficiently @@ -148,7 +146,7 @@ def hash_tuples(vals, encoding="utf8", hash_key=None): ---------- vals : MultiIndex, list-of-tuples, or single tuple encoding : str, default 'utf8' - hash_key : str, default '_default_hash_key' + hash_key : str, default _default_hash_key Returns ------- @@ -183,7 +181,7 @@ def hash_tuples(vals, encoding="utf8", hash_key=None): return h -def hash_tuple(val, encoding: str = "utf8", hash_key=None): +def hash_tuple(val, encoding: str = "utf8", hash_key: str = _default_hash_key): """ Hash a single tuple efficiently @@ -191,7 +189,7 @@ def hash_tuple(val, encoding: str = "utf8", hash_key=None): ---------- val : single tuple encoding : str, default 'utf8' - hash_key : str, default '_default_hash_key' + hash_key : str, default _default_hash_key Returns ------- @@ -213,8 +211,8 @@ def _hash_categorical(c, encoding: str, hash_key: str): Parameters ---------- c : Categorical - encoding : str, default 'utf8' - hash_key : str, default '_default_hash_key' + encoding : str + hash_key : str Returns ------- @@ -243,7 +241,12 @@ def _hash_categorical(c, encoding: str, hash_key: str): return result -def hash_array(vals, encoding: str = "utf8", hash_key=None, categorize: bool = True): +def hash_array( + vals, + encoding: str = "utf8", + hash_key: str = _default_hash_key, + categorize: bool = True, +): """ Given a 1d array, return an array of deterministic integers. @@ -251,9 +254,9 @@ def hash_array(vals, encoding: str = "utf8", hash_key=None, categorize: bool = T ---------- vals : ndarray, Categorical encoding : str, default 'utf8' - encoding for data & key when strings - hash_key : str, default '_default_hash_key' - hash_key for string key to encode + Encoding for data & key when strings. + hash_key : str, default _default_hash_key + Hash_key for string key to encode. categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. @@ -267,9 +270,6 @@ def hash_array(vals, encoding: str = "utf8", hash_key=None, categorize: bool = T raise TypeError("must pass a ndarray-like") dtype = vals.dtype - if hash_key is None: - hash_key = _default_hash_key - # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke). 
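`hash_pandas_object` and the helpers above now take the 16-character `_default_hash_key` directly as their default instead of resolving `None` at runtime. The hashing is deterministic per row, and changing the key changes every digest:

```python
import pandas as pd
from pandas.util import hash_pandas_object

s = pd.Series(["a", "b", "c"])

h1 = hash_pandas_object(s)
h2 = hash_pandas_object(s)
print(h1.dtype)       # uint64
print(h1.equals(h2))  # True: same input and key -> same row hashes

h3 = hash_pandas_object(s, hash_key="9876543210987654")  # any other 16-byte key
print(h1.equals(h3))  # False
```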
@@ -320,9 +320,17 @@ def hash_array(vals, encoding: str = "utf8", hash_key=None, categorize: bool = T return vals -def _hash_scalar(val, encoding: str = "utf8", hash_key=None): +def _hash_scalar( + val, encoding: str = "utf8", hash_key: str = _default_hash_key +) -> np.ndarray: """ - Hash scalar value + Hash scalar value. + + Parameters + ---------- + val : scalar + encoding : str, default "utf8" + hash_key : str, default _default_hash_key Returns ------- diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 0f2920b3558c9..64ec0e68e11b0 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -1,5 +1,6 @@ """Common utility functions for rolling operations""" from collections import defaultdict +from typing import Callable, Optional import warnings import numpy as np @@ -10,7 +11,7 @@ import pandas.core.common as com from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin -from pandas.core.index import MultiIndex +from pandas.core.indexes.api import MultiIndex _shared_docs = dict(**_shared_docs) _doc_template = """ @@ -26,13 +27,29 @@ """ -class _GroupByMixin(GroupByMixin): +def _dispatch(name: str, *args, **kwargs): + """ + Dispatch to apply. + """ + + def outer(self, *args, **kwargs): + def f(x): + x = self._shallow_copy(x, groupby=self._groupby) + return getattr(x, name)(*args, **kwargs) + + return self._groupby.apply(f) + + outer.__name__ = name + return outer + + +class WindowGroupByMixin(GroupByMixin): """ Provide the groupby facilities. """ def __init__(self, obj, *args, **kwargs): - parent = kwargs.pop("parent", None) # noqa + kwargs.pop("parent", None) groupby = kwargs.pop("groupby", None) if groupby is None: groupby, obj = obj, obj.obj @@ -41,18 +58,28 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - count = GroupByMixin._dispatch("count") - corr = GroupByMixin._dispatch("corr", other=None, pairwise=None) - cov = GroupByMixin._dispatch("cov", other=None, pairwise=None) + count = _dispatch("count") + corr = _dispatch("corr", other=None, pairwise=None) + cov = _dispatch("cov", other=None, pairwise=None) def _apply( - self, func, name=None, window=None, center=None, check_minp=None, **kwargs + self, + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, + name: Optional[str] = None, + use_numba_cache: bool = False, + **kwargs, ): """ Dispatch to apply; we are stripping all of the _apply kwargs and performing the original function call on the grouped object. """ + kwargs.pop("floor", None) + # TODO: can we de-duplicate with _dispatch? 
def f(x, name=name, *args): x = self._shallow_copy(x) @@ -78,7 +105,7 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False): if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( arg2, (np.ndarray, ABCSeries) ): - X, Y = _prep_binary(arg1, arg2) + X, Y = prep_binary(arg1, arg2) return f(X, Y) elif isinstance(arg1, ABCDataFrame): @@ -125,7 +152,7 @@ def dataframe_from_int_dict(data, frame_template): results[i][j] = results[j][i] else: results[i][j] = f( - *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) + *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) ) from pandas import concat @@ -186,7 +213,7 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'pairwise' is not True/False") else: results = { - i: f(*_prep_binary(arg1.iloc[:, i], arg2)) + i: f(*prep_binary(arg1.iloc[:, i], arg2)) for i, col in enumerate(arg1.columns) } return dataframe_from_int_dict(results, arg1) @@ -223,34 +250,48 @@ def _get_center_of_mass(comass, span, halflife, alpha): return float(comass) -def _offset(window, center): +def calculate_center_offset(window): if not is_integer(window): window = len(window) - offset = (window - 1) / 2.0 if center else 0 - try: - return int(offset) - except TypeError: - return offset.astype(int) + return int((window - 1) / 2.0) -def _require_min_periods(p): - def _check_func(minp, window): - if minp is None: - return window - else: - return max(p, minp) - - return _check_func - - -def _use_window(minp, window): - if minp is None: - return window +def calculate_min_periods( + window: int, + min_periods: Optional[int], + num_values: int, + required_min_periods: int, + floor: int, +) -> int: + """ + Calculates final minimum periods value for rolling aggregations. + + Parameters + ---------- + window : passed window value + min_periods : passed min periods value + num_values : total number of values + required_min_periods : required min periods per aggregation function + floor : required min periods per aggregation function + + Returns + ------- + min_periods : int + """ + if min_periods is None: + min_periods = window else: - return minp + min_periods = max(required_min_periods, min_periods) + if min_periods > window: + raise ValueError(f"min_periods {min_periods} must be <= window {window}") + elif min_periods > num_values: + min_periods = num_values + 1 + elif min_periods < 0: + raise ValueError("min_periods must be >= 0") + return max(min_periods, floor) -def _zsqrt(x): +def zsqrt(x): with np.errstate(all="ignore"): result = np.sqrt(x) mask = x < 0 @@ -265,7 +306,7 @@ def _zsqrt(x): return result -def _prep_binary(arg1, arg2): +def prep_binary(arg1, arg2): if not isinstance(arg2, type(arg1)): raise Exception("Input arrays must be of the same type!") @@ -274,3 +315,12 @@ def _prep_binary(arg1, arg2): Y = arg2 + 0 * arg1 return X, Y + + +def get_weighted_roll_func(cfunc: Callable) -> Callable: + def func(arg, window, min_periods=None): + if min_periods is None: + min_periods = len(window) + return cfunc(arg, window, min_periods) + + return func diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 40e6c679ba72d..37e3cd42f2115 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -2,15 +2,20 @@ import numpy as np -import pandas._libs.window as libwindow +import pandas._libs.window.aggregations as window_aggregations from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.base import DataError -from 
pandas.core.window.common import _doc_template, _get_center_of_mass, _shared_docs -from pandas.core.window.rolling import _flex_binary_moment, _Rolling, _zsqrt +from pandas.core.window.common import ( + _doc_template, + _get_center_of_mass, + _shared_docs, + zsqrt, +) +from pandas.core.window.rolling import _flex_binary_moment, _Rolling _bias_template = """ Parameters @@ -21,25 +26,6 @@ Arguments and keyword arguments to be passed into func. """ -_pairwise_template = """ - Parameters - ---------- - other : Series, DataFrame, or ndarray, optional - If not supplied then will default to self and produce pairwise - output. - pairwise : bool, default None - If False then only matching columns between self and other will be - used and the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the - output will be a MultiIndex DataFrame in the case of DataFrame - inputs. In the case of missing elements, only complete pairwise - observations will be used. - bias : bool, default False - Use a standard estimation bias correction. - **kwargs - Keyword arguments to be passed into func. -""" - class EWM(_Rolling): r""" @@ -108,7 +94,7 @@ class EWM(_Rolling): (if adjust is True), and 1-alpha and alpha (if adjust is False). More details can be found at - http://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows + https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows Examples -------- @@ -247,11 +233,10 @@ def _apply(self, func, **kwargs): # if we have a string function name, wrap it if isinstance(func, str): - cfunc = getattr(libwindow, func, None) + cfunc = getattr(window_aggregations, func, None) if cfunc is None: raise ValueError( - "we do not support this function " - "in libwindow.{func}".format(func=func) + f"we do not support this function in window_aggregations.{func}" ) def func(arg): @@ -289,7 +274,7 @@ def std(self, bias=False, *args, **kwargs): Exponential weighted moving stddev. """ nv.validate_window_func("std", args, kwargs) - return _zsqrt(self.var(bias=bias, **kwargs)) + return zsqrt(self.var(bias=bias, **kwargs)) vol = std @@ -303,7 +288,7 @@ def var(self, bias=False, *args, **kwargs): nv.validate_window_func("var", args, kwargs) def f(arg): - return libwindow.ewmcov( + return window_aggregations.ewmcov( arg, arg, self.com, @@ -317,10 +302,26 @@ def f(arg): @Substitution(name="ewm") @Appender(_doc_template) - @Appender(_pairwise_template) def cov(self, other=None, pairwise=None, bias=False, **kwargs): """ Exponential weighted sample covariance. + + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndex DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + bias : bool, default False + Use a standard estimation bias correction. + **kwargs + Keyword arguments to be passed into func. 
""" if other is None: other = self._selected_obj @@ -331,7 +332,7 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): def _get_cov(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) - cov = libwindow.ewmcov( + cov = window_aggregations.ewmcov( X._prep_values(), Y._prep_values(), self.com, @@ -348,10 +349,24 @@ def _get_cov(X, Y): @Substitution(name="ewm") @Appender(_doc_template) - @Appender(_pairwise_template) def corr(self, other=None, pairwise=None, **kwargs): """ Exponential weighted sample correlation. + + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndex DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + **kwargs + Keyword arguments to be passed into func. """ if other is None: other = self._selected_obj @@ -364,7 +379,7 @@ def _get_corr(X, Y): Y = self._shallow_copy(Y) def _cov(x, y): - return libwindow.ewmcov( + return window_aggregations.ewmcov( x, y, self.com, @@ -380,7 +395,7 @@ def _cov(x, y): cov = _cov(x_values, y_values) x_var = _cov(x_values, x_values) y_var = _cov(y_values, y_values) - corr = cov / _zsqrt(x_var * y_var) + corr = cov / zsqrt(x_var * y_var) return X._wrap_result(corr) return _flex_binary_moment( diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 47bd8f2ec593b..68c3514308cbc 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -3,7 +3,7 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution -from pandas.core.window.common import _doc_template, _GroupByMixin, _shared_docs +from pandas.core.window.common import WindowGroupByMixin, _doc_template, _shared_docs from pandas.core.window.rolling import _Rolling_and_Expanding @@ -148,7 +148,7 @@ def count(self, **kwargs): @Substitution(name="expanding") @Appender(_shared_docs["apply"]) - def apply(self, func, raw=None, args=(), kwargs={}): + def apply(self, func, raw=False, args=(), kwargs={}): return super().apply(func, raw=raw, args=args, kwargs=kwargs) @Substitution(name="expanding") @@ -181,13 +181,13 @@ def mean(self, *args, **kwargs): def median(self, **kwargs): return super().median(**kwargs) - @Substitution(name="expanding") + @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): nv.validate_expanding_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name="expanding") + @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): nv.validate_expanding_func("var", args, kwargs) @@ -209,10 +209,9 @@ def skew(self, **kwargs): >>> arr = [1, 2, 3, 4, 999] >>> import scipy.stats - >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits - >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) + >>> print(f"{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}") -1.200000 - >>> print(fmt.format(scipy.stats.kurtosis(arr, bias=False))) + >>> print(f"{scipy.stats.kurtosis(arr, bias=False):.6f}") 4.999874 >>> s = pd.Series(arr) >>> s.expanding(4).kurt() @@ -250,7 +249,7 @@ def corr(self, other=None, 
pairwise=None, **kwargs): return super().corr(other=other, pairwise=pairwise, **kwargs) -class ExpandingGroupby(_GroupByMixin, Expanding): +class ExpandingGroupby(WindowGroupByMixin, Expanding): """ Provide a expanding groupby implementation. """ diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py new file mode 100644 index 0000000000000..0fa24a0ba1b5a --- /dev/null +++ b/pandas/core/window/indexers.py @@ -0,0 +1,122 @@ +"""Indexer objects for computing start/end window bounds for rolling operations""" +from typing import Optional, Tuple + +import numpy as np + +from pandas._libs.window.indexers import calculate_variable_window_bounds +from pandas.util._decorators import Appender + +get_window_bounds_doc = """ +Computes the bounds of a window. + +Parameters +---------- +num_values : int, default 0 + number of values that will be aggregated over +window_size : int, default 0 + the number of rows in a window +min_periods : int, default None + min_periods passed from the top level rolling API +center : bool, default None + center passed from the top level rolling API +closed : str, default None + closed passed from the top level rolling API +win_type : str, default None + win_type passed from the top level rolling API + +Returns +------- +A tuple of ndarray[int64]s, indicating the boundaries of each +window +""" + + +class BaseIndexer: + """Base class for window bounds calculations""" + + def __init__( + self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, + ): + """ + Parameters + ---------- + **kwargs : + keyword arguments that will be available when get_window_bounds is called + """ + self.index_array = index_array + self.window_size = window_size + # Set user defined kwargs as attributes that can be used in get_window_bounds + for key, value in kwargs.items(): + setattr(self, key, value) + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + raise NotImplementedError + + +class FixedWindowIndexer(BaseIndexer): + """Creates window boundaries that are of fixed length.""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + start_s = np.zeros(self.window_size, dtype="int64") + start_e = ( + np.arange(self.window_size, num_values, dtype="int64") + - self.window_size + + 1 + ) + start = np.concatenate([start_s, start_e])[:num_values] + + end_s = np.arange(self.window_size, dtype="int64") + 1 + end_e = start_e + self.window_size + end = np.concatenate([end_s, end_e])[:num_values] + return start, end + + +class VariableWindowIndexer(BaseIndexer): + """Creates window boundaries that are of variable length, namely for time series.""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + return calculate_variable_window_bounds( + num_values, self.window_size, min_periods, center, closed, self.index_array, + ) + + +class ExpandingIndexer(BaseIndexer): + """Calculate expanding window bounds, mimicking df.expanding()""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, 
+ min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + return ( + np.zeros(num_values, dtype=np.int64), + np.arange(1, num_values + 1, dtype=np.int64), + ) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py new file mode 100644 index 0000000000000..127957943d2ff --- /dev/null +++ b/pandas/core/window/numba_.py @@ -0,0 +1,127 @@ +import types +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np + +from pandas._typing import Scalar +from pandas.compat._optional import import_optional_dependency + + +def make_rolling_apply( + func: Callable[..., Scalar], + args: Tuple, + nogil: bool, + parallel: bool, + nopython: bool, +): + """ + Creates a JITted rolling apply function with a JITted version of + the user's function. + + Parameters + ---------- + func : function + function to be applied to each window and will be JITed + args : tuple + *args to be passed into the function + nogil : bool + nogil parameter from engine_kwargs for numba.jit + parallel : bool + parallel parameter from engine_kwargs for numba.jit + nopython : bool + nopython parameter from engine_kwargs for numba.jit + + Returns + ------- + Numba function + """ + numba = import_optional_dependency("numba") + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + if isinstance(func, numba.targets.registry.CPUDispatcher): + # Don't jit a user passed jitted function + numba_func = func + else: + + @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) + def numba_func(window, *_args): + if getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + jf = func + else: + jf = numba.jit(func, nopython=nopython, nogil=nogil) + + def impl(window, *_args): + return jf(window, *_args) + + return impl + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def roll_apply( + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, + ) -> np.ndarray: + result = np.empty(len(begin)) + for i in loop_range(len(result)): + start = begin[i] + stop = end[i] + window = values[start:stop] + count_nan = np.sum(np.isnan(window)) + if len(window) - count_nan >= minimum_periods: + result[i] = numba_func(window, *args) + else: + result[i] = np.nan + return result + + return roll_apply + + +def generate_numba_apply_func( + args: Tuple, + kwargs: Dict[str, Any], + func: Callable[..., Scalar], + engine_kwargs: Optional[Dict[str, bool]], +): + """ + Generate a numba jitted apply function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. 
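`make_rolling_apply` above JIT-compiles the user function (unless it is already a numba dispatcher) and loops over the precomputed window bounds. From the user side this path is reached through `engine="numba"` on `rolling(...).apply`; a sketch, assuming numba is installed and this pandas version exposes the engine keyword:

```python
import numpy as np
import pandas as pd

def window_mean(values):
    # receives the raw ndarray slice of a single window
    return np.mean(values)

s = pd.Series(np.arange(10, dtype="float64"))
result = s.rolling(3).apply(
    window_mean,
    raw=True,                 # the numba engine operates on plain ndarrays
    engine="numba",
    engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
)
print(result.tail(3))         # 6.0, 7.0, 8.0
```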
+ + Parameters + ---------- + args : tuple + *args to be passed into the function + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + + Returns + ------- + Numba function + """ + + if engine_kwargs is None: + engine_kwargs = {} + + nopython = engine_kwargs.get("nopython", True) + nogil = engine_kwargs.get("nogil", False) + parallel = engine_kwargs.get("parallel", False) + + if kwargs and nopython: + raise ValueError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + return make_rolling_apply(func, args, nogil, parallel, nopython) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 68eb1f630bfc3..f612826132fd7 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -3,13 +3,15 @@ similar to how we have a Groupby object. """ from datetime import timedelta +from functools import partial +import inspect from textwrap import dedent -from typing import Callable, List, Optional, Set, Union -import warnings +from typing import Callable, Dict, List, Optional, Set, Tuple, Union import numpy as np -import pandas._libs.window as libwindow +import pandas._libs.window.aggregations as window_aggregations +from pandas._typing import Axis, FrameOrSeries, Scalar from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -22,7 +24,6 @@ is_integer_dtype, is_list_like, is_scalar, - is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.generic import ( @@ -34,24 +35,29 @@ ABCTimedeltaIndex, ) -from pandas._typing import Axis, FrameOrSeries, Scalar -from pandas.core.base import DataError, PandasObject, SelectionMixin +from pandas.core.base import DataError, PandasObject, SelectionMixin, ShallowMixin import pandas.core.common as com -from pandas.core.index import Index, ensure_index +from pandas.core.indexes.api import Index, ensure_index from pandas.core.window.common import ( + WindowGroupByMixin, _doc_template, _flex_binary_moment, - _GroupByMixin, - _offset, - _require_min_periods, _shared_docs, - _use_window, - _zsqrt, + calculate_center_offset, + calculate_min_periods, + get_weighted_roll_func, + zsqrt, +) +from pandas.core.window.indexers import ( + BaseIndexer, + FixedWindowIndexer, + VariableWindowIndexer, ) +from pandas.core.window.numba_ import generate_numba_apply_func -class _Window(PandasObject, SelectionMixin): - _attributes = [ +class _Window(PandasObject, ShallowMixin, SelectionMixin): + _attributes: List[str] = [ "window", "min_periods", "center", @@ -59,8 +65,8 @@ class _Window(PandasObject, SelectionMixin): "axis", "on", "closed", - ] # type: List[str] - exclusions = set() # type: Set[str] + ] + exclusions: Set[str] = set() def __init__( self, @@ -72,7 +78,7 @@ def __init__( axis: Axis = 0, on: Optional[Union[str, Index]] = None, closed: Optional[str] = None, - **kwargs + **kwargs, ): self.__dict__.update(kwargs) @@ -86,6 +92,7 @@ def __init__( self.win_freq = None self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() + self._numba_func_cache: Dict[Optional[str], Callable] = dict() @property def _constructor(self): @@ -103,7 +110,7 @@ def _on(self): def is_freq_type(self) -> bool: return self.win_type == "freq" - def validate(self): + def 
validate(self) -> None: if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") if self.min_periods is not None and not is_integer(self.min_periods): @@ -116,7 +123,27 @@ def validate(self): ]: raise ValueError("closed must be 'right', 'left', 'both' or 'neither'") if not isinstance(self.obj, (ABCSeries, ABCDataFrame)): - raise TypeError("invalid type: {}".format(type(self))) + raise TypeError(f"invalid type: {type(self)}") + if isinstance(self.window, BaseIndexer): + self._validate_get_window_bounds_signature(self.window) + + @staticmethod + def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: + """ + Validate that the passed BaseIndexer subclass has + a get_window_bounds with the correct signature. + """ + get_window_bounds_signature = inspect.signature( + window.get_window_bounds + ).parameters.keys() + expected_signature = inspect.signature( + BaseIndexer().get_window_bounds + ).parameters.keys() + if get_window_bounds_signature != expected_signature: + raise ValueError( + f"{type(window).__name__} does not implement the correct signature for " + f"get_window_bounds" + ) def _create_blocks(self): """ @@ -156,68 +183,73 @@ def _gotitem(self, key, ndim, subset=None): self._selection = key return self - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self.obj: return self[attr] raise AttributeError( - "%r object has no attribute %r" % (type(self).__name__, attr) + f"'{type(self).__name__}' object has no attribute '{attr}'" ) def _dir_additions(self): return self.obj._dir_additions() - def _get_window(self, other=None, **kwargs) -> int: + def _get_win_type(self, kwargs: Dict): """ - Returns window length + Exists for compatibility, overriden by subclass Window. Parameters ---------- - other: + kwargs : dict + ignored, exists for compatibility + + Returns + ------- + None + """ + return None + + def _get_window(self, other=None, win_type: Optional[str] = None) -> int: + """ + Return window length. + + Parameters + ---------- + other : + ignored, exists for compatibility + win_type : ignored, exists for compatibility Returns ------- window : int """ + if isinstance(self.window, BaseIndexer): + return self.min_periods or 0 return self.window @property def _window_type(self) -> str: - return self.__class__.__name__ + return type(self).__name__ def __repr__(self) -> str: """ Provide a nice str repr of our rolling object. """ - attrs = ( - "{k}={v}".format(k=k, v=getattr(self, k)) - for k in self._attributes - if getattr(self, k, None) is not None - ) - return "{klass} [{attrs}]".format( - klass=self._window_type, attrs=",".join(attrs) + attrs_list = ( + f"{attr_name}={getattr(self, attr_name)}" + for attr_name in self._attributes + if getattr(self, attr_name, None) is not None ) + attrs = ",".join(attrs_list) + return f"{self._window_type} [{attrs}]" def __iter__(self): url = "https://github.com/pandas-dev/pandas/issues/11704" - raise NotImplementedError("See issue #11704 {url}".format(url=url)) - - def _get_index(self) -> Optional[np.ndarray]: - """ - Return integer representations as an ndarray if index is frequency. 
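For reference, the rewritten __repr__ above only lists attributes that are not None, so a plain integer window renders roughly as follows (illustrative, not part of this diff):

    import pandas as pd

    pd.Series(range(5)).rolling(window=2)
    # Rolling [window=2,center=False,axis=0]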
- - Returns - ------- - None or ndarray - """ - - if self.is_freq_type: - return self._on.asi8 - return None + raise NotImplementedError(f"See issue #11704 {url}") def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: """Convert input to numpy arrays for Cython routines""" @@ -232,15 +264,14 @@ def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: values = ensure_float64(values) elif needs_i8_conversion(values.dtype): raise NotImplementedError( - "ops for {action} for this " - "dtype {dtype} are not " - "implemented".format(action=self._window_type, dtype=values.dtype) + f"ops for {self._window_type} for this " + f"dtype {values.dtype} are not implemented" ) else: try: values = ensure_float64(values) except (ValueError, TypeError): - raise TypeError("cannot handle this type -> {0}".format(values.dtype)) + raise TypeError(f"cannot handle this type -> {values.dtype}") # Convert inf to nan for C funcs inf = np.isinf(values) @@ -260,17 +291,6 @@ def _wrap_result(self, result, block=None, obj=None): if isinstance(result, np.ndarray): - # coerce if necessary - if block is not None: - if is_timedelta64_dtype(block.values.dtype): - # TODO: do we know what result.dtype is at this point? - # i.e. can we just do an astype? - from pandas import to_timedelta - - result = to_timedelta(result.ravel(), unit="ns").values.reshape( - result.shape - ) - if result.ndim == 1: from pandas import Series @@ -339,50 +359,63 @@ def _center_window(self, result, window) -> np.ndarray: if self.axis > result.ndim - 1: raise ValueError("Requested axis is larger then no. of argument dimensions") - offset = _offset(window, True) + offset = calculate_center_offset(window) if offset > 0: - if isinstance(result, (ABCSeries, ABCDataFrame)): - result = result.slice_shift(-offset, axis=self.axis) - else: - lead_indexer = [slice(None)] * result.ndim - lead_indexer[self.axis] = slice(offset, None) - result = np.copy(result[tuple(lead_indexer)]) + lead_indexer = [slice(None)] * result.ndim + lead_indexer[self.axis] = slice(offset, None) + result = np.copy(result[tuple(lead_indexer)]) return result - def _get_roll_func( - self, cfunc: Callable, check_minp: Callable, index: np.ndarray, **kwargs - ) -> Callable: + def _get_roll_func(self, func_name: str) -> Callable: """ Wrap rolling function to check values passed. Parameters ---------- - cfunc : callable + func_name : str Cython function used to calculate rolling statistics - check_minp : callable - function to check minimum period parameter - index : ndarray - used for variable window Returns ------- func : callable """ + window_func = getattr(window_aggregations, func_name, None) + if window_func is None: + raise ValueError( + f"we do not support this function in window_aggregations.{func_name}" + ) + return window_func - def func(arg, window, min_periods=None, closed=None): - minp = check_minp(min_periods, window) - return cfunc(arg, window, minp, index, closed, **kwargs) + def _get_cython_func_type(self, func: str) -> Callable: + """ + Return a variable or fixed cython function type. - return func + Variable algorithms do not use window while fixed do. 
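To make the fixed/variable split concrete, the fixed-length indexer added above produces trailing-window bounds like these (values worked out from the FixedWindowIndexer code in this diff):

    from pandas.core.window.indexers import FixedWindowIndexer

    start, end = FixedWindowIndexer(window_size=3).get_window_bounds(num_values=5)
    # start -> array([0, 0, 0, 1, 2]); end -> array([1, 2, 3, 4, 5])
    # i.e. window i spans values[start[i]:end[i]], a trailing window of at most 3 rows.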
+ """ + if self.is_freq_type or isinstance(self.window, BaseIndexer): + return self._get_roll_func(f"{func}_variable") + return partial(self._get_roll_func(f"{func}_fixed"), win=self._get_window()) + + def _get_window_indexer(self, window: int) -> BaseIndexer: + """ + Return an indexer class that will compute the window start and end bounds + """ + if isinstance(self.window, BaseIndexer): + return self.window + if self.is_freq_type: + return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) + return FixedWindowIndexer(window_size=window) def _apply( self, - func: Union[str, Callable], + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, name: Optional[str] = None, - window: Optional[Union[int, str]] = None, - center: Optional[bool] = None, - check_minp: Optional[Callable] = None, - **kwargs + use_numba_cache: bool = False, + **kwargs, ): """ Rolling statistical measure using supplied function. @@ -391,13 +424,16 @@ def _apply( Parameters ---------- - func : str/callable to apply - name : str, optional - name of this function - window : int/str, default to _get_window() - window length or offset - center : bool, default to self.center - check_minp : function, default to _use_window + func : callable function to apply + center : bool + require_min_periods : int + floor : int + is_weighted : bool + name : str, + compatibility with groupby.rolling + use_numba_cache : bool + whether to cache a numba compiled function. Only available for numba + enabled methods (so far only apply) **kwargs additional arguments for rolling function and window function @@ -405,21 +441,15 @@ def _apply( ------- y : type of input """ - if center is None: - center = self.center - - if check_minp is None: - check_minp = _use_window - - if window is None: - window = self._get_window(**kwargs) + win_type = self._get_win_type(kwargs) + window = self._get_window(win_type=win_type) blocks, obj = self._create_blocks() block_list = list(blocks) - index_as_array = self._get_index() + window_indexer = self._get_window_indexer(window) results = [] - exclude = [] # type: List[Scalar] + exclude: List[Scalar] = [] for i, b in enumerate(blocks): try: values = self._prep_values(b.values) @@ -436,36 +466,39 @@ def _apply( results.append(values.copy()) continue - # if we have a string function name, wrap it - if isinstance(func, str): - cfunc = getattr(libwindow, func, None) - if cfunc is None: - raise ValueError( - "we do not support this function " - "in libwindow.{func}".format(func=func) - ) - - func = self._get_roll_func(cfunc, check_minp, index_as_array, **kwargs) - # calculation function - if center: - offset = _offset(window, center) - additional_nans = np.array([np.NaN] * offset) + offset = calculate_center_offset(window) if center else 0 + additional_nans = np.array([np.nan] * offset) + + if not is_weighted: def calc(x): - return func( - np.concatenate((x, additional_nans)), - window, + x = np.concatenate((x, additional_nans)) + if not isinstance(window, BaseIndexer): + min_periods = calculate_min_periods( + window, self.min_periods, len(x), require_min_periods, floor + ) + else: + min_periods = calculate_min_periods( + self.min_periods or 1, + self.min_periods, + len(x), + require_min_periods, + floor, + ) + start, end = window_indexer.get_window_bounds( + num_values=len(x), min_periods=self.min_periods, + center=self.center, closed=self.closed, ) + return func(x, start, end, min_periods) else: def calc(x): - return func( - x, window, 
min_periods=self.min_periods, closed=self.closed - ) + x = np.concatenate((x, additional_nans)) + return func(x, window, self.min_periods) with np.errstate(all="ignore"): if values.ndim > 1: @@ -474,6 +507,9 @@ def calc(x): result = calc(values) result = np.asarray(result) + if use_numba_cache: + self._numba_func_cache[name] = func + if center: result = self._center_window(result, window) @@ -612,6 +648,126 @@ def aggregate(self, func, *args, **kwargs): """ ) + _shared_docs["var"] = dedent( + """ + Calculate unbiased %(name)s variance. + %(versionadded)s + Normalized by N-1 by default. This can be changed using the `ddof` + argument. + + Parameters + ---------- + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + *args, **kwargs + For NumPy compatibility. No additional arguments are used. + + Returns + ------- + Series or DataFrame + Returns the same object type as the caller of the %(name)s calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.var : Equivalent method for Series. + DataFrame.var : Equivalent method for DataFrame. + numpy.var : Equivalent method for Numpy array. + + Notes + ----- + The default `ddof` of 1 used in :meth:`Series.var` is different than the + default `ddof` of 0 in :func:`numpy.var`. + + A minimum of 1 period is required for the rolling calculation. + + Examples + -------- + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 1.000000 + 4 1.000000 + 5 1.333333 + 6 0.000000 + dtype: float64 + + >>> s.expanding(3).var() + 0 NaN + 1 NaN + 2 0.333333 + 3 0.916667 + 4 0.800000 + 5 0.700000 + 6 0.619048 + dtype: float64 + """ + ) + + _shared_docs["std"] = dedent( + """ + Calculate %(name)s standard deviation. + %(versionadded)s + Normalized by N-1 by default. This can be changed using the `ddof` + argument. + + Parameters + ---------- + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + *args, **kwargs + For NumPy compatibility. No additional arguments are used. + + Returns + ------- + Series or DataFrame + Returns the same object type as the caller of the %(name)s calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data. + DataFrame.%(name)s : Calling object with DataFrames. + Series.std : Equivalent method for Series. + DataFrame.std : Equivalent method for DataFrame. + numpy.std : Equivalent method for Numpy array. + + Notes + ----- + The default `ddof` of 1 used in Series.std is different than the default + `ddof` of 0 in numpy.std. + + A minimum of one period is required for the rolling calculation. + + Examples + -------- + >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) + >>> s.rolling(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 1.000000 + 4 1.000000 + 5 1.154701 + 6 0.000000 + dtype: float64 + + >>> s.expanding(3).std() + 0 NaN + 1 NaN + 2 0.577350 + 3 0.957427 + 4 0.894427 + 5 0.836660 + 6 0.786796 + dtype: float64 + """ + ) + class Window(_Window): """ @@ -619,13 +775,18 @@ class Window(_Window): Parameters ---------- - window : int, or offset + window : int, offset, or BaseIndexer subclass Size of the moving window. This is the number of observations used for calculating the statistic. Each window will be a fixed size. If its an offset then this will be the time period of each window. 
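For example (illustrative, standard pandas usage rather than code from this diff), an offset window sizes itself by time rather than by row count:

    import pandas as pd

    idx = pd.date_range("2020-01-01", periods=5, freq="S")
    s = pd.Series(range(5), index=idx)
    s.rolling("2s").sum()  # each window holds the observations from the trailing 2 seconds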
Each window will be a variable sized based on the observations included in the time-period. This is only valid for datetimelike indexes. + + If a BaseIndexer subclass is passed, calculates the window boundaries + based on the defined ``get_window_bounds`` method. Additional rolling + keyword arguments, namely `min_periods`, `center`, and + `closed` will be passed to `get_window_bounds`. min_periods : int, default None Minimum number of observations in window required to have a value (otherwise result is NA). For a window that is specified by an offset, @@ -664,7 +825,7 @@ class Window(_Window): changed to the center of the window by setting ``center=True``. To learn more about the offsets & frequency strings, please see `this link - `__. + `__. The recognized win_types are: @@ -711,6 +872,17 @@ class Window(_Window): 3 NaN 4 NaN + Rolling sum with a window length of 2, using the 'gaussian' + window type (note how we need to specify std). + + >>> df.rolling(2, win_type='gaussian').sum(std=3) + B + 0 NaN + 1 0.986207 + 2 2.958621 + 3 NaN + 4 NaN + Rolling sum with a window length of 2, min_periods defaults to the window length. @@ -766,7 +938,11 @@ def validate(self): super().validate() window = self.window - if isinstance(window, (list, tuple, np.ndarray)): + if isinstance(window, BaseIndexer): + raise NotImplementedError( + "BaseIndexer subclasses not implemented with win_types." + ) + elif isinstance(window, (list, tuple, np.ndarray)): pass elif is_integer(window): if window <= 0: @@ -777,21 +953,68 @@ def validate(self): import scipy.signal as sig if not isinstance(self.win_type, str): - raise ValueError("Invalid win_type {0}".format(self.win_type)) + raise ValueError(f"Invalid win_type {self.win_type}") if getattr(sig, self.win_type, None) is None: - raise ValueError("Invalid win_type {0}".format(self.win_type)) + raise ValueError(f"Invalid win_type {self.win_type}") else: - raise ValueError("Invalid window {0}".format(window)) + raise ValueError(f"Invalid window {window}") - def _get_window(self, other=None, **kwargs) -> np.ndarray: + def _get_win_type(self, kwargs: Dict) -> Union[str, Tuple]: + """ + Extract arguments for the window type, provide validation for it + and return the validated window type. + + Parameters + ---------- + kwargs : dict + + Returns + ------- + win_type : str, or tuple + """ + # the below may pop from kwargs + def _validate_win_type(win_type, kwargs): + arg_map = { + "kaiser": ["beta"], + "gaussian": ["std"], + "general_gaussian": ["power", "width"], + "slepian": ["width"], + "exponential": ["tau"], + } + + if win_type in arg_map: + win_args = _pop_args(win_type, arg_map[win_type], kwargs) + if win_type == "exponential": + # exponential window requires the first arg (center) + # to be set to None (necessary for symmetric window) + win_args.insert(0, None) + + return tuple([win_type] + win_args) + + return win_type + + def _pop_args(win_type, arg_names, kwargs): + all_args = [] + for n in arg_names: + if n not in kwargs: + raise ValueError(f"{win_type} window requires {n}") + all_args.append(kwargs.pop(n)) + return all_args + + return _validate_win_type(self.win_type, kwargs) + + def _get_window( + self, other=None, win_type: Optional[Union[str, Tuple]] = None + ) -> np.ndarray: """ - Provide validation for the window type, return the window - which has already been validated. + Get the window, weights. 
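Internally the weighted case resolves win_type plus its popped keyword arguments into a scipy window, roughly equivalent to the following (a sketch based on the _get_win_type/_get_window code above; assumes scipy is installed):

    import scipy.signal as sig

    # win_type="gaussian" with std=3 and an integer window of 5 becomes:
    weights = sig.get_window(("gaussian", 3), 5, False)  # fftbins=False gives a symmetric window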
Parameters ---------- - other: + other : ignored, exists for compatibility + win_type : str, or tuple + type of window to create Returns ------- @@ -805,49 +1028,9 @@ def _get_window(self, other=None, **kwargs) -> np.ndarray: elif is_integer(window): import scipy.signal as sig - # the below may pop from kwargs - def _validate_win_type(win_type, kwargs): - arg_map = { - "kaiser": ["beta"], - "gaussian": ["std"], - "general_gaussian": ["power", "width"], - "slepian": ["width"], - "exponential": ["tau"], - } - - if win_type in arg_map: - win_args = _pop_args(win_type, arg_map[win_type], kwargs) - if win_type == "exponential": - # exponential window requires the first arg (center) - # to be set to None (necessary for symmetric window) - win_args.insert(0, None) - - return tuple([win_type] + win_args) - - return win_type - - def _pop_args(win_type, arg_names, kwargs): - msg = "%s window requires %%s" % win_type - all_args = [] - for n in arg_names: - if n not in kwargs: - raise ValueError(msg % n) - all_args.append(kwargs.pop(n)) - return all_args - - win_type = _validate_win_type(self.win_type, kwargs) # GH #15662. `False` makes symmetric window, rather than periodic. return sig.get_window(win_type, window, False).astype(float) - def _get_roll_func( - self, cfunc: Callable, check_minp: Callable, index: np.ndarray, **kwargs - ) -> Callable: - def func(arg, window, min_periods=None, closed=None): - minp = check_minp(min_periods, len(window)) - return cfunc(arg, window, minp) - - return func - _agg_see_also_doc = dedent( """ See Also @@ -914,13 +1097,38 @@ def aggregate(self, func, *args, **kwargs): @Appender(_shared_docs["sum"]) def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) - return self._apply("roll_weighted_sum", **kwargs) + window_func = self._get_roll_func("roll_weighted_sum") + window_func = get_weighted_roll_func(window_func) + return self._apply( + window_func, center=self.center, is_weighted=True, name="sum", **kwargs + ) @Substitution(name="window") @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) - return self._apply("roll_weighted_mean", **kwargs) + window_func = self._get_roll_func("roll_weighted_mean") + window_func = get_weighted_roll_func(window_func) + return self._apply( + window_func, center=self.center, is_weighted=True, name="mean", **kwargs + ) + + @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") + @Appender(_shared_docs["var"]) + def var(self, ddof=1, *args, **kwargs): + nv.validate_window_func("var", args, kwargs) + window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof) + window_func = get_weighted_roll_func(window_func) + kwargs.pop("name", None) + return self._apply( + window_func, center=self.center, is_weighted=True, name="var", **kwargs + ) + + @Substitution(name="window", versionadded="\n.. 
versionadded:: 1.0.0\n") + @Appender(_shared_docs["std"]) + def std(self, ddof=1, *args, **kwargs): + nv.validate_window_func("std", args, kwargs) + return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) class _Rolling(_Window): @@ -974,8 +1182,6 @@ class _Rolling_and_Expanding(_Rolling): def count(self): blocks, obj = self._create_blocks() - # Validate the index - self._get_index() window = self._get_window() window = min(window, len(obj)) if not self.center else window @@ -1003,21 +1209,39 @@ def count(self): ---------- func : function Must produce a single value from an ndarray input if ``raw=True`` - or a single value from a Series if ``raw=False``. + or a single value from a Series if ``raw=False``. Can also accept a + Numba JIT function with ``engine='numba'`` specified. + + .. versionchanged:: 1.0.0 + raw : bool, default None * ``False`` : passes each row or column as a Series to the function. - * ``True`` or ``None`` : the passed function will receive ndarray + * ``True`` : the passed function will receive ndarray objects instead. If you are just applying a NumPy reduction function this will achieve much better performance. - - The `raw` parameter is required and will show a FutureWarning if - not passed. In the future `raw` will default to False. - - .. versionadded:: 0.23.0 - *args, **kwargs - Arguments and keyword arguments to be passed into func. + engine : str, default 'cython' + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + + .. versionadded:: 1.0.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to both the ``func`` and the ``apply`` rolling aggregation. + + .. versionadded:: 1.0.0 + + args : tuple, default None + Positional arguments to be passed into func. + kwargs : dict, default None + Keyword arguments to be passed into func. Returns ------- @@ -1028,53 +1252,89 @@ def count(self): -------- Series.%(name)s : Series %(name)s. DataFrame.%(name)s : DataFrame %(name)s. + + Notes + ----- + See :ref:`stats.rolling_apply` for extended documentation and performance + considerations for the Numba engine. """ ) - def apply(self, func, raw=None, args=(), kwargs={}): - from pandas import Series - + def apply( + self, + func, + raw: bool = False, + engine: str = "cython", + engine_kwargs: Optional[Dict] = None, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, + ): + if args is None: + args = () + if kwargs is None: + kwargs = {} kwargs.pop("_level", None) + kwargs.pop("floor", None) window = self._get_window() - offset = _offset(window, self.center) - index_as_array = self._get_index() - - # TODO: default is for backward compat - # change to False in the future - if raw is None: - warnings.warn( - "Currently, 'apply' passes the values as ndarrays to the " - "applied function. In the future, this will change to passing " - "it as Series objects. 
You need to specify 'raw=True' to keep " - "the current behaviour, and you can pass 'raw=False' to " - "silence this warning", - FutureWarning, - stacklevel=3, + offset = calculate_center_offset(window) if self.center else 0 + if not is_bool(raw): + raise ValueError("raw parameter must be `True` or `False`") + + if engine == "cython": + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + apply_func = self._generate_cython_apply_func( + args, kwargs, raw, offset, func ) - raw = True + elif engine == "numba": + if raw is False: + raise ValueError("raw must be `True` when using the numba engine") + if func in self._numba_func_cache: + # Return an already compiled version of roll_apply if available + apply_func = self._numba_func_cache[func] + else: + apply_func = generate_numba_apply_func( + args, kwargs, func, engine_kwargs + ) + else: + raise ValueError("engine must be either 'numba' or 'cython'") + + # TODO: Why do we always pass center=False? + # name=func for WindowGroupByMixin._apply + return self._apply( + apply_func, + center=False, + floor=0, + name=func, + use_numba_cache=engine == "numba", + ) + + def _generate_cython_apply_func(self, args, kwargs, raw, offset, func): + from pandas import Series - def f(arg, window, min_periods, closed): - minp = _use_window(min_periods, window) + window_func = partial( + self._get_cython_func_type("roll_generic"), + args=args, + kwargs=kwargs, + raw=raw, + offset=offset, + func=func, + ) + + def apply_func(values, begin, end, min_periods, raw=raw): if not raw: - arg = Series(arg, index=self.obj.index) - return libwindow.roll_generic( - arg, - window, - minp, - index_as_array, - closed, - offset, - func, - raw, - args, - kwargs, - ) + values = Series(values, index=self.obj.index) + return window_func(values, begin, end, min_periods) - return self._apply(f, func, args=args, kwargs=kwargs, center=False, raw=raw) + return apply_func def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) - return self._apply("roll_sum", "sum", **kwargs) + window_func = self._get_cython_func_type("roll_sum") + kwargs.pop("floor", None) + return self._apply( + window_func, center=self.center, floor=0, name="sum", **kwargs + ) _shared_docs["max"] = dedent( """ @@ -1089,7 +1349,8 @@ def sum(self, *args, **kwargs): def max(self, *args, **kwargs): nv.validate_window_func("max", args, kwargs) - return self._apply("roll_max", "max", **kwargs) + window_func = self._get_cython_func_type("roll_max") + return self._apply(window_func, center=self.center, name="max", **kwargs) _shared_docs["min"] = dedent( """ @@ -1130,11 +1391,13 @@ def max(self, *args, **kwargs): def min(self, *args, **kwargs): nv.validate_window_func("min", args, kwargs) - return self._apply("roll_min", "min", **kwargs) + window_func = self._get_cython_func_type("roll_min") + return self._apply(window_func, center=self.center, name="min", **kwargs) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) - return self._apply("roll_mean", "mean", **kwargs) + window_func = self._get_cython_func_type("roll_mean") + return self._apply(window_func, center=self.center, name="mean", **kwargs) _shared_docs["median"] = dedent( """ @@ -1174,147 +1437,40 @@ def mean(self, *args, **kwargs): ) def median(self, **kwargs): - return self._apply("roll_median_c", "median", **kwargs) - - _shared_docs["std"] = dedent( - """ - Calculate %(name)s standard deviation. - - Normalized by N-1 by default. 
This can be changed using the `ddof` - argument. - - Parameters - ---------- - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - *args, **kwargs - For NumPy compatibility. No additional arguments are used. - - Returns - ------- - Series or DataFrame - Returns the same object type as the caller of the %(name)s calculation. - - See Also - -------- - Series.%(name)s : Calling object with Series data. - DataFrame.%(name)s : Calling object with DataFrames. - Series.std : Equivalent method for Series. - DataFrame.std : Equivalent method for DataFrame. - numpy.std : Equivalent method for Numpy array. - - Notes - ----- - The default `ddof` of 1 used in Series.std is different than the default - `ddof` of 0 in numpy.std. - - A minimum of one period is required for the rolling calculation. - - Examples - -------- - >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) - >>> s.rolling(3).std() - 0 NaN - 1 NaN - 2 0.577350 - 3 1.000000 - 4 1.000000 - 5 1.154701 - 6 0.000000 - dtype: float64 - - >>> s.expanding(3).std() - 0 NaN - 1 NaN - 2 0.577350 - 3 0.957427 - 4 0.894427 - 5 0.836660 - 6 0.786796 - dtype: float64 - """ - ) + window_func = self._get_roll_func("roll_median_c") + window_func = partial(window_func, win=self._get_window()) + return self._apply(window_func, center=self.center, name="median", **kwargs) def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - window = self._get_window() - index_as_array = self._get_index() + kwargs.pop("require_min_periods", None) + window_func = self._get_cython_func_type("roll_var") - def f(arg, *args, **kwargs): - minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt( - libwindow.roll_var(arg, window, minp, index_as_array, self.closed, ddof) - ) + def zsqrt_func(values, begin, end, min_periods): + return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) + # ddof passed again for compat with groupby.rolling return self._apply( - f, "std", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + zsqrt_func, + center=self.center, + require_min_periods=1, + name="std", + ddof=ddof, + **kwargs, ) - _shared_docs["var"] = dedent( - """ - Calculate unbiased %(name)s variance. - - Normalized by N-1 by default. This can be changed using the `ddof` - argument. - - Parameters - ---------- - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - *args, **kwargs - For NumPy compatibility. No additional arguments are used. - - Returns - ------- - Series or DataFrame - Returns the same object type as the caller of the %(name)s calculation. - - See Also - -------- - Series.%(name)s : Calling object with Series data. - DataFrame.%(name)s : Calling object with DataFrames. - Series.var : Equivalent method for Series. - DataFrame.var : Equivalent method for DataFrame. - numpy.var : Equivalent method for Numpy array. - - Notes - ----- - The default `ddof` of 1 used in :meth:`Series.var` is different than the - default `ddof` of 0 in :func:`numpy.var`. - - A minimum of 1 period is required for the rolling calculation. 
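A minimal sketch of the new engine="numba" path on rolling.apply described above (assumes the optional numba dependency is installed; the engine_kwargs shown are the documented defaults):

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(10, dtype="float64"))

    def window_mean(values):
        return np.mean(values)

    # the numba engine requires raw=True so the function receives plain ndarrays
    s.rolling(3).apply(
        window_mean,
        raw=True,
        engine="numba",
        engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
    )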
- - Examples - -------- - >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5]) - >>> s.rolling(3).var() - 0 NaN - 1 NaN - 2 0.333333 - 3 1.000000 - 4 1.000000 - 5 1.333333 - 6 0.000000 - dtype: float64 - - >>> s.expanding(3).var() - 0 NaN - 1 NaN - 2 0.333333 - 3 0.916667 - 4 0.800000 - 5 0.700000 - 6 0.619048 - dtype: float64 - """ - ) - def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) + kwargs.pop("require_min_periods", None) + window_func = partial(self._get_cython_func_type("roll_var"), ddof=ddof) + # ddof passed again for compat with groupby.rolling return self._apply( - "roll_var", "var", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + window_func, + center=self.center, + require_min_periods=1, + name="var", + ddof=ddof, + **kwargs, ) _shared_docs[ @@ -1329,8 +1485,14 @@ def var(self, ddof=1, *args, **kwargs): """ def skew(self, **kwargs): + window_func = self._get_cython_func_type("roll_skew") + kwargs.pop("require_min_periods", None) return self._apply( - "roll_skew", "skew", check_minp=_require_min_periods(3), **kwargs + window_func, + center=self.center, + require_min_periods=3, + name="skew", + **kwargs, ) _shared_docs["kurt"] = dedent( @@ -1366,8 +1528,14 @@ def skew(self, **kwargs): ) def kurt(self, **kwargs): + window_func = self._get_cython_func_type("roll_kurt") + kwargs.pop("require_min_periods", None) return self._apply( - "roll_kurt", "kurt", check_minp=_require_min_periods(4), **kwargs + window_func, + center=self.center, + require_min_periods=4, + name="kurt", + **kwargs, ) _shared_docs["quantile"] = dedent( @@ -1390,7 +1558,7 @@ def kurt(self, **kwargs): * higher: `j`. * nearest: `i` or `j` whichever is nearest. * midpoint: (`i` + `j`) / 2. - **kwargs: + **kwargs For compatibility with other %(name)s methods. Has no effect on the result. 
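Since std is now wired as zsqrt of the roll_var kernel (see zsqrt_func above), the two shared-doc examples agree numerically:

    import numpy as np
    import pandas as pd

    s = pd.Series([5, 5, 6, 7, 5, 5, 5])
    np.allclose(s.rolling(3).std().dropna(), np.sqrt(s.rolling(3).var().dropna()))  # True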
@@ -1427,31 +1595,22 @@ def kurt(self, **kwargs): ) def quantile(self, quantile, interpolation="linear", **kwargs): - window = self._get_window() - index_as_array = self._get_index() - - def f(arg, *args, **kwargs): - minp = _use_window(self.min_periods, window) - if quantile == 1.0: - return libwindow.roll_max( - arg, window, minp, index_as_array, self.closed - ) - elif quantile == 0.0: - return libwindow.roll_min( - arg, window, minp, index_as_array, self.closed - ) - else: - return libwindow.roll_quantile( - arg, - window, - minp, - index_as_array, - self.closed, - quantile, - interpolation, - ) + if quantile == 1.0: + window_func = self._get_cython_func_type("roll_max") + elif quantile == 0.0: + window_func = self._get_cython_func_type("roll_min") + else: + window_func = partial( + self._get_roll_func("roll_quantile"), + win=self._get_window(), + quantile=quantile, + interpolation=interpolation, + ) - return self._apply(f, "quantile", quantile=quantile, **kwargs) + # Pass through for groupby.rolling + kwargs["quantile"] = quantile + kwargs["interpolation"] = interpolation + return self._apply(window_func, center=self.center, name="quantile", **kwargs) _shared_docs[ "cov" @@ -1566,12 +1725,11 @@ def _get_cov(X, Y): >>> v1 = [3, 3, 3, 5, 8] >>> v2 = [3, 4, 4, 4, 8] - >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits >>> # numpy returns a 2X2 array, the correlation coefficient >>> # is the number at entry [0][1] - >>> print(fmt.format(np.corrcoef(v1[:-1], v2[:-1])[0][1])) + >>> print(f"{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}") 0.333333 - >>> print(fmt.format(np.corrcoef(v1[1:], v2[1:])[0][1])) + >>> print(f"{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}") 0.916949 >>> s1 = pd.Series(v1) >>> s2 = pd.Series(v2) @@ -1642,17 +1800,18 @@ def _get_corr(a, b): class Rolling(_Rolling_and_Expanding): @cache_readonly - def is_datetimelike(self): + def is_datetimelike(self) -> bool: return isinstance( self._on, (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) ) @cache_readonly - def _on(self): + def _on(self) -> Index: if self.on is None: if self.axis == 0: return self.obj.index - elif self.axis == 1: + else: + # i.e. 
self.axis == 1 return self.obj.columns elif isinstance(self.on, Index): return self.on @@ -1660,9 +1819,9 @@ def _on(self): return Index(self.obj[self.on]) else: raise ValueError( - "invalid on specified as {0}, " + f"invalid on specified as {self.on}, " "must be a column (of DataFrame), an Index " - "or None".format(self.on) + "or None" ) def validate(self): @@ -1693,6 +1852,9 @@ def validate(self): if self.min_periods is None: self.min_periods = 1 + elif isinstance(self.window, BaseIndexer): + # Passed BaseIndexer subclass should handle all other rolling kwargs + return elif not is_integer(self.window): raise ValueError("window must be an integer") elif self.window < 0: @@ -1711,7 +1873,7 @@ def _validate_monotonic(self): formatted = self.on if self.on is None: formatted = "index" - raise ValueError("{0} must be monotonic".format(formatted)) + raise ValueError(f"{formatted} must be monotonic") def _validate_freq(self): """ @@ -1723,9 +1885,9 @@ def _validate_freq(self): return to_offset(self.window) except (TypeError, ValueError): raise ValueError( - "passed window {0} is not " + f"passed window {self.window} is not " "compatible with a datetimelike " - "index".format(self.window) + "index" ) _agg_see_also_doc = dedent( @@ -1803,14 +1965,30 @@ def count(self): # different impl for freq counting if self.is_freq_type: - return self._apply("roll_count", "count") + window_func = self._get_roll_func("roll_count") + return self._apply(window_func, center=self.center, name="count") return super().count() @Substitution(name="rolling") @Appender(_shared_docs["apply"]) - def apply(self, func, raw=None, args=(), kwargs={}): - return super().apply(func, raw=raw, args=args, kwargs=kwargs) + def apply( + self, + func, + raw=False, + engine="cython", + engine_kwargs=None, + args=None, + kwargs=None, + ): + return super().apply( + func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) @Substitution(name="rolling") @Appender(_shared_docs["sum"]) @@ -1842,13 +2020,13 @@ def mean(self, *args, **kwargs): def median(self, **kwargs): return super().median(**kwargs) - @Substitution(name="rolling") + @Substitution(name="rolling", versionadded="") @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): nv.validate_rolling_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name="rolling") + @Substitution(name="rolling", versionadded="") @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): nv.validate_rolling_func("var", args, kwargs) @@ -1869,11 +2047,10 @@ def skew(self, **kwargs): four matching the equivalent function call using `scipy.stats`. >>> arr = [1, 2, 3, 4, 999] - >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits >>> import scipy.stats - >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) + >>> print(f"{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}") -1.200000 - >>> print(fmt.format(scipy.stats.kurtosis(arr[1:], bias=False))) + >>> print(f"{scipy.stats.kurtosis(arr[1:], bias=False):.6f}") 3.999946 >>> s = pd.Series(arr) >>> s.rolling(4).kurt() @@ -1914,7 +2091,7 @@ def corr(self, other=None, pairwise=None, **kwargs): Rolling.__doc__ = Window.__doc__ -class RollingGroupby(_GroupByMixin, Rolling): +class RollingGroupby(WindowGroupByMixin, Rolling): """ Provide a rolling groupby implementation. 
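RollingGroupby (now built on WindowGroupByMixin) is what backs groupby-then-roll calls such as the following (illustrative example, not from this diff):

    import pandas as pd

    df = pd.DataFrame({"g": ["a", "a", "b", "b"], "x": [1.0, 2.0, 3.0, 4.0]})
    df.groupby("g")["x"].rolling(2).sum()  # rolling window applied within each group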
""" diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 883af5c2e62f0..ebe9a3d5bf472 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -156,28 +156,29 @@ class MergeError(ValueError): class AccessorRegistrationWarning(Warning): - """Warning for attribute conflicts in accessor registration.""" + """ + Warning for attribute conflicts in accessor registration. + """ class AbstractMethodError(NotImplementedError): - """Raise this error instead of NotImplementedError for abstract methods + """ + Raise this error instead of NotImplementedError for abstract methods while keeping compatibility with Python 2 and Python 3. """ def __init__(self, class_instance, methodtype="method"): types = {"method", "classmethod", "staticmethod", "property"} if methodtype not in types: - msg = "methodtype must be one of {}, got {} instead.".format( - methodtype, types + raise ValueError( + f"methodtype must be one of {methodtype}, got {types} instead." ) - raise ValueError(msg) self.methodtype = methodtype self.class_instance = class_instance - def __str__(self): + def __str__(self) -> str: if self.methodtype == "classmethod": name = self.class_instance.__name__ else: - name = self.class_instance.__class__.__name__ - msg = "This {methodtype} must be defined in the concrete class {name}" - return msg.format(methodtype=self.methodtype, name=name) + name = type(self.class_instance).__name__ + return f"This {self.methodtype} must be defined in the concrete class {name}" diff --git a/pandas/io/api.py b/pandas/io/api.py index 725e82604ca7f..2d25ffe5f8a6b 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -10,7 +10,7 @@ from pandas.io.gbq import read_gbq from pandas.io.html import read_html from pandas.io.json import read_json -from pandas.io.packers import read_msgpack, to_msgpack +from pandas.io.orc import read_orc from pandas.io.parquet import read_parquet from pandas.io.parsers import read_csv, read_fwf, read_table from pandas.io.pickle import read_pickle, to_pickle diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index caa928731fb3a..f808b7e706afb 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -1,7 +1,8 @@ """ Pyperclip -A cross-platform clipboard module for Python. (only handles plain text for now) +A cross-platform clipboard module for Python, +with copy & paste functions for plain text. By Al Sweigart al@inventwithpython.com BSD License @@ -10,102 +11,584 @@ pyperclip.copy('The text to be copied to the clipboard.') spam = pyperclip.paste() - if not pyperclip.copy: + if not pyperclip.is_available(): print("Copy functionality unavailable!") On Windows, no additional modules are needed. -On Mac, the module uses pbcopy and pbpaste, which should come with the os. +On Mac, the pyobjc module is used, falling back to the pbcopy and pbpaste cli + commands. (These commands should come with OS X.). On Linux, install xclip or xsel via package manager. For example, in Debian: -sudo apt-get install xclip + sudo apt-get install xclip + sudo apt-get install xsel -Otherwise on Linux, you will need the qtpy or PyQt modules installed. -qtpy also requires a python-qt-bindings module: PyQt4, PyQt5, PySide, PySide2 +Otherwise on Linux, you will need the PyQt5 modules installed. This module does not work with PyGObject yet. + +Cygwin is currently not supported. 
+ +Security Note: This module runs programs with these names: + - which + - where + - pbcopy + - pbpaste + - xclip + - xsel + - klipper + - qdbus +A malicious user could rename or add programs with these names, tricking +Pyperclip into running them with whatever permissions the Python process has. + """ -__version__ = "1.5.27" +__version__ = "1.7.0" +import contextlib +import ctypes +from ctypes import c_size_t, c_wchar, c_wchar_p, get_errno, sizeof import os import platform import subprocess +import time +import warnings -from .clipboards import ( - init_klipper_clipboard, - init_no_clipboard, - init_osx_clipboard, - init_qt_clipboard, - init_xclip_clipboard, - init_xsel_clipboard, -) -from .windows import init_windows_clipboard - -# `import qtpy` sys.exit()s if DISPLAY is not in the environment. +# `import PyQt4` sys.exit()s if DISPLAY is not in the environment. # Thus, we need to detect the presence of $DISPLAY manually -# and not load qtpy if it is absent. +# and not load PyQt4 if it is absent. HAS_DISPLAY = os.getenv("DISPLAY", False) -CHECK_CMD = "where" if platform.system() == "Windows" else "which" + +EXCEPT_MSG = """ + Pyperclip could not find a copy/paste mechanism for your system. + For more information, please visit + https://pyperclip.readthedocs.io/en/latest/introduction.html#not-implemented-error + """ + +ENCODING = "utf-8" + +# The "which" unix command finds where a command is. +if platform.system() == "Windows": + WHICH_CMD = "where" +else: + WHICH_CMD = "which" def _executable_exists(name): return ( subprocess.call( - [CHECK_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE + [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) == 0 ) +# Exceptions +class PyperclipException(RuntimeError): + pass + + +class PyperclipWindowsException(PyperclipException): + def __init__(self, message): + message += f" ({ctypes.WinError()})" + super().__init__(message) + + +def _stringifyText(text) -> str: + acceptedTypes = (str, int, float, bool) + if not isinstance(text, acceptedTypes): + raise PyperclipException( + f"only str, int, float, and bool values " + f"can be copied to the clipboard, not {type(text).__name__}" + ) + return str(text) + + +def init_osx_pbcopy_clipboard(): + def copy_osx_pbcopy(text): + text = _stringifyText(text) # Converts non-str values to str. + p = subprocess.Popen(["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode(ENCODING)) + + def paste_osx_pbcopy(): + p = subprocess.Popen(["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True) + stdout, stderr = p.communicate() + return stdout.decode(ENCODING) + + return copy_osx_pbcopy, paste_osx_pbcopy + + +def init_osx_pyobjc_clipboard(): + def copy_osx_pyobjc(text): + """Copy string argument to clipboard""" + text = _stringifyText(text) # Converts non-str values to str. 
+ newStr = Foundation.NSString.stringWithString_(text).nsstring() + newData = newStr.dataUsingEncoding_(Foundation.NSUTF8StringEncoding) + board = AppKit.NSPasteboard.generalPasteboard() + board.declareTypes_owner_([AppKit.NSStringPboardType], None) + board.setData_forType_(newData, AppKit.NSStringPboardType) + + def paste_osx_pyobjc(): + "Returns contents of clipboard" + board = AppKit.NSPasteboard.generalPasteboard() + content = board.stringForType_(AppKit.NSStringPboardType) + return content + + return copy_osx_pyobjc, paste_osx_pyobjc + + +def init_qt_clipboard(): + global QApplication + # $DISPLAY should exist + + # Try to import from qtpy, but if that fails try PyQt5 then PyQt4 + try: + from qtpy.QtWidgets import QApplication + except ImportError: + try: + from PyQt5.QtWidgets import QApplication + except ImportError: + from PyQt4.QtGui import QApplication + + app = QApplication.instance() + if app is None: + app = QApplication([]) + + def copy_qt(text): + text = _stringifyText(text) # Converts non-str values to str. + cb = app.clipboard() + cb.setText(text) + + def paste_qt() -> str: + cb = app.clipboard() + return str(cb.text()) + + return copy_qt, paste_qt + + +def init_xclip_clipboard(): + DEFAULT_SELECTION = "c" + PRIMARY_SELECTION = "p" + + def copy_xclip(text, primary=False): + text = _stringifyText(text) # Converts non-str values to str. + selection = DEFAULT_SELECTION + if primary: + selection = PRIMARY_SELECTION + p = subprocess.Popen( + ["xclip", "-selection", selection], stdin=subprocess.PIPE, close_fds=True + ) + p.communicate(input=text.encode(ENCODING)) + + def paste_xclip(primary=False): + selection = DEFAULT_SELECTION + if primary: + selection = PRIMARY_SELECTION + p = subprocess.Popen( + ["xclip", "-selection", selection, "-o"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + close_fds=True, + ) + stdout, stderr = p.communicate() + # Intentionally ignore extraneous output on stderr when clipboard is empty + return stdout.decode(ENCODING) + + return copy_xclip, paste_xclip + + +def init_xsel_clipboard(): + DEFAULT_SELECTION = "-b" + PRIMARY_SELECTION = "-p" + + def copy_xsel(text, primary=False): + text = _stringifyText(text) # Converts non-str values to str. + selection_flag = DEFAULT_SELECTION + if primary: + selection_flag = PRIMARY_SELECTION + p = subprocess.Popen( + ["xsel", selection_flag, "-i"], stdin=subprocess.PIPE, close_fds=True + ) + p.communicate(input=text.encode(ENCODING)) + + def paste_xsel(primary=False): + selection_flag = DEFAULT_SELECTION + if primary: + selection_flag = PRIMARY_SELECTION + p = subprocess.Popen( + ["xsel", selection_flag, "-o"], stdout=subprocess.PIPE, close_fds=True + ) + stdout, stderr = p.communicate() + return stdout.decode(ENCODING) + + return copy_xsel, paste_xsel + + +def init_klipper_clipboard(): + def copy_klipper(text): + text = _stringifyText(text) # Converts non-str values to str. 
+ p = subprocess.Popen( + [ + "qdbus", + "org.kde.klipper", + "/klipper", + "setClipboardContents", + text.encode(ENCODING), + ], + stdin=subprocess.PIPE, + close_fds=True, + ) + p.communicate(input=None) + + def paste_klipper(): + p = subprocess.Popen( + ["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"], + stdout=subprocess.PIPE, + close_fds=True, + ) + stdout, stderr = p.communicate() + + # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874 + # TODO: https://github.com/asweigart/pyperclip/issues/43 + clipboardContents = stdout.decode(ENCODING) + # even if blank, Klipper will append a newline at the end + assert len(clipboardContents) > 0 + # make sure that newline is there + assert clipboardContents.endswith("\n") + if clipboardContents.endswith("\n"): + clipboardContents = clipboardContents[:-1] + return clipboardContents + + return copy_klipper, paste_klipper + + +def init_dev_clipboard_clipboard(): + def copy_dev_clipboard(text): + text = _stringifyText(text) # Converts non-str values to str. + if text == "": + warnings.warn( + "Pyperclip cannot copy a blank string to the clipboard on Cygwin." + "This is effectively a no-op." + ) + if "\r" in text: + warnings.warn("Pyperclip cannot handle \\r characters on Cygwin.") + + with open("/dev/clipboard", "wt") as fo: + fo.write(text) + + def paste_dev_clipboard() -> str: + with open("/dev/clipboard", "rt") as fo: + content = fo.read() + return content + + return copy_dev_clipboard, paste_dev_clipboard + + +def init_no_clipboard(): + class ClipboardUnavailable: + def __call__(self, *args, **kwargs): + raise PyperclipException(EXCEPT_MSG) + + def __bool__(self) -> bool: + return False + + return ClipboardUnavailable(), ClipboardUnavailable() + + +# Windows-related clipboard functions: +class CheckedCall: + def __init__(self, f): + super().__setattr__("f", f) + + def __call__(self, *args): + ret = self.f(*args) + if not ret and get_errno(): + raise PyperclipWindowsException("Error calling " + self.f.__name__) + return ret + + def __setattr__(self, key, value): + setattr(self.f, key, value) + + +def init_windows_clipboard(): + global HGLOBAL, LPVOID, DWORD, LPCSTR, INT + global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE + from ctypes.wintypes import ( + HGLOBAL, + LPVOID, + DWORD, + LPCSTR, + INT, + HWND, + HINSTANCE, + HMENU, + BOOL, + UINT, + HANDLE, + ) + + windll = ctypes.windll + msvcrt = ctypes.CDLL("msvcrt") + + safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA) + safeCreateWindowExA.argtypes = [ + DWORD, + LPCSTR, + LPCSTR, + DWORD, + INT, + INT, + INT, + INT, + HWND, + HMENU, + HINSTANCE, + LPVOID, + ] + safeCreateWindowExA.restype = HWND + + safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow) + safeDestroyWindow.argtypes = [HWND] + safeDestroyWindow.restype = BOOL + + OpenClipboard = windll.user32.OpenClipboard + OpenClipboard.argtypes = [HWND] + OpenClipboard.restype = BOOL + + safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard) + safeCloseClipboard.argtypes = [] + safeCloseClipboard.restype = BOOL + + safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard) + safeEmptyClipboard.argtypes = [] + safeEmptyClipboard.restype = BOOL + + safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData) + safeGetClipboardData.argtypes = [UINT] + safeGetClipboardData.restype = HANDLE + + safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData) + safeSetClipboardData.argtypes = [UINT, HANDLE] + safeSetClipboardData.restype = HANDLE + + safeGlobalAlloc = 
CheckedCall(windll.kernel32.GlobalAlloc) + safeGlobalAlloc.argtypes = [UINT, c_size_t] + safeGlobalAlloc.restype = HGLOBAL + + safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock) + safeGlobalLock.argtypes = [HGLOBAL] + safeGlobalLock.restype = LPVOID + + safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock) + safeGlobalUnlock.argtypes = [HGLOBAL] + safeGlobalUnlock.restype = BOOL + + wcslen = CheckedCall(msvcrt.wcslen) + wcslen.argtypes = [c_wchar_p] + wcslen.restype = UINT + + GMEM_MOVEABLE = 0x0002 + CF_UNICODETEXT = 13 + + @contextlib.contextmanager + def window(): + """ + Context that provides a valid Windows hwnd. + """ + # we really just need the hwnd, so setting "STATIC" + # as predefined lpClass is just fine. + hwnd = safeCreateWindowExA( + 0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None + ) + try: + yield hwnd + finally: + safeDestroyWindow(hwnd) + + @contextlib.contextmanager + def clipboard(hwnd): + """ + Context manager that opens the clipboard and prevents + other applications from modifying the clipboard content. + """ + # We may not get the clipboard handle immediately because + # some other application is accessing it (?) + # We try for at least 500ms to get the clipboard. + t = time.time() + 0.5 + success = False + while time.time() < t: + success = OpenClipboard(hwnd) + if success: + break + time.sleep(0.01) + if not success: + raise PyperclipWindowsException("Error calling OpenClipboard") + + try: + yield + finally: + safeCloseClipboard() + + def copy_windows(text): + # This function is heavily based on + # http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard + + text = _stringifyText(text) # Converts non-str values to str. + + with window() as hwnd: + # http://msdn.com/ms649048 + # If an application calls OpenClipboard with hwnd set to NULL, + # EmptyClipboard sets the clipboard owner to NULL; + # this causes SetClipboardData to fail. + # => We need a valid hwnd to copy something. + with clipboard(hwnd): + safeEmptyClipboard() + + if text: + # http://msdn.com/ms649051 + # If the hMem parameter identifies a memory object, + # the object must have been allocated using the + # function with the GMEM_MOVEABLE flag. + count = wcslen(text) + 1 + handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar)) + locked_handle = safeGlobalLock(handle) + + ctypes.memmove( + c_wchar_p(locked_handle), + c_wchar_p(text), + count * sizeof(c_wchar), + ) + + safeGlobalUnlock(handle) + safeSetClipboardData(CF_UNICODETEXT, handle) + + def paste_windows(): + with clipboard(None): + handle = safeGetClipboardData(CF_UNICODETEXT) + if not handle: + # GetClipboardData may return NULL with errno == NO_ERROR + # if the clipboard is empty. + # (Also, it may return a handle to an empty buffer, + # but technically that's not empty) + return "" + return c_wchar_p(handle).value + + return copy_windows, paste_windows + + +def init_wsl_clipboard(): + def copy_wsl(text): + text = _stringifyText(text) # Converts non-str values to str. + p = subprocess.Popen(["clip.exe"], stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode(ENCODING)) + + def paste_wsl(): + p = subprocess.Popen( + ["powershell.exe", "-command", "Get-Clipboard"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + close_fds=True, + ) + stdout, stderr = p.communicate() + # WSL appends "\r\n" to the contents. 
+ return stdout[:-2].decode(ENCODING) + + return copy_wsl, paste_wsl + + +# Automatic detection of clipboard mechanisms +# and importing is done in deteremine_clipboard(): def determine_clipboard(): - # Determine the OS/platform and set - # the copy() and paste() functions accordingly. - if "cygwin" in platform.system().lower(): + """ + Determine the OS/platform and set the copy() and paste() functions + accordingly. + """ + + global Foundation, AppKit, qtpy, PyQt4, PyQt5 + + # Setup for the CYGWIN platform: + if ( + "cygwin" in platform.system().lower() + ): # Cygwin has a variety of values returned by platform.system(), + # such as 'CYGWIN_NT-6.1' # FIXME: pyperclip currently does not support Cygwin, # see https://github.com/asweigart/pyperclip/issues/55 - pass + if os.path.exists("/dev/clipboard"): + warnings.warn( + "Pyperclip's support for Cygwin is not perfect," + "see https://github.com/asweigart/pyperclip/issues/55" + ) + return init_dev_clipboard_clipboard() + + # Setup for the WINDOWS platform: elif os.name == "nt" or platform.system() == "Windows": return init_windows_clipboard() + + if platform.system() == "Linux": + with open("/proc/version", "r") as f: + if "Microsoft" in f.read(): + return init_wsl_clipboard() + + # Setup for the MAC OS X platform: if os.name == "mac" or platform.system() == "Darwin": - return init_osx_clipboard() + try: + import Foundation # check if pyobjc is installed + import AppKit + except ImportError: + return init_osx_pbcopy_clipboard() + else: + return init_osx_pyobjc_clipboard() + + # Setup for the LINUX platform: if HAS_DISPLAY: - # Determine which command/module is installed, if any. + if _executable_exists("xsel"): + return init_xsel_clipboard() + if _executable_exists("xclip"): + return init_xclip_clipboard() + if _executable_exists("klipper") and _executable_exists("qdbus"): + return init_klipper_clipboard() + try: - # qtpy is a small abstraction layer that lets you write - # applications using a single api call to either PyQt or PySide - # https://pypi.org/project/QtPy - import qtpy # noqa + # qtpy is a small abstraction layer that lets you write applications + # using a single api call to either PyQt or PySide. + # https://pypi.python.org/project/QtPy + import qtpy # check if qtpy is installed except ImportError: - # If qtpy isn't installed, fall back on importing PyQt5, or PyQt5 + # If qtpy isn't installed, fall back on importing PyQt4. try: - import PyQt5 # noqa + import PyQt5 # check if PyQt5 is installed except ImportError: try: - import PyQt4 # noqa + import PyQt4 # check if PyQt4 is installed except ImportError: - pass # fail fast for all non-ImportError exceptions. + pass # We want to fail fast for all non-ImportError exceptions. else: return init_qt_clipboard() else: return init_qt_clipboard() - pass else: return init_qt_clipboard() - if _executable_exists("xclip"): - return init_xclip_clipboard() - if _executable_exists("xsel"): - return init_xsel_clipboard() - if _executable_exists("klipper") and _executable_exists("qdbus"): - return init_klipper_clipboard() - return init_no_clipboard() def set_clipboard(clipboard): + """ + Explicitly sets the clipboard mechanism. The "clipboard mechanism" is how + the copy() and paste() functions interact with the operating system to + implement the copy/paste feature. 
The clipboard parameter must be one of: + - pbcopy + - pbobjc (default on Mac OS X) + - qt + - xclip + - xsel + - klipper + - windows (default on Windows) + - no (this is what is set when no clipboard mechanism can be found) + """ global copy, paste clipboard_types = { - "osx": init_osx_clipboard, - "qt": init_qt_clipboard, + "pbcopy": init_osx_pbcopy_clipboard, + "pyobjc": init_osx_pyobjc_clipboard, + "qt": init_qt_clipboard, # TODO - split this into 'qtpy', 'pyqt4', and 'pyqt5' "xclip": init_xclip_clipboard, "xsel": init_xsel_clipboard, "klipper": init_klipper_clipboard, @@ -113,13 +596,71 @@ def set_clipboard(clipboard): "no": init_no_clipboard, } + if clipboard not in clipboard_types: + allowed_clipboard_types = [repr(_) for _ in clipboard_types.keys()] + raise ValueError( + f"Argument must be one of {', '.join(allowed_clipboard_types)}" + ) + + # Sets pyperclip's copy() and paste() functions: copy, paste = clipboard_types[clipboard]() -copy, paste = determine_clipboard() +def lazy_load_stub_copy(text): + """ + A stub function for copy(), which will load the real copy() function when + called so that the real copy() function is used for later calls. + + This allows users to import pyperclip without having determine_clipboard() + automatically run, which will automatically select a clipboard mechanism. + This could be a problem if it selects, say, the memory-heavy PyQt4 module + but the user was just going to immediately call set_clipboard() to use a + different clipboard mechanism. + + The lazy loading this stub function implements gives the user a chance to + call set_clipboard() to pick another clipboard mechanism. Or, if the user + simply calls copy() or paste() without calling set_clipboard() first, + will fall back on whatever clipboard mechanism that determine_clipboard() + automatically chooses. + """ + global copy, paste + copy, paste = determine_clipboard() + return copy(text) + + +def lazy_load_stub_paste(): + """ + A stub function for paste(), which will load the real paste() function when + called so that the real paste() function is used for later calls. + + This allows users to import pyperclip without having determine_clipboard() + automatically run, which will automatically select a clipboard mechanism. + This could be a problem if it selects, say, the memory-heavy PyQt4 module + but the user was just going to immediately call set_clipboard() to use a + different clipboard mechanism. + + The lazy loading this stub function implements gives the user a chance to + call set_clipboard() to pick another clipboard mechanism. Or, if the user + simply calls copy() or paste() without calling set_clipboard() first, + will fall back on whatever clipboard mechanism that determine_clipboard() + automatically chooses. + """ + global copy, paste + copy, paste = determine_clipboard() + return paste() + + +def is_available() -> bool: + return copy != lazy_load_stub_copy and paste != lazy_load_stub_paste + + +# Initially, copy() and paste() are set to lazy loading wrappers which will +# set `copy` and `paste` to real functions the first time they're used, unless +# set_clipboard() or determine_clipboard() is called first. 
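As a usage sketch of the lazy-loading scheme described in these docstrings (pandas.io.clipboard is an internal module, and the xsel mechanism below is only an example that assumes the xsel binary is installed):

import pandas.io.clipboard as clip

# copy/paste start out as stubs; the first call runs determine_clipboard()
# and swaps in the real functions for subsequent calls
clip.copy("hello")
print(clip.paste())
print(clip.is_available())   # True once the real functions have been loaded

# or bypass autodetection and pick a mechanism explicitly;
# unknown names raise ValueError
clip.set_clipboard("xsel")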
+copy, paste = lazy_load_stub_copy, lazy_load_stub_paste -__all__ = ["copy", "paste"] +__all__ = ["copy", "paste", "set_clipboard", "determine_clipboard"] # pandas aliases clipboard_get = paste diff --git a/pandas/io/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py deleted file mode 100644 index cb4ed8ed549d0..0000000000000 --- a/pandas/io/clipboard/clipboards.py +++ /dev/null @@ -1,129 +0,0 @@ -import subprocess - -from .exceptions import PyperclipException - -EXCEPT_MSG = """ - Pyperclip could not find a copy/paste mechanism for your system. - For more information, please visit https://pyperclip.readthedocs.org """ - - -def init_osx_clipboard(): - def copy_osx(text): - p = subprocess.Popen(["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True) - p.communicate(input=text.encode("utf-8")) - - def paste_osx(): - p = subprocess.Popen(["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True) - stdout, stderr = p.communicate() - return stdout.decode("utf-8") - - return copy_osx, paste_osx - - -def init_qt_clipboard(): - # $DISPLAY should exist - - # Try to import from qtpy, but if that fails try PyQt5 then PyQt4 - try: - from qtpy.QtWidgets import QApplication - except ImportError: - try: - from PyQt5.QtWidgets import QApplication - except ImportError: - from PyQt4.QtGui import QApplication - - app = QApplication.instance() - if app is None: - app = QApplication([]) - - def copy_qt(text): - cb = app.clipboard() - cb.setText(text) - - def paste_qt(): - cb = app.clipboard() - return str(cb.text()) - - return copy_qt, paste_qt - - -def init_xclip_clipboard(): - def copy_xclip(text): - p = subprocess.Popen( - ["xclip", "-selection", "c"], stdin=subprocess.PIPE, close_fds=True - ) - p.communicate(input=text.encode("utf-8")) - - def paste_xclip(): - p = subprocess.Popen( - ["xclip", "-selection", "c", "-o"], stdout=subprocess.PIPE, close_fds=True - ) - stdout, stderr = p.communicate() - return stdout.decode("utf-8") - - return copy_xclip, paste_xclip - - -def init_xsel_clipboard(): - def copy_xsel(text): - p = subprocess.Popen( - ["xsel", "-b", "-i"], stdin=subprocess.PIPE, close_fds=True - ) - p.communicate(input=text.encode("utf-8")) - - def paste_xsel(): - p = subprocess.Popen( - ["xsel", "-b", "-o"], stdout=subprocess.PIPE, close_fds=True - ) - stdout, stderr = p.communicate() - return stdout.decode("utf-8") - - return copy_xsel, paste_xsel - - -def init_klipper_clipboard(): - def copy_klipper(text): - p = subprocess.Popen( - [ - "qdbus", - "org.kde.klipper", - "/klipper", - "setClipboardContents", - text.encode("utf-8"), - ], - stdin=subprocess.PIPE, - close_fds=True, - ) - p.communicate(input=None) - - def paste_klipper(): - p = subprocess.Popen( - ["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"], - stdout=subprocess.PIPE, - close_fds=True, - ) - stdout, stderr = p.communicate() - - # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874 - # TODO: https://github.com/asweigart/pyperclip/issues/43 - clipboardContents = stdout.decode("utf-8") - # even if blank, Klipper will append a newline at the end - assert len(clipboardContents) > 0 - # make sure that newline is there - assert clipboardContents.endswith("\n") - if clipboardContents.endswith("\n"): - clipboardContents = clipboardContents[:-1] - return clipboardContents - - return copy_klipper, paste_klipper - - -def init_no_clipboard(): - class ClipboardUnavailable: - def __call__(self, *args, **kwargs): - raise PyperclipException(EXCEPT_MSG) - - def __bool__(self): - return False - - return 
ClipboardUnavailable(), ClipboardUnavailable() diff --git a/pandas/io/clipboard/exceptions.py b/pandas/io/clipboard/exceptions.py deleted file mode 100644 index eaf5578b5cd1b..0000000000000 --- a/pandas/io/clipboard/exceptions.py +++ /dev/null @@ -1,11 +0,0 @@ -import ctypes - - -class PyperclipException(RuntimeError): - pass - - -class PyperclipWindowsException(PyperclipException): - def __init__(self, message): - message += " ({err})".format(err=ctypes.WinError()) - super().__init__(message) diff --git a/pandas/io/clipboard/windows.py b/pandas/io/clipboard/windows.py deleted file mode 100644 index 2935dfdc2ae19..0000000000000 --- a/pandas/io/clipboard/windows.py +++ /dev/null @@ -1,184 +0,0 @@ -""" -This module implements clipboard handling on Windows using ctypes. -""" -import contextlib -import ctypes -from ctypes import c_size_t, c_wchar, c_wchar_p, get_errno, sizeof -import time - -from .exceptions import PyperclipWindowsException - - -class CheckedCall: - def __init__(self, f): - super().__setattr__("f", f) - - def __call__(self, *args): - ret = self.f(*args) - if not ret and get_errno(): - raise PyperclipWindowsException("Error calling " + self.f.__name__) - return ret - - def __setattr__(self, key, value): - setattr(self.f, key, value) - - -def init_windows_clipboard(): - from ctypes.wintypes import ( - HGLOBAL, - LPVOID, - DWORD, - LPCSTR, - INT, - HWND, - HINSTANCE, - HMENU, - BOOL, - UINT, - HANDLE, - ) - - windll = ctypes.windll - msvcrt = ctypes.CDLL("msvcrt") - - safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA) - safeCreateWindowExA.argtypes = [ - DWORD, - LPCSTR, - LPCSTR, - DWORD, - INT, - INT, - INT, - INT, - HWND, - HMENU, - HINSTANCE, - LPVOID, - ] - safeCreateWindowExA.restype = HWND - - safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow) - safeDestroyWindow.argtypes = [HWND] - safeDestroyWindow.restype = BOOL - - OpenClipboard = windll.user32.OpenClipboard - OpenClipboard.argtypes = [HWND] - OpenClipboard.restype = BOOL - - safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard) - safeCloseClipboard.argtypes = [] - safeCloseClipboard.restype = BOOL - - safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard) - safeEmptyClipboard.argtypes = [] - safeEmptyClipboard.restype = BOOL - - safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData) - safeGetClipboardData.argtypes = [UINT] - safeGetClipboardData.restype = HANDLE - - safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData) - safeSetClipboardData.argtypes = [UINT, HANDLE] - safeSetClipboardData.restype = HANDLE - - safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc) - safeGlobalAlloc.argtypes = [UINT, c_size_t] - safeGlobalAlloc.restype = HGLOBAL - - safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock) - safeGlobalLock.argtypes = [HGLOBAL] - safeGlobalLock.restype = LPVOID - - safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock) - safeGlobalUnlock.argtypes = [HGLOBAL] - safeGlobalUnlock.restype = BOOL - - wcslen = CheckedCall(msvcrt.wcslen) - wcslen.argtypes = [c_wchar_p] - wcslen.restype = UINT - - GMEM_MOVEABLE = 0x0002 - CF_UNICODETEXT = 13 - - @contextlib.contextmanager - def window(): - """ - Context that provides a valid Windows hwnd. - """ - # we really just need the hwnd, so setting "STATIC" - # as predefined lpClass is just fine. 
- hwnd = safeCreateWindowExA( - 0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None - ) - try: - yield hwnd - finally: - safeDestroyWindow(hwnd) - - @contextlib.contextmanager - def clipboard(hwnd): - """ - Context manager that opens the clipboard and prevents - other applications from modifying the clipboard content. - """ - # We may not get the clipboard handle immediately because - # some other application is accessing it (?) - # We try for at least 500ms to get the clipboard. - t = time.time() + 0.5 - success = False - while time.time() < t: - success = OpenClipboard(hwnd) - if success: - break - time.sleep(0.01) - if not success: - raise PyperclipWindowsException("Error calling OpenClipboard") - - try: - yield - finally: - safeCloseClipboard() - - def copy_windows(text): - # This function is heavily based on - # http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard - with window() as hwnd: - # http://msdn.com/ms649048 - # If an application calls OpenClipboard with hwnd set to NULL, - # EmptyClipboard sets the clipboard owner to NULL; - # this causes SetClipboardData to fail. - # => We need a valid hwnd to copy something. - with clipboard(hwnd): - safeEmptyClipboard() - - if text: - # http://msdn.com/ms649051 - # If the hMem parameter identifies a memory object, - # the object must have been allocated using the - # function with the GMEM_MOVEABLE flag. - count = wcslen(text) + 1 - handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar)) - locked_handle = safeGlobalLock(handle) - - ctypes.memmove( - c_wchar_p(locked_handle), - c_wchar_p(text), - count * sizeof(c_wchar), - ) - - safeGlobalUnlock(handle) - safeSetClipboardData(CF_UNICODETEXT, handle) - - def paste_windows(): - with clipboard(None): - handle = safeGetClipboardData(CF_UNICODETEXT) - if not handle: - # GetClipboardData may return NULL with errno == NO_ERROR - # if the clipboard is empty. 
- # (Also, it may return a handle to an empty buffer, - # but technically that's not empty) - return "" - return c_wchar_p(handle).value - - return copy_windows, paste_windows diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 518b940ec5da3..34e8e03d8771e 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -69,8 +69,8 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover kwargs["engine"] = "python" elif len(sep) > 1 and kwargs.get("engine") == "c": warnings.warn( - "read_clipboard with regex separator does not work" - " properly with c engine" + "read_clipboard with regex separator does not work " + "properly with c engine" ) return read_csv(StringIO(text), sep=sep, **kwargs) diff --git a/pandas/io/common.py b/pandas/io/common.py index 0bef14e4999c7..771a302d647ec 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,27 +1,13 @@ """Common IO api utilities""" import bz2 -import codecs -import csv +from collections import abc import gzip from io import BufferedIOBase, BytesIO import mmap import os import pathlib -from typing import ( - IO, - Any, - AnyStr, - BinaryIO, - Dict, - List, - Mapping, - Optional, - TextIO, - Tuple, - Type, - Union, -) +from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union from urllib.parse import ( # noqa urlencode, urljoin, @@ -32,6 +18,7 @@ ) import zipfile +from pandas._typing import FilePathOrBuffer from pandas.compat import _get_lzma_file, _import_lzma from pandas.errors import ( # noqa AbstractMethodError, @@ -43,54 +30,14 @@ from pandas.core.dtypes.common import is_file_like -from pandas._typing import FilePathOrBuffer - lzma = _import_lzma() -# gh-12665: Alias for now and remove later. -CParserError = ParserError - -# common NA values -# no longer excluding inf representations -# '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = { - "-1.#IND", - "1.#QNAN", - "1.#IND", - "-1.#QNAN", - "#N/A N/A", - "#N/A", - "N/A", - "n/a", - "NA", - "#NA", - "NULL", - "null", - "NaN", - "-NaN", - "nan", - "-nan", - "", -} - _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") -class BaseIterator: - """Subclass this and provide a "__next__()" method to obtain an iterator. - Useful only when the object being iterated is non-reusable (e.g. OK for a - parser, not for an in-memory table, yes for its iterator).""" - - def __iter__(self) -> "BaseIterator": - return self - - def __next__(self): - raise AbstractMethodError(self) - - -def _is_url(url) -> bool: +def is_url(url) -> bool: """ Check to see if a URL has a valid protocol. @@ -109,7 +56,7 @@ def _is_url(url) -> bool: def _expand_user( - filepath_or_buffer: FilePathOrBuffer[AnyStr] + filepath_or_buffer: FilePathOrBuffer[AnyStr], ) -> FilePathOrBuffer[AnyStr]: """Return the argument with an initial component of ~ or ~user replaced by that user's home directory. @@ -128,7 +75,7 @@ def _expand_user( return filepath_or_buffer -def _validate_header_arg(header) -> None: +def validate_header_arg(header) -> None: if isinstance(header, bool): raise TypeError( "Passing a bool to header is invalid. " @@ -138,8 +85,8 @@ def _validate_header_arg(header) -> None: ) -def _stringify_path( - filepath_or_buffer: FilePathOrBuffer[AnyStr] +def stringify_path( + filepath_or_buffer: FilePathOrBuffer[AnyStr], ) -> FilePathOrBuffer[AnyStr]: """Attempt to convert a path-like object to a string. 
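These helpers are being promoted from underscore-prefixed names to public ones; a short sketch of the intended call pattern (the paths and URL are placeholders, and pandas.io.common remains an internal module):

from pathlib import Path

from pandas.io.common import infer_compression, is_url, stringify_path

path = stringify_path(Path("data") / "table.csv.gz")   # path-like -> str
assert isinstance(path, str)
assert infer_compression(path, compression="infer") == "gzip"
assert is_url("https://example.com/table.csv")
assert not is_url(path)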
@@ -219,9 +166,9 @@ def get_filepath_or_buffer( compression, str, should_close, bool) """ - filepath_or_buffer = _stringify_path(filepath_or_buffer) + filepath_or_buffer = stringify_path(filepath_or_buffer) - if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer): + if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": @@ -249,8 +196,8 @@ def get_filepath_or_buffer( return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): - msg = "Invalid file path or buffer object type: {_type}" - raise ValueError(msg.format(_type=type(filepath_or_buffer))) + msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" + raise ValueError(msg) return filepath_or_buffer, None, compression, False @@ -276,7 +223,7 @@ def file_path_to_url(path: str) -> str: _compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} -def _get_compression_method( +def get_compression_method( compression: Optional[Union[str, Mapping[str, str]]] ) -> Tuple[Optional[str], Dict[str, str]]: """ @@ -309,7 +256,7 @@ def _get_compression_method( return compression, compression_args -def _infer_compression( +def infer_compression( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] ) -> Optional[str]: """ @@ -343,7 +290,7 @@ def _infer_compression( # Infer compression if compression == "infer": # Convert all path types (e.g. pathlib.Path) to strings - filepath_or_buffer = _stringify_path(filepath_or_buffer) + filepath_or_buffer = stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): # Cannot infer compression of a buffer, assume no compression return None @@ -358,13 +305,13 @@ def _infer_compression( if compression in _compression_to_extension: return compression - msg = "Unrecognized compression type: {}".format(compression) + msg = f"Unrecognized compression type: {compression}" valid = ["infer", None] + sorted(_compression_to_extension) - msg += "\nValid compression types are {}".format(valid) + msg += f"\nValid compression types are {valid}" raise ValueError(msg) -def _get_handle( +def get_handle( path_or_buf, mode: str, encoding=None, @@ -418,16 +365,16 @@ def _get_handle( except ImportError: need_text_wrapping = BufferedIOBase # type: ignore - handles = list() # type: List[IO] + handles: List[IO] = list() f = path_or_buf # Convert pathlib.Path/py.path.local or string - path_or_buf = _stringify_path(path_or_buf) + path_or_buf = stringify_path(path_or_buf) is_path = isinstance(path_or_buf, str) - compression, compression_args = _get_compression_method(compression) + compression, compression_args = get_compression_method(compression) if is_path: - compression = _infer_compression(path_or_buf, compression) + compression = infer_compression(path_or_buf, compression) if compression: @@ -447,7 +394,7 @@ def _get_handle( # ZIP Compression elif compression == "zip": - zf = BytesZipFile(path_or_buf, mode, **compression_args) + zf = _BytesZipFile(path_or_buf, mode, **compression_args) # Ensure the container is closed as well. 
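get_compression_method (renamed here from _get_compression_method) accepts either a plain string or a mapping whose "method" key names the algorithm; a small illustration with made-up values:

from pandas.io.common import get_compression_method

# a bare string carries no extra arguments
assert get_compression_method("gzip") == ("gzip", {})

# a mapping must contain "method"; the remaining keys are forwarded to the
# underlying archive class (here, the member name inside the zip file)
method, args = get_compression_method({"method": "zip", "archive_name": "out.csv"})
assert (method, args) == ("zip", {"archive_name": "out.csv"})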
handles.append(zf) if zf.mode == "w": @@ -457,13 +404,11 @@ def _get_handle( if len(zip_names) == 1: f = zf.open(zip_names.pop()) elif len(zip_names) == 0: - raise ValueError( - "Zero files found in ZIP file {}".format(path_or_buf) - ) + raise ValueError(f"Zero files found in ZIP file {path_or_buf}") else: raise ValueError( "Multiple files found in ZIP file." - " Only one file per ZIP: {}".format(zip_names) + f" Only one file per ZIP: {zip_names}" ) # XZ Compression @@ -472,7 +417,7 @@ def _get_handle( # Unrecognized Compression else: - msg = "Unrecognized compression type: {}".format(compression) + msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) handles.append(f) @@ -500,7 +445,7 @@ def _get_handle( if memory_map and hasattr(f, "fileno"): try: - wrapped = MMapWrapper(f) + wrapped = _MMapWrapper(f) f.close() f = wrapped except Exception: @@ -513,7 +458,7 @@ def _get_handle( return f, handles -class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore +class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore """ Wrapper for standard library class ZipFile and allow the returned file-like handle to accept byte strings via `write` method. @@ -528,7 +473,7 @@ def __init__( file: FilePathOrBuffer, mode: str, archive_name: Optional[str] = None, - **kwargs + **kwargs, ): if mode in ["wb", "rb"]: mode = mode.replace("b", "") @@ -546,7 +491,7 @@ def closed(self): return self.fp is None -class MMapWrapper(BaseIterator): +class _MMapWrapper(abc.Iterator): """ Wrapper for the Python's mmap class so that it can be properly read in by Python's csv.reader class. @@ -565,7 +510,7 @@ def __init__(self, f: IO): def __getattr__(self, name: str): return getattr(self.mmap, name) - def __iter__(self) -> "MMapWrapper": + def __iter__(self) -> "_MMapWrapper": return self def __next__(self) -> str: @@ -581,37 +526,3 @@ def __next__(self) -> str: if newline == "": raise StopIteration return newline - - -class UTF8Recoder(BaseIterator): - """ - Iterator that reads an encoded stream and re-encodes the input to UTF-8 - """ - - def __init__(self, f: BinaryIO, encoding: str): - self.reader = codecs.getreader(encoding)(f) - - def read(self, bytes: int = -1) -> bytes: - return self.reader.read(bytes).encode("utf-8") - - def readline(self) -> bytes: - return self.reader.readline().encode("utf-8") - - def next(self) -> bytes: - return next(self.reader).encode("utf-8") - - def close(self): - self.reader.close() - - -# Keeping these class for now because it provides a necessary convenience -# for "dropping" the "encoding" argument from our I/O arguments when -# creating a Unicode I/O object. 
-def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): - return csv.reader(f, dialect=dialect, **kwds) - - -def UnicodeWriter( - f: TextIO, dialect: Type[csv.Dialect] = csv.excel, encoding: str = "utf-8", **kwds -): - return csv.writer(f, dialect=dialect, **kwds) diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index ab64bc14344f1..7fdca2d65b05d 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -57,8 +57,8 @@ def _check_columns(cols): for i, n in enumerate(map(len, tail)): if n != N: raise AssertionError( - "All columns must have the same length: {0}; " - "column {1} has length {2}".format(N, i, n) + f"All columns must have the same length: {N}; " + f"column {i} has length {n}" ) return N diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8574c9ad1d425..04015a08bce2f 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,5 +1,4 @@ import abc -from collections import OrderedDict from datetime import date, datetime, timedelta from io import BytesIO import os @@ -7,20 +6,20 @@ from pandas._config import config +from pandas._libs.parsers import STR_NA_VALUES from pandas.errors import EmptyDataError -from pandas.util._decorators import Appender, deprecate_kwarg +from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like from pandas.core.frame import DataFrame from pandas.io.common import ( - _NA_VALUES, - _is_url, - _stringify_path, - _validate_header_arg, get_filepath_or_buffer, + is_url, + stringify_path, urlopen, + validate_header_arg, ) from pandas.io.excel._util import ( _fill_mi_header, @@ -41,7 +40,7 @@ Parameters ---------- -io : str, ExcelFile, xlrd.Book, path object or file-like object +io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.xlsx``. @@ -79,14 +78,7 @@ subset of data is selected with ``usecols``, index_col is based on the subset. usecols : int, str, list-like, or callable default None - Return a subset of the columns. - * If None, then parse all columns. - * If int, then indicates last column to be parsed. - - .. deprecated:: 0.24.0 - Pass in a list of int instead from 0 to `usecols` inclusive. - * If str, then indicates comma separated list of Excel column letters and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of both sides. @@ -98,6 +90,8 @@ * If callable, then evaluate each column name against it and parse the column if the callable returns ``True``. + Returns a subset of the columns according to behavior above. + .. versionadded:: 0.24.0 squeeze : bool, default False @@ -130,7 +124,7 @@ Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '""" - + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + """'. keep_default_na : bool, default True Whether or not to include the default NaN values when parsing the data. @@ -165,8 +159,9 @@ result 'foo' If a column or index contains an unparseable date, the entire column or - index will be returned unaltered as an object data type. For non-standard - datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. 
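The usecols forms documented above (Excel column-letter ranges, lists of ints, or a callable) can be sketched as follows; "report.xlsx" and the sheet name are placeholders:

import pandas as pd

# letter ranges are inclusive on both ends
df = pd.read_excel("report.xlsx", sheet_name="Sheet1", usecols="A:C,E")

# equivalent list-of-int and callable forms
df = pd.read_excel("report.xlsx", usecols=[0, 1, 2, 4])
df = pd.read_excel("report.xlsx", usecols=lambda name: not str(name).startswith("unused"))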
+ index will be returned unaltered as an object data type. If you don`t want to + parse some cells as date just change their type in Excel to "Text". + For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates. date_parser : function, optional @@ -188,11 +183,6 @@ Comments out remainder of line. Pass a character or characters to this argument to indicate comments in the input file. Any data between the comment string and the end of the current line is ignored. -skip_footer : int, default 0 - Alias of `skipfooter`. - - .. deprecated:: 0.23.0 - Use `skipfooter` instead. skipfooter : int, default 0 Rows at the end to skip (0-indexed). convert_float : bool, default True @@ -277,7 +267,6 @@ @Appender(_read_excel_doc) -@deprecate_kwarg("skip_footer", "skipfooter") def read_excel( io, sheet_name=0, @@ -300,18 +289,15 @@ def read_excel( date_parser=None, thousands=None, comment=None, - skip_footer=0, skipfooter=0, convert_float=True, mangle_dupe_cols=True, - **kwds + **kwds, ): for arg in ("sheet", "sheetname", "parse_cols"): if arg in kwds: - raise TypeError( - "read_excel() got an unexpected keyword argument `{}`".format(arg) - ) + raise TypeError(f"read_excel() got an unexpected keyword argument `{arg}`") if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -344,14 +330,14 @@ def read_excel( skipfooter=skipfooter, convert_float=convert_float, mangle_dupe_cols=mangle_dupe_cols, - **kwds + **kwds, ) class _BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer): # If filepath_or_buffer is a url, load the data into a BytesIO - if _is_url(filepath_or_buffer): + if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) @@ -364,6 +350,8 @@ def __init__(self, filepath_or_buffer): self.book = self.load_workbook(filepath_or_buffer) elif isinstance(filepath_or_buffer, str): self.book = self.load_workbook(filepath_or_buffer) + elif isinstance(filepath_or_buffer, bytes): + self.book = self.load_workbook(BytesIO(filepath_or_buffer)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." @@ -417,10 +405,10 @@ def parse( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - **kwds + **kwds, ): - _validate_header_arg(header) + validate_header_arg(header) ret_dict = False @@ -435,13 +423,13 @@ def parse( sheets = [sheet_name] # handle same-type duplicates. - sheets = list(OrderedDict.fromkeys(sheets).keys()) + sheets = list(dict.fromkeys(sheets).keys()) - output = OrderedDict() + output = {} for asheetname in sheets: if verbose: - print("Reading sheet {sheet}".format(sheet=asheetname)) + print(f"Reading sheet {asheetname}") if isinstance(asheetname, str): sheet = self.get_sheet_by_name(asheetname) @@ -517,7 +505,7 @@ def parse( skipfooter=skipfooter, usecols=usecols, mangle_dupe_cols=mangle_dupe_cols, - **kwds + **kwds, ) output[asheetname] = parser.read(nrows=nrows) @@ -540,8 +528,10 @@ def parse( class ExcelWriter(metaclass=abc.ABCMeta): """ - Class for writing DataFrame objects into excel sheets, default is to use - xlwt for xls, openpyxl for xlsx. See DataFrame.to_excel for typical usage. + Class for writing DataFrame objects into excel sheets. + + Default is to use xlwt for xls, openpyxl for xlsx. + See DataFrame.to_excel for typical usage. 
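A short usage sketch of the ExcelWriter class described above (file names are placeholders; the append example assumes openpyxl is installed, since only that engine supports mode="a"):

import pandas as pd

df = pd.DataFrame({"when": pd.to_datetime(["2020-01-01"]), "value": [1.5]})

with pd.ExcelWriter(
    "out.xlsx",
    date_format="YYYY-MM-DD",
    datetime_format="YYYY-MM-DD HH:MM:SS",
) as writer:
    df.to_excel(writer, sheet_name="data", index=False)

# append a second sheet to the existing workbook
with pd.ExcelWriter("out.xlsx", mode="a", engine="openpyxl") as writer:
    df.to_excel(writer, sheet_name="more", index=False)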
Parameters ---------- @@ -552,10 +542,10 @@ class ExcelWriter(metaclass=abc.ABCMeta): ``io.excel..writer``. NOTE: can only be passed as a keyword argument. date_format : str, default None - Format string for dates written into Excel files (e.g. 'YYYY-MM-DD') + Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). datetime_format : str, default None - Format string for datetime objects written into Excel files - (e.g. 'YYYY-MM-DD HH:MM:SS') + Format string for datetime objects written into Excel files. + (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {'w', 'a'}, default 'w' File mode to use (write or append). @@ -634,11 +624,11 @@ def __new__(cls, path, engine=None, **kwargs): ext = "xlsx" try: - engine = config.get_option("io.excel.{ext}.writer".format(ext=ext)) + engine = config.get_option(f"io.excel.{ext}.writer") if engine == "auto": engine = _get_default_writer(ext) except KeyError: - raise ValueError("No engine for filetype: '{ext}'".format(ext=ext)) + raise ValueError(f"No engine for filetype: '{ext}'") cls = get_writer(engine) return object.__new__(cls) @@ -694,7 +684,7 @@ def __init__( date_format=None, datetime_format=None, mode="w", - **engine_kwargs + **engine_kwargs, ): # validate that this engine can handle the extension if isinstance(path, str): @@ -720,7 +710,7 @@ def __init__( self.mode = mode def __fspath__(self): - return _stringify_path(self.path) + return stringify_path(self.path) def _get_sheet_name(self, sheet_name): if sheet_name is None: @@ -769,9 +759,8 @@ def check_extension(cls, ext): if ext.startswith("."): ext = ext[1:] if not any(ext in extension for extension in cls.supported_extensions): - msg = "Invalid extension for engine '{engine}': '{ext}'".format( - engine=pprint_thing(cls.engine), ext=pprint_thing(ext) - ) + msg = "Invalid extension for engine" + f"'{pprint_thing(cls.engine)}': '{pprint_thing(ext)}'" raise ValueError(msg) else: return True @@ -814,13 +803,13 @@ def __init__(self, io, engine=None): if engine is None: engine = "xlrd" if engine not in self._engines: - raise ValueError("Unknown engine: {engine}".format(engine=engine)) + raise ValueError(f"Unknown engine: {engine}") self.engine = engine # could be a str, ExcelFile, Book, etc. self.io = io # Always a string - self._io = _stringify_path(io) + self._io = stringify_path(io) self._reader = self._engines[engine](self._io) @@ -848,7 +837,7 @@ def parse( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - **kwds + **kwds, ): """ Parse specified sheet(s) into a DataFrame. 
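And the reader-side counterpart, ExcelFile, used as a context manager (the file name is a placeholder); the close()/__del__ changes later in this diff are what make the descriptor cleanup reliable:

import pandas as pd

with pd.ExcelFile("report.xlsx") as xls:
    print(xls.sheet_names)
    frames = {name: xls.parse(name) for name in xls.sheet_names}
# close() is also called from __del__, so file handles are not leaked
# even when the context manager form is not used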
@@ -886,7 +875,7 @@ def parse( skipfooter=skipfooter, convert_float=convert_float, mangle_dupe_cols=mangle_dupe_cols, - **kwds + **kwds, ) @property @@ -899,6 +888,12 @@ def sheet_names(self): def close(self): """close io if necessary""" + if self.engine == "openpyxl": + # https://stackoverflow.com/questions/31416842/ + # openpyxl-does-not-close-excel-workbook-in-read-only-mode + wb = self.book + wb._archive.close() + if hasattr(self.io, "close"): self.io.close() @@ -907,3 +902,11 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self.close() + + def __del__(self): + # Ensure we don't leak file descriptors, but put in try/except in case + # attributes are already deleted + try: + self.close() + except AttributeError: + pass diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 3a67f8306fff1..ec5f6fcb17ff8 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,9 +1,9 @@ from typing import List +from pandas._typing import FilePathOrBuffer, Scalar from pandas.compat._optional import import_optional_dependency import pandas as pd -from pandas._typing import FilePathOrBuffer, Scalar from pandas.io.excel._base import _BaseExcelReader @@ -61,7 +61,7 @@ def get_sheet_by_name(self, name: str): if table.getAttribute("name") == name: return table - raise ValueError("sheet {} not found".format(name)) + raise ValueError(f"sheet {name} not found") def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: """Parse an ODF Table into a list of lists @@ -76,12 +76,12 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: empty_rows = 0 max_row_len = 0 - table = [] # type: List[List[Scalar]] + table: List[List[Scalar]] = [] for i, sheet_row in enumerate(sheet_rows): sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 - table_row = [] # type: List[Scalar] + table_row: List[Scalar] = [] for j, sheet_cell in enumerate(sheet_cells): if sheet_cell.qname == table_cell_name: @@ -156,7 +156,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: # GH5394 cell_value = float(cell.attributes.get((OFFICENS, "value"))) - if cell_value == 0.0 and str(cell) != cell_value: # NA handling + if cell_value == 0.0: # NA handling return str(cell) if convert_float: @@ -178,4 +178,4 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: elif cell_type == "time": return pd.to_datetime(str(cell)).time() else: - raise ValueError("Unrecognized type {}".format(cell_type)) + raise ValueError(f"Unrecognized type {cell_type}") diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index d8f5da5ab5bc6..be52523e486af 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -2,9 +2,8 @@ import numpy as np -from pandas.compat._optional import import_optional_dependency - from pandas._typing import FilePathOrBuffer, Scalar +from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ExcelWriter, _BaseExcelReader from pandas.io.excel._util import _validate_freeze_panes @@ -46,7 +45,8 @@ def save(self): @classmethod def _convert_to_style(cls, style_dict): """ - converts a style_dict to an openpyxl style object + Converts a style_dict to an openpyxl style object. 
+ Parameters ---------- style_dict : style dictionary to convert @@ -72,7 +72,8 @@ def _convert_to_style(cls, style_dict): def _convert_to_style_kwargs(cls, style_dict): """ Convert a style_dict to a set of kwargs suitable for initializing - or updating-on-copy an openpyxl v2 style object + or updating-on-copy an openpyxl v2 style object. + Parameters ---------- style_dict : dict @@ -83,6 +84,7 @@ def _convert_to_style_kwargs(cls, style_dict): 'alignment' 'number_format' 'protection' + Returns ------- style_kwargs : dict @@ -97,7 +99,7 @@ def _convert_to_style_kwargs(cls, style_dict): for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] - _conv_to_x = getattr(cls, "_convert_to_{k}".format(k=k), lambda x: None) + _conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None) new_v = _conv_to_x(v) if new_v: style_kwargs[k] = new_v @@ -107,7 +109,8 @@ def _convert_to_style_kwargs(cls, style_dict): @classmethod def _convert_to_color(cls, color_spec): """ - Convert ``color_spec`` to an openpyxl v2 Color object + Convert ``color_spec`` to an openpyxl v2 Color object. + Parameters ---------- color_spec : str, dict @@ -120,6 +123,7 @@ def _convert_to_color(cls, color_spec): 'tint' 'index' 'type' + Returns ------- color : openpyxl.styles.Color @@ -135,7 +139,8 @@ def _convert_to_color(cls, color_spec): @classmethod def _convert_to_font(cls, font_dict): """ - Convert ``font_dict`` to an openpyxl v2 Font object + Convert ``font_dict`` to an openpyxl v2 Font object. + Parameters ---------- font_dict : dict @@ -154,6 +159,7 @@ def _convert_to_font(cls, font_dict): 'outline' 'shadow' 'condense' + Returns ------- font : openpyxl.styles.Font @@ -185,11 +191,13 @@ def _convert_to_stop(cls, stop_seq): """ Convert ``stop_seq`` to a list of openpyxl v2 Color objects, suitable for initializing the ``GradientFill`` ``stop`` parameter. + Parameters ---------- stop_seq : iterable An iterable that yields objects suitable for consumption by ``_convert_to_color``. + Returns ------- stop : list of openpyxl.styles.Color @@ -200,7 +208,8 @@ def _convert_to_stop(cls, stop_seq): @classmethod def _convert_to_fill(cls, fill_dict): """ - Convert ``fill_dict`` to an openpyxl v2 Fill object + Convert ``fill_dict`` to an openpyxl v2 Fill object. + Parameters ---------- fill_dict : dict @@ -216,6 +225,7 @@ def _convert_to_fill(cls, fill_dict): 'top' 'bottom' 'stop' + Returns ------- fill : openpyxl.styles.Fill @@ -262,7 +272,8 @@ def _convert_to_fill(cls, fill_dict): @classmethod def _convert_to_side(cls, side_spec): """ - Convert ``side_spec`` to an openpyxl v2 Side object + Convert ``side_spec`` to an openpyxl v2 Side object. + Parameters ---------- side_spec : str, dict @@ -270,6 +281,7 @@ def _convert_to_side(cls, side_spec): of the following keys (or their synonyms). 'style' ('border_style') 'color' + Returns ------- side : openpyxl.styles.Side @@ -295,7 +307,8 @@ def _convert_to_side(cls, side_spec): @classmethod def _convert_to_border(cls, border_dict): """ - Convert ``border_dict`` to an openpyxl v2 Border object + Convert ``border_dict`` to an openpyxl v2 Border object. 
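For orientation, the style dictionaries that these _convert_to_* helpers consume are nested per component, mirroring what CSSToExcelConverter produces; the dictionary below is purely hypothetical:

style_dict = {
    "font": {"bold": True, "color": "FF0000"},
    # "borders" is an accepted synonym for "border"
    "borders": {"bottom": {"style": "thin", "color": "000000"}},
    "alignment": {"horizontal": "center", "wrap_text": True},
}
# _convert_to_style_kwargs(style_dict) would turn this into openpyxl Font,
# Border and Alignment objects keyed by "font", "border" and "alignment".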
+ Parameters ---------- border_dict : dict @@ -311,6 +324,7 @@ def _convert_to_border(cls, border_dict): 'diagonalUp' ('diagonalup') 'diagonalDown' ('diagonaldown') 'outline' + Returns ------- border : openpyxl.styles.Border @@ -335,7 +349,8 @@ def _convert_to_border(cls, border_dict): @classmethod def _convert_to_alignment(cls, alignment_dict): """ - Convert ``alignment_dict`` to an openpyxl v2 Alignment object + Convert ``alignment_dict`` to an openpyxl v2 Alignment object. + Parameters ---------- alignment_dict : dict @@ -515,7 +530,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: - data = [] # type: List[List[Scalar]] + data: List[List[Scalar]] = [] for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 2ba3842d5c0c9..a084be54dfa10 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,5 +1,3 @@ -import warnings - from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_integer, is_list_like @@ -50,7 +48,7 @@ def get_writer(engine_name): try: return _writers[engine_name] except KeyError: - raise ValueError("No Excel writer '{engine}'".format(engine=engine_name)) + raise ValueError(f"No Excel writer '{engine_name}'") def _excel2num(x): @@ -78,7 +76,7 @@ def _excel2num(x): cp = ord(c) if cp < ord("A") or cp > ord("Z"): - raise ValueError("Invalid column name: {x}".format(x=x)) + raise ValueError(f"Invalid column name: {x}") index = index * 26 + cp - ord("A") + 1 @@ -136,16 +134,11 @@ def _maybe_convert_usecols(usecols): return usecols if is_integer(usecols): - warnings.warn( - ( - "Passing in an integer for `usecols` has been " - "deprecated. Please pass in a list of int from " - "0 to `usecols` inclusive instead." - ), - FutureWarning, - stacklevel=2, + raise ValueError( + "Passing an integer for `usecols` is no longer supported. " + "Please pass in a list of int from 0 to `usecols` " + "inclusive instead." ) - return list(range(usecols + 1)) if isinstance(usecols, str): return _range2cols(usecols) @@ -161,8 +154,8 @@ def _validate_freeze_panes(freeze_panes): return True raise ValueError( - "freeze_panes must be of form (row, column)" - " where row and column are integers" + "freeze_panes must be of form (row, column) " + "where row and column are integers" ) # freeze_panes wasn't specified, return False so it won't be applied diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 07bf265da4863..6d9ff9be5249a 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -168,7 +168,7 @@ def __init__( date_format=None, datetime_format=None, mode="w", - **engine_kwargs + **engine_kwargs, ): # Use the xlsxwriter module as the Excel writer. 
import xlsxwriter @@ -182,7 +182,7 @@ def __init__( date_format=date_format, datetime_format=datetime_format, mode=mode, - **engine_kwargs + **engine_kwargs, ) self.book = xlsxwriter.Workbook(path, **engine_kwargs) diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index fe3d0a208de6a..d102a885cef0a 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -77,7 +77,9 @@ def write_cells( wks.write(startrow + cell.row, startcol + cell.col, val, style) @classmethod - def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"): + def _style_to_xlwt( + cls, item, firstlevel: bool = True, field_sep=",", line_sep=";" + ) -> str: """helper which recursively generate an xlwt easy style string for example: @@ -95,20 +97,20 @@ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"): if hasattr(item, "items"): if firstlevel: it = [ - "{key}: {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + f"{key}: {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = "{sep} ".format(sep=(line_sep).join(it)) + out = f"{(line_sep).join(it)} " return out else: it = [ - "{key} {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + f"{key} {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = "{sep} ".format(sep=(field_sep).join(it)) + out = f"{(field_sep).join(it)} " return out else: - item = "{item}".format(item=item) + item = f"{item}" item = item.replace("True", "on") item = item.replace("False", "off") return item @@ -117,6 +119,7 @@ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"): def _convert_to_style(cls, style_dict, num_format_str=None): """ converts a style_dict to an xlwt style object + Parameters ---------- style_dict : style dictionary to convert diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index dd6519275ad15..eb05004d9137c 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,16 +1,13 @@ """ feather-format compat """ -from distutils.version import LooseVersion - from pandas.compat._optional import import_optional_dependency -from pandas.util._decorators import deprecate_kwarg from pandas import DataFrame, Int64Index, RangeIndex -from pandas.io.common import _stringify_path +from pandas.io.common import stringify_path -def to_feather(df, path): +def to_feather(df: DataFrame, path): """ Write a DataFrame to the feather-format @@ -23,7 +20,7 @@ def to_feather(df, path): import_optional_dependency("pyarrow") from pyarrow import feather - path = _stringify_path(path) + path = stringify_path(path) if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -37,10 +34,11 @@ def to_feather(df, path): # raise on anything else as we don't serialize the index if not isinstance(df.index, Int64Index): + typ = type(df.index) raise ValueError( - "feather does not support serializing {} " + f"feather does not support serializing {typ} " "for the index; you can .reset_index() " - "to make the index into column(s)".format(type(df.index)) + "to make the index into column(s)" ) if not df.index.equals(RangeIndex.from_range(range(len(df)))): @@ -66,8 +64,7 @@ def to_feather(df, path): feather.write_feather(df, path) -@deprecate_kwarg(old_arg_name="nthreads", new_arg_name="use_threads") -def read_feather(path, columns=None, use_threads=True): +def read_feather(path, columns=None, use_threads: bool = True): """ Load a feather-format object from the file path. 
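A minimal round trip with the feather functions above (requires pyarrow; the path is a placeholder):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})   # default RangeIndex, as to_feather requires
df.to_feather("frame.feather")
same = pd.read_feather("frame.feather", columns=["a"], use_threads=True)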
@@ -89,11 +86,6 @@ def read_feather(path, columns=None, use_threads=True): If not provided, all columns are read. .. versionadded:: 0.24.0 - nthreads : int, default 1 - Number of CPU threads to use when reading to pandas.DataFrame. - - .. versionadded:: 0.21.0 - .. deprecated:: 0.24.0 use_threads : bool, default True Whether to parallelize reading using multiple threads. @@ -103,15 +95,9 @@ def read_feather(path, columns=None, use_threads=True): ------- type of object stored in file """ - pyarrow = import_optional_dependency("pyarrow") + import_optional_dependency("pyarrow") from pyarrow import feather - path = _stringify_path(path) - - if LooseVersion(pyarrow.__version__) < LooseVersion("0.11.0"): - int_use_threads = int(use_threads) - if int_use_threads < 1: - int_use_threads = 1 - return feather.read_feather(path, columns=columns, nthreads=int_use_threads) + path = stringify_path(path) return feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index 7f8f2fbea2352..bed29e1fd4792 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -6,14 +6,14 @@ def get_console_size(): - """Return console size as tuple = (width, height). + """ + Return console size as tuple = (width, height). Returns (None,None) in non-interactive session. """ from pandas import get_option display_width = get_option("display.width") - # deprecated. display_height = get_option("display.max_rows") # Consider @@ -51,9 +51,13 @@ def get_console_size(): def in_interactive_session(): - """ check if we're running in an interactive shell + """ + Check if we're running in an interactive shell. - returns True if running under python/ipython interactive shell + Returns + ------- + bool + True if running under python/ipython interactive shell. """ from pandas import get_option @@ -72,7 +76,11 @@ def check_main(): def in_ipython_frontend(): """ - check if we're inside an an IPython zmq frontend + Check if we're inside an an IPython zmq frontend. + + Returns + ------- + bool """ try: ip = get_ipython() # noqa diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index 92fe87cddb35b..b40d2a57b8106 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -1,4 +1,5 @@ -"""Utilities for interpreting CSS from Stylers for formatting non-HTML outputs +""" +Utilities for interpreting CSS from Stylers for formatting non-HTML outputs. """ import re @@ -6,18 +7,37 @@ class CSSWarning(UserWarning): - """This CSS syntax cannot currently be parsed""" + """ + This CSS syntax cannot currently be parsed. + """ pass -class CSSResolver: - """A callable for parsing and resolving CSS to atomic properties +def _side_expander(prop_fmt: str): + def expand(self, prop, value: str): + tokens = value.split() + try: + mapping = self.SIDE_SHORTHANDS[len(tokens)] + except KeyError: + warnings.warn( + f'Could not expand "{prop}: {value}"', CSSWarning, + ) + return + for key, idx in zip(self.SIDES, mapping): + yield prop_fmt.format(key), tokens[idx] + return expand + + +class CSSResolver: + """ + A callable for parsing and resolving CSS to atomic properties. """ def __call__(self, declarations_str, inherited=None): - """ the given declarations to atomic properties + """ + The given declarations to atomic properties. Parameters ---------- @@ -30,8 +50,8 @@ def __call__(self, declarations_str, inherited=None): Returns ------- - props : dict - Atomic CSS 2.2 properties + dict + Atomic CSS 2.2 properties. 
Examples -------- @@ -53,7 +73,6 @@ def __call__(self, declarations_str, inherited=None): ('font-size', '24pt'), ('font-weight', 'bold')] """ - props = dict(self.atomize(self.parse(declarations_str))) if inherited is None: inherited = {} @@ -93,14 +112,14 @@ def __call__(self, declarations_str, inherited=None): # 3. TODO: resolve other font-relative units for side in self.SIDES: - prop = "border-{side}-width".format(side=side) + prop = f"border-{side}-width" if prop in props: props[prop] = self.size_to_pt( props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS ) for prop in [ - "margin-{side}".format(side=side), - "padding-{side}".format(side=side), + f"margin-{side}", + f"padding-{side}", ]: if prop in props: # TODO: support % @@ -156,7 +175,7 @@ def __call__(self, declarations_str, inherited=None): def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS): def _error(): - warnings.warn("Unhandled size: {val!r}".format(val=in_val), CSSWarning) + warnings.warn(f"Unhandled size: {repr(in_val)}", CSSWarning) return self.size_to_pt("1!!default", conversions=conversions) try: @@ -189,9 +208,9 @@ def _error(): val = round(val, 5) if int(val) == val: - size_fmt = "{fmt:d}pt".format(fmt=int(val)) + size_fmt = f"{int(val):d}pt" else: - size_fmt = "{fmt:f}pt".format(fmt=val) + size_fmt = f"{val:f}pt" return size_fmt def atomize(self, declarations): @@ -213,32 +232,21 @@ def atomize(self, declarations): } SIDES = ("top", "right", "bottom", "left") - def _side_expander(prop_fmt): - def expand(self, prop, value): - tokens = value.split() - try: - mapping = self.SIDE_SHORTHANDS[len(tokens)] - except KeyError: - warnings.warn( - 'Could not expand "{prop}: {val}"'.format(prop=prop, val=value), - CSSWarning, - ) - return - for key, idx in zip(self.SIDES, mapping): - yield prop_fmt.format(key), tokens[idx] - - return expand - expand_border_color = _side_expander("border-{:s}-color") expand_border_style = _side_expander("border-{:s}-style") expand_border_width = _side_expander("border-{:s}-width") expand_margin = _side_expander("margin-{:s}") expand_padding = _side_expander("padding-{:s}") - def parse(self, declarations_str): - """Generates (prop, value) pairs from declarations + def parse(self, declarations_str: str): + """ + Generates (prop, value) pairs from declarations. 
In a future version may generate parsed tokens from tinycss/tinycss2 + + Parameters + ---------- + declarations_str : str """ for decl in declarations_str.split(";"): if not decl.strip(): @@ -251,7 +259,6 @@ def parse(self, declarations_str): yield prop, val else: warnings.warn( - "Ill-formatted attribute: expected a colon " - "in {decl!r}".format(decl=decl), + f"Ill-formatted attribute: expected a colon in {repr(decl)}", CSSWarning, ) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e25862537cbfc..0d581f30e50e7 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,12 +5,14 @@ import csv as csvlib from io import StringIO import os +from typing import Hashable, List, Mapping, Optional, Sequence, Union import warnings from zipfile import ZipFile import numpy as np from pandas._libs import writers as libwriters +from pandas._typing import FilePathOrBuffer from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -21,11 +23,10 @@ from pandas.core.dtypes.missing import notna from pandas.io.common import ( - UnicodeWriter, - _get_compression_method, - _get_handle, - _infer_compression, + get_compression_method, get_filepath_or_buffer, + get_handle, + infer_compression, ) @@ -33,34 +34,33 @@ class CSVFormatter: def __init__( self, obj, - path_or_buf=None, - sep=",", - na_rep="", - float_format=None, + path_or_buf: Optional[FilePathOrBuffer[str]] = None, + sep: str = ",", + na_rep: str = "", + float_format: Optional[str] = None, cols=None, - header=True, - index=True, - index_label=None, - mode="w", - encoding=None, - compression="infer", - quoting=None, + header: Union[bool, Sequence[Hashable]] = True, + index: bool = True, + index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None, + mode: str = "w", + encoding: Optional[str] = None, + compression: Union[str, Mapping[str, str], None] = "infer", + quoting: Optional[int] = None, line_terminator="\n", - chunksize=None, + chunksize: Optional[int] = None, quotechar='"', - date_format=None, - doublequote=True, - escapechar=None, + date_format: Optional[str] = None, + doublequote: bool = True, + escapechar: Optional[str] = None, decimal=".", ): - self.obj = obj if path_or_buf is None: path_or_buf = StringIO() # Extract compression mode as given, if dict - compression, self.compression_args = _get_compression_method(compression) + compression, self.compression_args = get_compression_method(compression) self.path_or_buf, _, _, _ = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, mode=mode @@ -77,7 +77,7 @@ def __init__( if encoding is None: encoding = "utf-8" self.encoding = encoding - self.compression = _infer_compression(self.path_or_buf, compression) + self.compression = infer_compression(self.path_or_buf, compression) if quoting is None: quoting = csvlib.QUOTE_MINIMAL @@ -154,14 +154,17 @@ def __init__( if not index: self.nlevels = 0 - def save(self): + def save(self) -> None: """ - Create the writer & save + Create the writer & save. """ # GH21227 internal compression is not used when file-like passed. if self.compression and hasattr(self.path_or_buf, "write"): - msg = "compression has no effect when passing file-like object as input." - warnings.warn(msg, RuntimeWarning, stacklevel=2) + warnings.warn( + "compression has no effect when passing file-like object as input.", + RuntimeWarning, + stacklevel=2, + ) # when zip compression is called. 
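The compression parameter typed above as Union[str, Mapping[str, str], None] accepts the same mapping form as get_compression_method; a hedged sketch (output names are placeholders):

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

# plain string form
df.to_csv("out.csv.gz", compression="gzip")

# mapping form: "method" selects the algorithm, the remaining keys
# (here the member name inside the zip archive) go to the archiver
df.to_csv("out.zip", compression={"method": "zip", "archive_name": "out.csv"})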
is_zip = isinstance(self.path_or_buf, ZipFile) or ( @@ -178,7 +181,7 @@ def save(self): f = self.path_or_buf close = False else: - f, handles = _get_handle( + f, handles = get_handle( self.path_or_buf, self.mode, encoding=self.encoding, @@ -187,7 +190,9 @@ def save(self): close = True try: - writer_kwargs = dict( + # Note: self.encoding is irrelevant here + self.writer = csvlib.writer( + f, lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting, @@ -195,11 +200,6 @@ def save(self): escapechar=self.escapechar, quotechar=self.quotechar, ) - if self.encoding == "ascii": - self.writer = csvlib.writer(f, **writer_kwargs) - else: - writer_kwargs["encoding"] = self.encoding - self.writer = UnicodeWriter(f, **writer_kwargs) self._save() @@ -212,7 +212,7 @@ def save(self): else: compression = dict(self.compression_args, method=self.compression) - f, handles = _get_handle( + f, handles = get_handle( self.path_or_buf, self.mode, encoding=self.encoding, @@ -226,14 +226,13 @@ def save(self): _fh.close() def _save_header(self): - writer = self.writer obj = self.obj index_label = self.index_label cols = self.cols has_mi_columns = self.has_mi_columns header = self.header - encoded_labels = [] + encoded_labels: List[str] = [] has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass)) if not (has_aliases or self.header): @@ -241,10 +240,7 @@ def _save_header(self): if has_aliases: if len(header) != len(cols): raise ValueError( - ( - "Writing {ncols} cols but got {nalias} " - "aliases".format(ncols=len(cols), nalias=len(header)) - ) + f"Writing {len(cols)} cols but got {len(header)} aliases" ) else: write_cols = header @@ -309,8 +305,7 @@ def _save_header(self): encoded_labels.extend([""] * len(columns)) writer.writerow(encoded_labels) - def _save(self): - + def _save(self) -> None: self._save_header() nrows = len(self.data_index) @@ -327,8 +322,7 @@ def _save(self): self._save_chunk(start_i, end_i) - def _save_chunk(self, start_i, end_i): - + def _save_chunk(self, start_i: int, end_i: int) -> None: data_index = self.data_index # create the data for a chunk diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index b9c847ad64c57..9b0f100c1b041 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -4,6 +4,7 @@ from functools import reduce import itertools import re +from typing import Callable, Dict, List, Optional, Sequence, Union import warnings import numpy as np @@ -15,6 +16,7 @@ from pandas import Index import pandas.core.common as com +from pandas.io.common import stringify_path from pandas.io.formats.css import CSSResolver, CSSWarning from pandas.io.formats.format import get_level_lengths from pandas.io.formats.printing import pprint_thing @@ -24,7 +26,9 @@ class ExcelCell: __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend") __slots__ = __fields__ - def __init__(self, row, col, val, style=None, mergestart=None, mergeend=None): + def __init__( + self, row: int, col: int, val, style=None, mergestart=None, mergeend=None + ): self.row = row self.col = col self.val = val @@ -55,7 +59,7 @@ class CSSToExcelConverter: # instancemethods so that users can easily experiment with extensions # without monkey-patching. 
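As a usage sketch of the CSSToExcelConverter class introduced above (the CSS declarations are arbitrary examples):

from pandas.io.formats.excel import CSSToExcelConverter

convert = CSSToExcelConverter()
xlstyle = convert("font-weight: bold; text-align: center")
# the result is a nested dict keyed by "font", "border", "fill",
# "alignment" and "number_format", with None-valued entries pruned
assert xlstyle["font"]["bold"] is True
assert xlstyle["alignment"]["horizontal"] == "center"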
- def __init__(self, inherited=None): + def __init__(self, inherited: Optional[str] = None): if inherited is not None: inherited = self.compute_css(inherited) @@ -63,8 +67,9 @@ def __init__(self, inherited=None): compute_css = CSSResolver() - def __call__(self, declarations_str): - """Convert CSS declarations to ExcelWriter style + def __call__(self, declarations_str: str) -> Dict[str, Dict[str, str]]: + """ + Convert CSS declarations to ExcelWriter style. Parameters ---------- @@ -82,7 +87,7 @@ def __call__(self, declarations_str): properties = self.compute_css(declarations_str, self.inherited) return self.build_xlstyle(properties) - def build_xlstyle(self, props): + def build_xlstyle(self, props: Dict[str, str]) -> Dict[str, Dict[str, str]]: out = { "alignment": self.build_alignment(props), "border": self.build_border(props), @@ -90,9 +95,10 @@ def build_xlstyle(self, props): "font": self.build_font(props), "number_format": self.build_number_format(props), } + # TODO: handle cell width and height: needs support in pandas.io.excel - def remove_none(d): + def remove_none(d: Dict[str, str]) -> None: """Remove key where value is None, through nested dicts""" for k, v in list(d.items()): if v is None: @@ -115,7 +121,7 @@ def remove_none(d): # OpenXML also has 'justify', 'distributed' } - def build_alignment(self, props): + def build_alignment(self, props) -> Dict[str, Optional[Union[bool, str]]]: # TODO: text-indent, padding-left -> alignment.indent return { "horizontal": props.get("text-align"), @@ -127,21 +133,19 @@ def build_alignment(self, props): ), } - def build_border(self, props): + def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: return { side: { "style": self._border_style( - props.get("border-{side}-style".format(side=side)), - props.get("border-{side}-width".format(side=side)), - ), - "color": self.color_to_excel( - props.get("border-{side}-color".format(side=side)) + props.get(f"border-{side}-style"), + props.get(f"border-{side}-width"), ), + "color": self.color_to_excel(props.get(f"border-{side}-color")), } for side in ["top", "right", "bottom", "left"] } - def _border_style(self, style, width): + def _border_style(self, style: Optional[str], width): # convert styles and widths to openxml, one of: # 'dashDot' # 'dashDotDot' @@ -190,7 +194,7 @@ def _border_style(self, style, width): return "dashed" return "mediumDashed" - def build_fill(self, props): + def build_fill(self, props: Dict[str, str]): # TODO: perhaps allow for special properties # -excel-pattern-bgcolor and -excel-pattern-type fill_color = props.get("background-color") @@ -214,7 +218,7 @@ def build_fill(self, props): } ITALIC_MAP = {"normal": False, "italic": True, "oblique": True} - def build_font(self, props): + def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: size = props.get("font-size") if size is not None: assert size.endswith("pt") @@ -279,6 +283,7 @@ def build_font(self, props): if "text-shadow" in props else None ), + # FIXME: dont leave commented-out # 'vertAlign':, # 'charset': , # 'scheme': , @@ -309,7 +314,7 @@ def build_font(self, props): "white": "FFFFFF", } - def color_to_excel(self, val): + def color_to_excel(self, val: Optional[str]): if val is None: return None if val.startswith("#") and len(val) == 7: @@ -319,9 +324,9 @@ def color_to_excel(self, val): try: return self.NAMED_COLORS[val] except KeyError: - warnings.warn("Unhandled color format: {val!r}".format(val=val), CSSWarning) + warnings.warn(f"Unhandled color format: {repr(val)}", CSSWarning) - def 
build_number_format(self, props): + def build_number_format(self, props: Dict) -> Dict[str, Optional[str]]: return {"format_code": props.get("number-format")} @@ -364,15 +369,15 @@ class ExcelFormatter: def __init__( self, df, - na_rep="", - float_format=None, - cols=None, - header=True, - index=True, - index_label=None, - merge_cells=False, - inf_rep="inf", - style_converter=None, + na_rep: str = "", + float_format: Optional[str] = None, + cols: Optional[Sequence] = None, + header: Union[bool, List[str]] = True, + index: bool = True, + index_label: Union[str, Sequence, None] = None, + merge_cells: bool = False, + inf_rep: str = "inf", + style_converter: Optional[Callable] = None, ): self.rowcounter = 0 self.na_rep = na_rep @@ -391,16 +396,12 @@ def __init__( if not len(Index(cols) & df.columns): raise KeyError("passes columns are not ALL present dataframe") - # deprecatedin gh-17295 - # 1 missing is ok (for now) if len(Index(cols) & df.columns) != len(cols): - warnings.warn( - "Not all names specified in 'columns' are found; " - "this will raise a KeyError in the future", - FutureWarning, - ) + # Deprecated in GH#17295, enforced in 1.0.0 + raise KeyError("Not all names specified in 'columns' are found") + + self.df = df - self.df = df.reindex(columns=cols) self.columns = self.df.columns self.float_format = float_format self.index = index @@ -429,7 +430,7 @@ def _format_value(self, val): if missing.isposinf_scalar(val): val = self.inf_rep elif missing.isneginf_scalar(val): - val = "-{inf}".format(inf=self.inf_rep) + val = f"-{self.inf_rep}" elif self.float_format is not None: val = float(self.float_format % val) if getattr(val, "tzinfo", None) is not None: @@ -444,10 +445,8 @@ def _format_header_mi(self): if self.columns.nlevels > 1: if not self.index: raise NotImplementedError( - "Writing to Excel with MultiIndex" - " columns and no index " - "('index'=False) is not yet " - "implemented." + "Writing to Excel with MultiIndex columns and no " + "index ('index'=False) is not yet implemented." ) has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) @@ -511,8 +510,8 @@ def _format_header_regular(self): if has_aliases: if len(self.header) != len(self.columns): raise ValueError( - "Writing {cols} cols but got {alias} " - "aliases".format(cols=len(self.columns), alias=len(self.header)) + f"Writing {len(self.columns)} cols but got {len(self.header)} " + "aliases" ) else: colnames = self.header @@ -542,7 +541,6 @@ def _format_header(self): return itertools.chain(gen, gen2) def _format_body(self): - if isinstance(self.df.index, ABCMultiIndex): return self._format_hierarchical_rows() else: @@ -665,7 +663,7 @@ def _format_hierarchical_rows(self): for cell in self._generate_body(gcolidx): yield cell - def _generate_body(self, coloffset): + def _generate_body(self, coloffset: int): if self.styler is None: styles = None else: @@ -714,20 +712,18 @@ def write( and ``io.excel.xlsm.writer``. """ from pandas.io.excel import ExcelWriter - from pandas.io.common import _stringify_path num_rows, num_cols = self.df.shape if num_rows > self.max_rows or num_cols > self.max_cols: raise ValueError( - "This sheet is too large! Your sheet size is: " - + "{}, {} ".format(num_rows, num_cols) - + "Max sheet size is: {}, {}".format(self.max_rows, self.max_cols) + f"This sheet is too large! 
Your sheet size is: {num_rows}, {num_cols} " + f"Max sheet size is: {self.max_rows}, {self.max_cols}" ) if isinstance(writer, ExcelWriter): need_save = False else: - writer = ExcelWriter(_stringify_path(writer), engine=engine) + writer = ExcelWriter(stringify_path(writer), engine=engine) need_save = True formatted_cells = self.get_formatted_cells() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 17603809c2ea6..6adf69a922000 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -3,7 +3,6 @@ and latex files. This module also applies to display formatting. """ -import codecs from contextlib import contextmanager from datetime import tzinfo import decimal @@ -35,9 +34,11 @@ from pandas._config.config import get_option, set_option from pandas._libs import lib +from pandas._libs.missing import NA from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType +from pandas._typing import FilePathOrBuffer from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import ( @@ -63,16 +64,15 @@ ) from pandas.core.dtypes.missing import isna, notna -from pandas._typing import FilePathOrBuffer from pandas.core.arrays.datetimes import DatetimeArray from pandas.core.arrays.timedeltas import TimedeltaArray from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.index import Index, ensure_index +from pandas.core.indexes.api import Index, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.io.common import _stringify_path +from pandas.io.common import stringify_path from pandas.io.formats.printing import adjoin, justify, pprint_thing if TYPE_CHECKING: @@ -231,7 +231,7 @@ def __init__( self, series: "Series", buf: Optional[IO[str]] = None, - length: bool = True, + length: Union[bool, str] = True, header: bool = True, index: bool = True, na_rep: str = "NaN", @@ -262,6 +262,8 @@ def __init__( def _chk_truncate(self) -> None: from pandas.core.reshape.concat import concat + self.tr_row_num: Optional[int] + min_rows = self.min_rows max_rows = self.max_rows # truncation determined by max_rows, actual truncated number of rows @@ -279,8 +281,10 @@ def _chk_truncate(self) -> None: series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = concat((series.iloc[:row_num], series.iloc[-row_num:])) - self.tr_row_num = row_num # type: Optional[int] + series = series._ensure_type( + concat((series.iloc[:row_num], series.iloc[-row_num:])) + ) + self.tr_row_num = row_num else: self.tr_row_num = None self.tr_series = series @@ -350,7 +354,7 @@ def to_string(self) -> str: if len(series) == 0: return "{name}([], {footer})".format( - name=self.series.__class__.__name__, footer=footer + name=type(self.series).__name__, footer=footer ) fmt_index, have_header = self._get_formatted_index() @@ -448,13 +452,13 @@ def _get_adjustment() -> TextAdjustment: class TableFormatter: - show_dimensions = None # type: bool - is_truncated = None # type: bool - formatters = None # type: formatters_type - columns = None # type: Index + show_dimensions: Union[bool, str] + is_truncated: bool + formatters: formatters_type + columns: Index @property - def should_show_dimensions(self) -> Optional[bool]: + def should_show_dimensions(self) -> bool: return self.show_dimensions is True or ( self.show_dimensions == "truncate" and self.is_truncated ) 
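# Illustrative sketch (not from this patch): the Union[bool, str] annotation
# above reflects that show_dimensions also accepts the string "truncate",
# meaning the shape line is printed only when the output is truncated.
import pandas as pd

df = pd.DataFrame({"a": range(100)})
with pd.option_context("display.max_rows", 6, "display.show_dimensions", "truncate"):
    print(df)  # truncated repr, so "[100 rows x 1 columns]" is appended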
@@ -480,7 +484,7 @@ def get_buffer( objects, otherwise yield buf unchanged. """ if buf is not None: - buf = _stringify_path(buf) + buf = stringify_path(buf) else: buf = StringIO() @@ -492,7 +496,11 @@ def get_buffer( if hasattr(buf, "write"): yield buf elif isinstance(buf, str): - with codecs.open(buf, "w", encoding=encoding) as f: + with open(buf, "w", encoding=encoding, newline="") as f: + # GH#30034 open instead of codecs.open prevents a file leak + # if we have an invalid encoding argument. + # newline="" is needed to roundtrip correctly on + # windows test_to_latex_filename yield f else: raise TypeError("buf is not a file name and it has no write method") @@ -548,7 +556,7 @@ def __init__( max_rows: Optional[int] = None, min_rows: Optional[int] = None, max_cols: Optional[int] = None, - show_dimensions: bool = False, + show_dimensions: Union[bool, str] = False, decimal: str = ".", table_id: Optional[str] = None, render_links: bool = False, @@ -571,8 +579,8 @@ def __init__( else: raise ValueError( ( - "Formatters length({flen}) should match" - " DataFrame number of columns({dlen})" + "Formatters length({flen}) should match " + "DataFrame number of columns({dlen})" ).format(flen=len(formatters), dlen=len(frame.columns)) ) self.na_rep = na_rep @@ -616,6 +624,8 @@ def _chk_truncate(self) -> None: # Cut the data to the information actually printed max_cols = self.max_cols max_rows = self.max_rows + self.max_rows_adj: Optional[int] + max_rows_adj: Optional[int] if max_cols == 0 or max_rows == 0: # assume we are in the terminal (w, h) = get_terminal_size() @@ -631,7 +641,7 @@ def _chk_truncate(self) -> None: self.header = cast(bool, self.header) n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row # rows available to fill with actual data - max_rows_adj = self.h - n_add_rows # type: Optional[int] + max_rows_adj = self.h - n_add_rows self.max_rows_adj = max_rows_adj # Format only rows and columns that could potentially fit the @@ -976,7 +986,7 @@ def to_html( ) def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: - from pandas.core.index import _sparsify + from pandas.core.indexes.multi import _sparsify columns = frame.columns @@ -1073,7 +1083,7 @@ def _get_formatted_index(self, frame: "DataFrame") -> List[str]: return adjoined def _get_column_name_list(self) -> List[str]: - names = [] # type: List[str] + names: List[str] = [] columns = self.frame.columns if isinstance(columns, ABCMultiIndex): names.extend("" if name is None else name for name in columns.names) @@ -1124,8 +1134,9 @@ def format_array( List[str] """ + fmt_klass: Type[GenericArrayFormatter] if is_datetime64_dtype(values.dtype): - fmt_klass = Datetime64Formatter # type: Type[GenericArrayFormatter] + fmt_klass = Datetime64Formatter elif is_datetime64tz_dtype(values): fmt_klass = Datetime64TZFormatter elif is_timedelta64_dtype(values.dtype): @@ -1218,6 +1229,8 @@ def _format(x): # determine na_rep if x is None or NaT-like if x is None: return "None" + elif x is NA: + return str(NA) elif x is NaT or np.isnat(x): return "NaT" except (TypeError, ValueError): @@ -1265,7 +1278,7 @@ class FloatArrayFormatter(GenericArrayFormatter): """ def __init__(self, *args, **kwargs): - GenericArrayFormatter.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) # float_format is expected to be a string # formatter should be used to pass a function @@ -1375,11 +1388,12 @@ def format_values_with(float_format): # There is a special default string when we are fixed-width # The default is otherwise to 
use str instead of a formatting string + float_format: Optional[float_format_type] if self.float_format is None: if self.fixed_width: float_format = partial( "{value: .{digits:d}f}".format, digits=self.digits - ) # type: Optional[float_format_type] + ) else: float_format = self.float_format else: @@ -1437,7 +1451,7 @@ def __init__( values: Union[np.ndarray, "Series", DatetimeIndex, DatetimeArray], nat_rep: str = "NaT", date_format: None = None, - **kwargs + **kwargs, ): super().__init__(values, **kwargs) self.nat_rep = nat_rep @@ -1628,7 +1642,7 @@ def _get_format_datetime64_from_values( """ given values and a date_format, return a string format """ if isinstance(values, np.ndarray) and values.ndim > 1: - # We don't actaully care about the order of values, and DatetimeIndex + # We don't actually care about the order of values, and DatetimeIndex # only accepts 1D values values = values.ravel() @@ -1658,7 +1672,7 @@ def __init__( values: Union[np.ndarray, TimedeltaIndex], nat_rep: str = "NaT", box: bool = False, - **kwargs + **kwargs, ): super().__init__(values, **kwargs) self.nat_rep = nat_rep diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 38f2e332017f0..b46b2f6c671d6 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -2,17 +2,18 @@ Module for formatting output data in HTML. """ -from collections import OrderedDict from textwrap import dedent from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast from pandas._config import get_option +from pandas._libs import lib + from pandas.core.dtypes.generic import ABCMultiIndex from pandas import option_context -from pandas.io.common import _is_url +from pandas.io.common import is_url from pandas.io.formats.format import ( DataFrameFormatter, TableFormatter, @@ -45,7 +46,7 @@ def __init__( self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns - self.elements = [] # type: List[str] + self.elements: List[str] = [] self.bold_rows = self.fmt.bold_rows self.escape = self.fmt.escape self.show_dimensions = self.fmt.show_dimensions @@ -109,12 +110,12 @@ def write_th( ---------- s : object The data to be written inside the cell. - header : boolean, default False + header : bool, default False Set to True if the . This will cause min-width to be set if there is one. indent : int, default 0 The indentation level of the cell. - tags : string, default None + tags : str, default None Tags to include in the cell. 
Returns @@ -140,15 +141,13 @@ def _write_cell( if self.escape: # escape & first to prevent double escaping of & - esc = OrderedDict( - [("&", r"&"), ("<", r"<"), (">", r">")] - ) # type: Union[OrderedDict[str, str], Dict] + esc = {"&": r"&", "<": r"<", ">": r">"} else: esc = {} rs = pprint_thing(s, escape_chars=esc).strip() - if self.render_links and _is_url(rs): + if self.render_links and is_url(rs): rs_unescaped = pprint_thing(s, escape_chars={}).strip() start_tag += ''.format(url=rs_unescaped) end_a = "" @@ -248,7 +247,7 @@ def _write_col_header(self, indent: int) -> None: if self.fmt.sparsify: # GH3547 - sentinel = object() + sentinel = lib.no_default else: sentinel = False levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False) @@ -408,7 +407,7 @@ def _write_regular_rows( else: index_values = self.fmt.tr_frame.index.format() - row = [] # type: List[str] + row: List[str] = [] for i in range(nrows): if truncate_v and i == (self.fmt.tr_row_num): @@ -454,7 +453,7 @@ def _write_hierarchical_rows( if self.fmt.sparsify: # GH3547 - sentinel = object() + sentinel = lib.no_default levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) level_lengths = get_level_lengths(levels, sentinel) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index ca9db88ae7be4..008a99427f3c7 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -11,8 +11,8 @@ class LatexFormatter(TableFormatter): - """ Used to render a DataFrame to a LaTeX tabular/longtable environment - output. + """ + Used to render a DataFrame to a LaTeX tabular/longtable environment output. Parameters ---------- @@ -106,18 +106,19 @@ def pad_empties(x): # Get rid of old multiindex column and add new ones strcols = out + strcols[1:] - column_format = self.column_format - if column_format is None: + if self.column_format is None: dtypes = self.frame.dtypes._values column_format = "".join(map(get_col_type, dtypes)) if self.fmt.index: index_format = "l" * self.frame.index.nlevels column_format = index_format + column_format - elif not isinstance(column_format, str): # pragma: no cover + elif not isinstance(self.column_format, str): # pragma: no cover raise AssertionError( "column_format must be str or unicode, " "not {typ}".format(typ=type(column_format)) ) + else: + column_format = self.column_format if self.longtable: self._write_longtable_begin(buf, column_format) @@ -132,7 +133,7 @@ def pad_empties(x): if self.fmt.has_index_names and self.fmt.show_index_names: nlevels += 1 strrows = list(zip(*strcols)) - self.clinebuf = [] # type: List[List[int]] + self.clinebuf: List[List[int]] = [] for i, row in enumerate(strrows): if i == nlevels and self.fmt.header: @@ -265,7 +266,7 @@ def _format_multirow( def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: """ - Print clines after multirow-blocks are finished + Print clines after multirow-blocks are finished. """ for cl in self.clinebuf: if cl[0] == i: @@ -273,7 +274,7 @@ def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: # remove entries that have been written to buffer self.clinebuf = [x for x in self.clinebuf if x[0] != i] - def _write_tabular_begin(self, buf, column_format): + def _write_tabular_begin(self, buf, column_format: str): """ Write the beginning of a tabular environment or nested table/tabular environments including caption and label. @@ -283,11 +284,10 @@ def _write_tabular_begin(self, buf, column_format): buf : string or file handle File path or object. 
If not specified, the result is returned as a string. - column_format : str, default None + column_format : str The columns format as specified in `LaTeX table format `__ e.g 'rcl' for 3 columns - """ if self.caption is not None or self.label is not None: # then write output in a nested table/tabular environment @@ -327,7 +327,7 @@ def _write_tabular_end(self, buf): else: pass - def _write_longtable_begin(self, buf, column_format): + def _write_longtable_begin(self, buf, column_format: str): """ Write the beginning of a longtable environment including caption and label if provided by user. @@ -337,11 +337,10 @@ def _write_longtable_begin(self, buf, column_format): buf : string or file handle File path or object. If not specified, the result is returned as a string. - column_format : str, default None + column_format : str The columns format as specified in `LaTeX table format `__ e.g 'rcl' for 3 columns - """ buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 061103820ca83..4b5b5e9a0ce15 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -1,5 +1,5 @@ """ -printing tools +Printing tools. """ import sys @@ -182,13 +182,12 @@ def pprint_thing( replacements default_escapes : bool, default False Whether the input escape characters replaces or adds to the defaults - max_seq_items : False, int, default None - Pass thru to other pretty printers to limit sequence printing + max_seq_items : int or None, default None + Pass through to other pretty printers to limit sequence printing Returns ------- str - """ def as_escaped_string( @@ -312,7 +311,6 @@ def format_object_summary( Returns ------- summary string - """ from pandas.io.formats.console import get_console_size from pandas.io.formats.format import _get_adjustment @@ -321,12 +319,12 @@ def format_object_summary( if display_width is None: display_width = get_option("display.width") or 80 if name is None: - name = obj.__class__.__name__ + name = type(obj).__name__ if indent_for_name: name_len = len(name) - space1 = "\n%s" % (" " * (name_len + 1)) - space2 = "\n%s" % (" " * (name_len + 2)) + space1 = f'\n{(" " * (name_len + 1))}' + space2 = f'\n{(" " * (name_len + 2))}' else: space1 = "\n" space2 = "\n " # space for the opening '[' @@ -346,7 +344,9 @@ def format_object_summary( # adj can optionally handle unicode eastern asian width adj = _get_adjustment() - def _extend_line(s, line, value, display_width, next_line_prefix): + def _extend_line( + s: str, line: str, value: str, display_width: int, next_line_prefix: str + ) -> Tuple[str, str]: if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width: s += line.rstrip() @@ -354,7 +354,7 @@ def _extend_line(s, line, value, display_width, next_line_prefix): line += value return s, line - def best_len(values): + def best_len(values: List[str]) -> int: if values: return max(adj.len(x) for x in values) else: @@ -363,14 +363,14 @@ def best_len(values): close = ", " if n == 0: - summary = "[]{}".format(close) + summary = f"[]{close}" elif n == 1 and not line_break_each_value: first = formatter(obj[0]) - summary = "[{}]{}".format(first, close) + summary = f"[{first}]{close}" elif n == 2 and not line_break_each_value: first = formatter(obj[0]) last = formatter(obj[-1]) - summary = "[{}, {}]{}".format(first, last, close) + summary = f"[{first}, {last}]{close}" else: if n > max_seq_items: @@ -513,10 +513,10 @@ def format_object_attrs( list of 2-tuple """ - attrs = [] # 
type: List[Tuple[str, Union[str, int]]] + attrs: List[Tuple[str, Union[str, int]]] = [] if hasattr(obj, "dtype") and include_dtype: # error: "Sequence[Any]" has no attribute "dtype" - attrs.append(("dtype", "'{}'".format(obj.dtype))) # type: ignore + attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore if getattr(obj, "name", None) is not None: # error: "Sequence[Any]" has no attribute "name" attrs.append(("name", default_pprint(obj.name))) # type: ignore diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 545d6a674411a..8570875569e44 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -8,17 +8,18 @@ import copy from functools import partial from itertools import product +from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple from uuid import uuid1 import numpy as np from pandas._config import get_option +from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import Appender -from pandas.core.dtypes.common import is_float, is_string_like -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.common import is_float import pandas as pd from pandas.api.types import is_dict_like, is_list_like @@ -71,6 +72,11 @@ class Styler: The ``id`` takes the form ``T__row_col`` where ```` is the unique identifier, ```` is the row number and ```` is the column number. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + .. versionadded:: 1.0.0 Attributes ---------- @@ -126,9 +132,10 @@ def __init__( caption=None, table_attributes=None, cell_ids=True, + na_rep: Optional[str] = None, ): - self.ctx = defaultdict(list) - self._todo = [] + self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) + self._todo: List[Tuple[Callable, Tuple, Dict]] = [] if not isinstance(data, (pd.Series, pd.DataFrame)): raise TypeError("``data`` must be a Series or DataFrame") @@ -149,19 +156,24 @@ def __init__( self.precision = precision self.table_attributes = table_attributes self.hidden_index = False - self.hidden_columns = [] + self.hidden_columns: Sequence[int] = [] self.cell_ids = cell_ids + self.na_rep = na_rep # display_funcs maps (row, col) -> formatting function def default_display_func(x): - if is_float(x): - display_format = "{0:.{precision}f}".format(x, precision=self.precision) + if self.na_rep is not None and pd.isna(x): + return self.na_rep + elif is_float(x): + display_format = f"{x:.{self.precision}f}" return display_format else: return x - self._display_funcs = defaultdict(lambda: default_display_func) + self._display_funcs: DefaultDict[ + Tuple[int, int], Callable[[Any], str] + ] = defaultdict(lambda: default_display_func) def _repr_html_(self): """ @@ -244,7 +256,7 @@ def _translate(self): BLANK_VALUE = "" def format_attr(pair): - return "{key}={value}".format(**pair) + return f"{pair['key']}={pair['value']}" # for sparsifying a MultiIndex idx_lengths = _get_level_lengths(self.index) @@ -282,7 +294,7 @@ def format_attr(pair): name = self.data.columns.names[r] cs = [ BLANK_CLASS if name is None else INDEX_NAME_CLASS, - "level{lvl}".format(lvl=r), + f"level{r}", ] name = BLANK_VALUE if name is None else name row_es.append( @@ -299,8 +311,8 @@ def format_attr(pair): for c, value in enumerate(clabels[r]): cs = [ COL_HEADING_CLASS, - "level{lvl}".format(lvl=r), - "col{col}".format(col=c), + f"level{r}", + f"col{c}", ] cs.extend( cell_context.get("col_headings", 
{}).get(r, {}).get(c, []) @@ -328,7 +340,7 @@ def format_attr(pair): index_header_row = [] for c, name in enumerate(self.data.index.names): - cs = [INDEX_NAME_CLASS, "level{lvl}".format(lvl=c)] + cs = [INDEX_NAME_CLASS, f"level{c}"] name = "" if name is None else name index_header_row.append( {"type": "th", "value": name, "class": " ".join(cs)} @@ -347,8 +359,8 @@ def format_attr(pair): for c, value in enumerate(rlabels[r]): rid = [ ROW_HEADING_CLASS, - "level{lvl}".format(lvl=c), - "row{row}".format(row=r), + f"level{c}", + f"row{r}", ] es = { "type": "th", @@ -366,7 +378,7 @@ def format_attr(pair): row_es.append(es) for c, col in enumerate(self.data.columns): - cs = [DATA_CLASS, "row{row}".format(row=r), "col{col}".format(col=c)] + cs = [DATA_CLASS, f"row{r}", f"col{c}"] cs.extend(cell_context.get("data", {}).get(r, {}).get(c, [])) formatter = self._display_funcs[(r, c)] value = self.data.iloc[r, c] @@ -388,12 +400,7 @@ def format_attr(pair): props.append(x.split(":")) else: props.append(["", ""]) - cellstyle.append( - { - "props": props, - "selector": "row{row}_col{col}".format(row=r, col=c), - } - ) + cellstyle.append({"props": props, "selector": f"row{r}_col{c}"}) body.append(row_es) table_attr = self.table_attributes @@ -416,16 +423,22 @@ def format_attr(pair): table_attributes=table_attr, ) - def format(self, formatter, subset=None): + def format(self, formatter, subset=None, na_rep: Optional[str] = None): """ Format the text display value of cells. Parameters ---------- - formatter : str, callable, or dict + formatter : str, callable, dict or None + If ``formatter`` is None, the default formatter is used subset : IndexSlice An argument to ``DataFrame.loc`` that restricts which elements ``formatter`` is applied to. + na_rep : str, optional + Representation for missing values. + If ``na_rep`` is None, no special formatting is applied + + .. 
versionadded:: 1.0.0 Returns ------- @@ -451,6 +464,10 @@ def format(self, formatter, subset=None): >>> df['c'] = ['a', 'b', 'c', 'd'] >>> df.style.format({'c': str.upper}) """ + if formatter is None: + assert self._display_funcs.default_factory is not None + formatter = self._display_funcs.default_factory() + if subset is None: row_locs = range(len(self.data)) col_locs = range(len(self.data.columns)) @@ -466,16 +483,16 @@ def format(self, formatter, subset=None): if is_dict_like(formatter): for col, col_formatter in formatter.items(): # formatter must be callable, so '{}' are converted to lambdas - col_formatter = _maybe_wrap_formatter(col_formatter) + col_formatter = _maybe_wrap_formatter(col_formatter, na_rep) col_num = self.data.columns.get_indexer_for([col])[0] for row_num in row_locs: self._display_funcs[(row_num, col_num)] = col_formatter else: # single scalar to format all cells with + formatter = _maybe_wrap_formatter(formatter, na_rep) locs = product(*(row_locs, col_locs)) for i, j in locs: - formatter = _maybe_wrap_formatter(formatter) self._display_funcs[(i, j)] = formatter return self @@ -553,6 +570,7 @@ def _copy(self, deepcopy=False): caption=self.caption, uuid=self.uuid, table_styles=self.table_styles, + na_rep=self.na_rep, ) if deepcopy: styler.ctx = copy.deepcopy(self.ctx) @@ -605,29 +623,25 @@ def _apply(self, func, axis=0, subset=None, **kwargs): result = func(data, **kwargs) if not isinstance(result, pd.DataFrame): raise TypeError( - "Function {func!r} must return a DataFrame when " - "passed to `Styler.apply` with axis=None".format(func=func) + f"Function {repr(func)} must return a DataFrame when " + f"passed to `Styler.apply` with axis=None" ) if not ( result.index.equals(data.index) and result.columns.equals(data.columns) ): - msg = ( - "Result of {func!r} must have identical index and " - "columns as the input".format(func=func) + raise ValueError( + f"Result of {repr(func)} must have identical " + f"index and columns as the input" ) - raise ValueError(msg) result_shape = result.shape expected_shape = self.data.loc[subset].shape if result_shape != expected_shape: - msg = ( - "Function {func!r} returned the wrong shape.\n" - "Result has shape: {res}\n" - "Expected shape: {expect}".format( - func=func, res=result.shape, expect=expected_shape - ) + raise ValueError( + f"Function {repr(func)} returned the wrong shape.\n" + f"Result has shape: {result.shape}\n" + f"Expected shape: {expected_shape}" ) - raise ValueError(msg) self._update_ctx(result) return self @@ -896,6 +910,23 @@ def set_table_styles(self, table_styles): self.table_styles = table_styles return self + def set_na_rep(self, na_rep: str) -> "Styler": + """ + Set the missing data representation on a Styler. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + na_rep : str + + Returns + ------- + self : Styler + """ + self.na_rep = na_rep + return self + def hide_index(self): """ Hide any indices from rendering. @@ -936,9 +967,7 @@ def hide_columns(self, subset): @staticmethod def _highlight_null(v, null_color): - return ( - "background-color: {color}".format(color=null_color) if pd.isna(v) else "" - ) + return f"background-color: {null_color}" if pd.isna(v) else "" def highlight_null(self, null_color="red"): """ @@ -963,6 +992,8 @@ def background_gradient( axis=0, subset=None, text_color_threshold=0.408, + vmin: Optional[float] = None, + vmax: Optional[float] = None, ): """ Color the background in a gradient style. @@ -991,6 +1022,18 @@ def background_gradient( .. 
versionadded:: 0.24.0 + vmin : float, optional + Minimum data value that corresponds to colormap minimum value. + When None (default): the minimum value of the data will be used. + + .. versionadded:: 1.0.0 + + vmax : float, optional + Maximum data value that corresponds to colormap maximum value. + When None (default): the maximum value of the data will be used. + + .. versionadded:: 1.0.0 + Returns ------- self : Styler @@ -1017,11 +1060,21 @@ def background_gradient( low=low, high=high, text_color_threshold=text_color_threshold, + vmin=vmin, + vmax=vmax, ) return self @staticmethod - def _background_gradient(s, cmap="PuBu", low=0, high=0, text_color_threshold=0.408): + def _background_gradient( + s, + cmap="PuBu", + low=0, + high=0, + text_color_threshold=0.408, + vmin: Optional[float] = None, + vmax: Optional[float] = None, + ): """ Color background in a range according to the data. """ @@ -1033,14 +1086,14 @@ def _background_gradient(s, cmap="PuBu", low=0, high=0, text_color_threshold=0.4 raise ValueError(msg) with _mpl(Styler.background_gradient) as (plt, colors): - smin = s.values.min() - smax = s.values.max() + smin = np.nanmin(s.to_numpy()) if vmin is None else vmin + smax = np.nanmax(s.to_numpy()) if vmax is None else vmax rng = smax - smin # extend lower / upper bounds, compresses color range norm = colors.Normalize(smin - (rng * low), smax + (rng * high)) # matplotlib colors.Normalize modifies inplace? # https://github.com/matplotlib/matplotlib/issues/5427 - rgbas = plt.cm.get_cmap(cmap)(norm(s.values)) + rgbas = plt.cm.get_cmap(cmap)(norm(s.to_numpy(dtype=float))) def relative_luminance(rgba): """ @@ -1067,9 +1120,7 @@ def relative_luminance(rgba): def css(rgba): dark = relative_luminance(rgba) < text_color_threshold text_color = "#f1f1f1" if dark else "#000000" - return "background-color: {b};color: {c};".format( - b=colors.rgb2hex(rgba), c=text_color - ) + return f"background-color: {colors.rgb2hex(rgba)};color: {text_color};" if s.ndim == 1: return [css(rgba) for rgba in rgbas] @@ -1101,7 +1152,7 @@ def set_properties(self, subset=None, **kwargs): >>> df.style.set_properties(color="white", align="right") >>> df.style.set_properties(**{'background-color': 'yellow'}) """ - values = ";".join("{p}: {v}".format(p=p, v=v) for p, v in kwargs.items()) + values = ";".join(f"{p}: {v}" for p, v in kwargs.items()) f = lambda x: values return self.applymap(f, subset=subset) @@ -1111,12 +1162,8 @@ def _bar(s, align, colors, width=100, vmin=None, vmax=None): Draw bar chart in dataframe cells. """ # Get input value range. 
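# Illustrative sketch (not from this patch): vmin/vmax pin the ends of the
# colormap or bar scale instead of the data extremes computed with
# np.nanmin/np.nanmax; jinja2 is needed for Styler and matplotlib for
# background_gradient. The values below are placeholders.
import pandas as pd

df = pd.DataFrame({"x": [0.1, 0.5, 2.5]})
styled = df.style.background_gradient(cmap="PuBu", vmin=0.0, vmax=1.0)
styled = styled.bar(color=["#d65f5f", "#5fba7d"], vmin=0.0, vmax=1.0)
html = styled.render()  # styles are applied when rendering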
- smin = s.min() if vmin is None else vmin - if isinstance(smin, ABCSeries): - smin = smin.min() - smax = s.max() if vmax is None else vmax - if isinstance(smax, ABCSeries): - smax = smax.max() + smin = np.nanmin(s.to_numpy()) if vmin is None else vmin + smax = np.nanmax(s.to_numpy()) if vmax is None else vmax if align == "mid": smin = min(0, smin) smax = max(0, smax) @@ -1125,7 +1172,7 @@ def _bar(s, align, colors, width=100, vmin=None, vmax=None): smax = max(abs(smin), abs(smax)) smin = -smax # Transform to percent-range of linear-gradient - normed = width * (s.values - smin) / (smax - smin + 1e-12) + normed = width * (s.to_numpy(dtype=float) - smin) / (smax - smin + 1e-12) zero = -width * smin / (smax - smin + 1e-12) def css_bar(start, end, color): @@ -1136,12 +1183,9 @@ def css_bar(start, end, color): if end > start: css += "background: linear-gradient(90deg," if start > 0: - css += " transparent {s:.1f}%, {c} {s:.1f}%, ".format( - s=start, c=color - ) - css += "{c} {e:.1f}%, transparent {e:.1f}%)".format( - e=min(end, width), c=color - ) + css += f" transparent {start:.1f}%, {color} {start:.1f}%, " + e = min(end, width) + css += f"{color} {e:.1f}%, transparent {e:.1f}%)" return css def css(x): @@ -1229,9 +1273,9 @@ def bar( color = [color[0], color[0]] elif len(color) > 2: raise ValueError( - "`color` must be string or a list-like" - " of length 2: [`color_neg`, `color_pos`]" - " (eg: color=['#d65f5f', '#5fba7d'])" + "`color` must be string or a list-like " + "of length 2: [`color_neg`, `color_pos`] " + "(eg: color=['#d65f5f', '#5fba7d'])" ) subset = _maybe_numeric_slice(self.data, subset) @@ -1303,18 +1347,16 @@ def _highlight_extrema(data, color="yellow", max_=True): """ Highlight the min or max in a Series or DataFrame. """ - attr = "background-color: {0}".format(color) + attr = f"background-color: {color}" + + if max_: + extrema = data == np.nanmax(data.to_numpy()) + else: + extrema = data == np.nanmin(data.to_numpy()) + if data.ndim == 1: # Series from .apply - if max_: - extrema = data == data.max() - else: - extrema = data == data.min() return [attr if v else "" for v in extrema] else: # DataFrame from .tee - if max_: - extrema = data == data.max().max() - else: - extrema = data == data.min().min() return pd.DataFrame( np.where(extrema, attr, ""), index=data.index, columns=data.columns ) @@ -1432,10 +1474,9 @@ def _get_level_lengths(index, hidden_elements=None): Optional argument is a list of index positions which should not be visible. 
- Result is a dictionary of (level, inital_position): span + Result is a dictionary of (level, initial_position): span """ - sentinel = object() - levels = index.format(sparsify=sentinel, adjoin=False, names=False) + levels = index.format(sparsify=lib.no_default, adjoin=False, names=False) if hidden_elements is None: hidden_elements = [] @@ -1451,10 +1492,10 @@ def _get_level_lengths(index, hidden_elements=None): for j, row in enumerate(lvl): if not get_option("display.multi_sparse"): lengths[(i, j)] = 1 - elif (row != sentinel) and (j not in hidden_elements): + elif (row is not lib.no_default) and (j not in hidden_elements): last_label = j lengths[(i, last_label)] = 1 - elif row != sentinel: + elif row is not lib.no_default: # even if its hidden, keep track of it in case # length >1 and later elements are visible last_label = j @@ -1469,14 +1510,19 @@ def _get_level_lengths(index, hidden_elements=None): return non_zero_lengths -def _maybe_wrap_formatter(formatter): - if is_string_like(formatter): - return lambda x: formatter.format(x) +def _maybe_wrap_formatter(formatter, na_rep: Optional[str]): + if isinstance(formatter, str): + formatter_func = lambda x: formatter.format(x) elif callable(formatter): - return formatter + formatter_func = formatter else: - msg = ( - "Expected a template string or callable, got {formatter} " - "instead".format(formatter=formatter) - ) + msg = f"Expected a template string or callable, got {formatter} instead" + raise TypeError(msg) + + if na_rep is None: + return formatter_func + elif isinstance(na_rep, str): + return lambda x: na_rep if pd.isna(x) else formatter_func(x) + else: + msg = f"Expected a string, got {na_rep} instead" raise TypeError(msg) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index d29078cad9318..69ebc470fba6f 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -1,6 +1,11 @@ """ Google BigQuery support """ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + from pandas.compat._optional import import_optional_dependency +if TYPE_CHECKING: + from pandas import DataFrame + def _try_import(): # since pandas is a dependency of pandas-gbq @@ -14,20 +19,21 @@ def _try_import(): def read_gbq( - query, - project_id=None, - index_col=None, - col_order=None, - reauth=False, - auth_local_webserver=False, - dialect=None, - location=None, - configuration=None, + query: str, + project_id: Optional[str] = None, + index_col: Optional[str] = None, + col_order: Optional[List[str]] = None, + reauth: bool = False, + auth_local_webserver: bool = False, + dialect: Optional[str] = None, + location: Optional[str] = None, + configuration: Optional[Dict[str, Any]] = None, credentials=None, - use_bqstorage_api=None, + use_bqstorage_api: Optional[bool] = None, private_key=None, verbose=None, -): + progress_bar_type: Optional[str] = None, +) -> "DataFrame": """ Load data from Google BigQuery. @@ -50,10 +56,10 @@ def read_gbq( col_order : list(str), optional List of BigQuery column names in the desired order for results DataFrame. - reauth : boolean, default False + reauth : bool, default False Force Google BigQuery to re-authenticate the user. This is useful if multiple accounts are used. - auth_local_webserver : boolean, default False + auth_local_webserver : bool, default False Use the `local webserver flow`_ instead of the `console flow`_ when getting user credentials. @@ -64,7 +70,7 @@ def read_gbq( *New in version 0.2.0 of pandas-gbq*. dialect : str, default 'legacy' - Note: The default value is changing to 'standard' in a future verion. 
+ Note: The default value is changing to 'standard' in a future version. SQL syntax dialect to use. Value can be one of: @@ -119,21 +125,30 @@ def read_gbq( ``fastavro`` packages. .. versionadded:: 0.25.0 - private_key : str, deprecated - Deprecated in pandas-gbq version 0.8.0. Use the ``credentials`` - parameter and - :func:`google.oauth2.service_account.Credentials.from_service_account_info` - or - :func:`google.oauth2.service_account.Credentials.from_service_account_file` - instead. - - Service account private key in JSON format. Can be file path - or string contents. This is useful for remote server - authentication (eg. Jupyter/IPython notebook on remote host). - verbose : None, deprecated - Deprecated in pandas-gbq version 0.4.0. Use the `logging module to - adjust verbosity instead - `__. + progress_bar_type : Optional, str + If set, use the `tqdm `__ library to + display a progress bar while the data downloads. Install the + ``tqdm`` package to use this feature. + + Possible values of ``progress_bar_type`` include: + + ``None`` + No progress bar. + ``'tqdm'`` + Use the :func:`tqdm.tqdm` function to print a progress bar + to :data:`sys.stderr`. + ``'tqdm_notebook'`` + Use the :func:`tqdm.tqdm_notebook` function to display a + progress bar as a Jupyter notebook widget. + ``'tqdm_gui'`` + Use the :func:`tqdm.tqdm_gui` function to display a + progress bar as a graphical dialog box. + + Note that his feature requires version 0.12.0 or later of the + ``pandas-gbq`` package. And it requires the ``tqdm`` package. Slightly + different than ``pandas-gbq``, here the default is ``None``. + + .. versionadded:: 1.0.0 Returns ------- @@ -147,20 +162,15 @@ def read_gbq( """ pandas_gbq = _try_import() - kwargs = {} + kwargs: Dict[str, Union[str, bool]] = {} # START: new kwargs. Don't populate unless explicitly set. if use_bqstorage_api is not None: kwargs["use_bqstorage_api"] = use_bqstorage_api - # END: new kwargs - # START: deprecated kwargs. Don't populate unless explicitly set. 
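# Illustrative sketch (not from this patch): the new progress_bar_type keyword
# is simply forwarded to pandas-gbq (>= 0.12.0) and requires tqdm; the query,
# project and table names below are placeholders.
import pandas as pd

df = pd.read_gbq(
    "SELECT name, year FROM `my_dataset.my_table` LIMIT 10",
    project_id="my-project",      # placeholder project id
    dialect="standard",
    progress_bar_type="tqdm",     # None (the default) disables the progress bar
)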
- if verbose is not None: - kwargs["verbose"] = verbose - - if private_key is not None: - kwargs["private_key"] = private_key - # END: deprecated kwargs + if progress_bar_type is not None: + kwargs["progress_bar_type"] = progress_bar_type + # END: new kwargs return pandas_gbq.read_gbq( query, @@ -173,25 +183,25 @@ def read_gbq( location=location, configuration=configuration, credentials=credentials, - **kwargs + **kwargs, ) def to_gbq( - dataframe, - destination_table, - project_id=None, - chunksize=None, - reauth=False, - if_exists="fail", - auth_local_webserver=False, - table_schema=None, - location=None, - progress_bar=True, + dataframe: "DataFrame", + destination_table: str, + project_id: Optional[str] = None, + chunksize: Optional[int] = None, + reauth: bool = False, + if_exists: str = "fail", + auth_local_webserver: bool = False, + table_schema: Optional[List[Dict[str, str]]] = None, + location: Optional[str] = None, + progress_bar: bool = True, credentials=None, verbose=None, private_key=None, -): +) -> None: pandas_gbq = _try_import() pandas_gbq.to_gbq( dataframe, diff --git a/pandas/io/html.py b/pandas/io/html.py index 7da7a819f81e8..eafcca0e85bb3 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -14,9 +14,9 @@ from pandas.core.dtypes.common import is_list_like -from pandas import Series +from pandas.core.construction import create_series_with_explicit_dtype -from pandas.io.common import _is_url, _validate_header_arg, urlopen +from pandas.io.common import is_url, urlopen, validate_header_arg from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser @@ -57,7 +57,7 @@ def _importers(): _RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") -def _remove_whitespace(s, regex=_RE_WHITESPACE): +def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str: """ Replace extra whitespace inside of a string with a single space. @@ -65,8 +65,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): ---------- s : str or unicode The string from which to remove extra whitespace. - - regex : regex + regex : re.Pattern The regular expression to use to remove extra whitespace. Returns @@ -103,9 +102,7 @@ def _get_skiprows(skiprows): return skiprows elif skiprows is None: return 0 - raise TypeError( - "%r is not a valid type for skipping rows" % type(skiprows).__name__ - ) + raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") def _read(obj): @@ -120,7 +117,7 @@ def _read(obj): ------- raw_text : str """ - if _is_url(obj): + if is_url(obj): with urlopen(obj) as url: text = url.read() elif hasattr(obj, "read"): @@ -134,7 +131,7 @@ def _read(obj): except (TypeError, ValueError): pass else: - raise TypeError("Cannot read object of type %r" % type(obj).__name__) + raise TypeError(f"Cannot read object of type '{type(obj).__name__}'") return text @@ -253,7 +250,8 @@ def _text_getter(self, obj): raise AbstractMethodError(self) def _parse_td(self, obj): - """Return the td elements from a row element. + """ + Return the td elements from a row element. 
Parameters ---------- @@ -560,9 +558,7 @@ def _parse_tables(self, doc, match, attrs): unique_tables.add(table) if not result: - raise ValueError( - "No tables found matching pattern {patt!r}".format(patt=match.pattern) - ) + raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") return result def _text_getter(self, obj): @@ -589,7 +585,7 @@ def _parse_tfoot_tr(self, table): def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: - raise ValueError("No text parsed from document: {doc}".format(doc=self.io)) + raise ValueError(f"No text parsed from document: {self.io}") return raw_text def _build_doc(self): @@ -600,7 +596,7 @@ def _build_doc(self): ) -def _build_xpath_expr(attrs): +def _build_xpath_expr(attrs) -> str: """Build an xpath expression to simulate bs4's ability to pass in kwargs to search for attributes when using the lxml parser. @@ -618,8 +614,8 @@ def _build_xpath_expr(attrs): if "class_" in attrs: attrs["class"] = attrs.pop("class_") - s = ["@{key}={val!r}".format(key=k, val=v) for k, v in attrs.items()] - return "[{expr}]".format(expr=" and ".join(s)) + s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()]) + return f"[{s}]" _re_namespace = {"re": "http://exslt.org/regular-expressions"} @@ -661,8 +657,7 @@ def _parse_tables(self, doc, match, kwargs): # 1. check all descendants for the given pattern and only search tables # 2. go up the tree until we find a table - query = "//table//*[re:test(text(), {patt!r})]/ancestor::table" - xpath_expr = query.format(patt=pattern) + xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table" # if any table attributes were given build an xpath expression to # search for them @@ -682,9 +677,7 @@ def _parse_tables(self, doc, match, kwargs): elem.getparent().remove(elem) if not tables: - raise ValueError( - "No tables found matching regex {patt!r}".format(patt=pattern) - ) + raise ValueError(f"No tables found matching regex {repr(pattern)}") return tables def _equals_tag(self, obj, tag): @@ -712,7 +705,7 @@ def _build_doc(self): parser = HTMLParser(recover=True, encoding=self.encoding) try: - if _is_url(self.io): + if is_url(self.io): with urlopen(self.io) as f: r = parse(f, parser=parser) else: @@ -724,7 +717,7 @@ def _build_doc(self): pass except (UnicodeDecodeError, IOError) as e: # if the input is a blob of html goop - if not _is_url(self.io): + if not is_url(self.io): r = fromstring(self.io, parser=parser) try: @@ -767,7 +760,8 @@ def _parse_tfoot_tr(self, table): def _expand_elements(body): - lens = Series([len(elem) for elem in body]) + data = [len(elem) for elem in body] + lens = create_series_with_explicit_dtype(data, dtype_if_empty=object) lens_max = lens.max() not_max = lens[lens != lens_max] @@ -810,7 +804,8 @@ def _data_to_frame(**kwargs): def _parser_dispatch(flavor): - """Choose the parser based on the input flavor. + """ + Choose the parser based on the input flavor. 
Parameters ---------- @@ -832,8 +827,7 @@ def _parser_dispatch(flavor): valid_parsers = list(_valid_parsers.keys()) if flavor not in valid_parsers: raise ValueError( - "{invalid!r} is not a valid flavor, valid flavors " - "are {valid}".format(invalid=flavor, valid=valid_parsers) + f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}" ) if flavor in ("bs4", "html5lib"): @@ -850,8 +844,9 @@ def _parser_dispatch(flavor): return _valid_parsers[flavor] -def _print_as_set(s): - return "{" + "{arg}".format(arg=", ".join(pprint_thing(el) for el in s)) + "}" +def _print_as_set(s) -> str: + arg = ", ".join(pprint_thing(el) for el in s) + return f"{{{arg}}}" def _validate_flavor(flavor): @@ -862,13 +857,13 @@ def _validate_flavor(flavor): elif isinstance(flavor, abc.Iterable): if not all(isinstance(flav, str) for flav in flavor): raise TypeError( - "Object of type {typ!r} is not an iterable of " - "strings".format(typ=type(flavor).__name__) + f"Object of type {repr(type(flavor).__name__)} " + f"is not an iterable of strings" ) else: - fmt = "{flavor!r}" if isinstance(flavor, str) else "{flavor}" - fmt += " is not a valid flavor" - raise ValueError(fmt.format(flavor=flavor)) + msg = repr(flavor) if isinstance(flavor, str) else str(flavor) + msg += " is not a valid flavor" + raise ValueError(msg) flavor = tuple(flavor) valid_flavors = set(_valid_parsers) @@ -876,10 +871,8 @@ def _validate_flavor(flavor): if not flavor_set & valid_flavors: raise ValueError( - "{invalid} is not a valid set of flavors, valid " - "flavors are {valid}".format( - invalid=_print_as_set(flavor_set), valid=_print_as_set(valid_flavors) - ) + f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid " + f"flavors are {_print_as_set(valid_flavors)}" ) return flavor @@ -895,7 +888,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): try: tables = p.parse_tables() - except Exception as caught: + except ValueError as caught: # if `io` is an io-like object, check if it's seekable # and try to rewind it before trying the next parser if hasattr(io, "seekable") and io.seekable(): @@ -903,11 +896,11 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): elif hasattr(io, "seekable") and not io.seekable(): # if we couldn't rewind it, let the user know raise ValueError( - "The flavor {} failed to parse your input. " + f"The flavor {flav} failed to parse your input. " "Since you passed a non-rewindable file " "object, we can't rewind it to try " "another parser. Try read_html() with a " - "different flavor.".format(flav) + "different flavor." ) retained = caught @@ -960,7 +953,7 @@ def read_html( This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. - flavor : str or None, container of strings + flavor : str or None The parsing engine to use. 'bs4' and 'html5lib' are synonymous with each other, they are both there for backwards compatibility. The default of ``None`` tries to use ``lxml`` to parse and if that fails it @@ -974,7 +967,7 @@ def read_html( The column (or list of columns) to use to create the index. skiprows : int or list-like or slice or None, optional - 0-based. Number of rows to skip after parsing the column integer. If a + Number of rows to skip after parsing the column integer. 0-based. If a sequence of integers or a slice is given, will skip the rows indexed by that sequence. Note that a single element sequence means 'skip the nth row' whereas an integer means 'skip n rows'. 
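# Illustrative sketch (not from this patch): typical read_html usage with the
# arguments documented above; the URL and match pattern are placeholders.
import pandas as pd

tables = pd.read_html(
    "https://example.com/stats.html",  # placeholder URL
    match="Population",                # regex the table's text must match
    flavor="bs4",                      # or None to try lxml first, then bs4/html5lib
    header=0,
    skiprows=1,
)
df = tables[0]  # read_html always returns a list of DataFrames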
@@ -1024,18 +1017,19 @@ def read_html( transformed content. na_values : iterable, default None - Custom NA values + Custom NA values. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN - values are overridden, otherwise they're appended to + values are overridden, otherwise they're appended to. displayed_only : bool, default True - Whether elements with "display: none" should be parsed + Whether elements with "display: none" should be parsed. Returns ------- - dfs : list of DataFrames + dfs + A list of DataFrames. See Also -------- @@ -1082,7 +1076,7 @@ def read_html( "cannot skip rows starting from the end of the " "data (you passed a negative value)" ) - _validate_header_arg(header) + validate_header_arg(header) return _parse( flavor=flavor, io=io, diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py index 2382d993df96b..48febb086c302 100644 --- a/pandas/io/json/__init__.py +++ b/pandas/io/json/__init__.py @@ -1,5 +1,5 @@ from pandas.io.json._json import dumps, loads, read_json, to_json -from pandas.io.json._normalize import json_normalize +from pandas.io.json._normalize import _json_normalize, json_normalize from pandas.io.json._table_schema import build_table_schema __all__ = [ @@ -7,6 +7,7 @@ "loads", "read_json", "to_json", + "_json_normalize", "json_normalize", "build_table_schema", ] diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 6e9e0a0b01200..12ce5e4a62d24 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,4 +1,5 @@ -from collections import OrderedDict +from collections import abc +import functools from io import StringIO from itertools import islice import os @@ -8,27 +9,26 @@ import pandas._libs.json as json from pandas._libs.tslibs import iNaT +from pandas._typing import JSONSerializable from pandas.errors import AbstractMethodError +from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.common import ensure_str, is_period_dtype -from pandas import DataFrame, MultiIndex, Series, compat, isna, to_datetime -from pandas._typing import JSONSerializable +from pandas import DataFrame, MultiIndex, Series, isna, to_datetime +from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.reshape.concat import concat from pandas.io.common import ( - BaseIterator, - _get_handle, - _infer_compression, - _stringify_path, get_filepath_or_buffer, + get_handle, + infer_compression, + stringify_path, ) -from pandas.io.formats.printing import pprint_thing +from pandas.io.json._normalize import convert_to_line_delimits +from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import _validate_integer -from ._normalize import convert_to_line_delimits -from ._table_schema import build_table_schema, parse_table_schema - loads = json.loads dumps = json.dumps @@ -53,17 +53,19 @@ def to_json( if not index and orient not in ["split", "table"]: raise ValueError( - "'index=False' is only valid when 'orient' is " "'split' or 'table'" + "'index=False' is only valid when 'orient' is 'split' or 'table'" ) - path_or_buf = _stringify_path(path_or_buf) + path_or_buf = stringify_path(path_or_buf) if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") if orient == "table" and isinstance(obj, Series): obj = obj.to_frame(name=obj.name or "values") + + writer: Type["Writer"] if orient == "table" and isinstance(obj, DataFrame): - writer = JSONTableWriter # 
type: Type["Writer"] + writer = JSONTableWriter elif isinstance(obj, Series): writer = SeriesWriter elif isinstance(obj, DataFrame): @@ -87,7 +89,7 @@ def to_json( s = convert_to_line_delimits(s) if isinstance(path_or_buf, str): - fh, handles = _get_handle(path_or_buf, "w", compression=compression) + fh, handles = get_handle(path_or_buf, "w", compression=compression) try: fh.write(s) finally: @@ -171,10 +173,7 @@ class SeriesWriter(Writer): def _format_axes(self): if not self.obj.index.is_unique and self.orient == "index": - raise ValueError( - "Series index must be unique for orient=" - "'{orient}'".format(orient=self.orient) - ) + raise ValueError(f"Series index must be unique for orient='{self.orient}'") def _write( self, @@ -210,8 +209,7 @@ def _format_axes(self): """ if not self.obj.index.is_unique and self.orient in ("index", "columns"): raise ValueError( - "DataFrame index must be unique for orient=" - "'{orient}'.".format(orient=self.orient) + f"DataFrame index must be unique for orient='{self.orient}'." ) if not self.obj.columns.is_unique and self.orient in ( "index", @@ -219,8 +217,7 @@ def _format_axes(self): "records", ): raise ValueError( - "DataFrame columns must be unique for orient=" - "'{orient}'.".format(orient=self.orient) + f"DataFrame columns must be unique for orient='{self.orient}'." ) def _write( @@ -286,8 +283,8 @@ def __init__( if date_format != "iso": msg = ( "Trying to write with `orient='table'` and " - "`date_format='{fmt}'`. Table Schema requires dates " - "to be formatted with `date_format='iso'`".format(fmt=date_format) + f"`date_format='{date_format}'`. Table Schema requires dates " + "to be formatted with `date_format='iso'`" ) raise ValueError(msg) @@ -310,7 +307,7 @@ def __init__( timedeltas = obj.select_dtypes(include=["timedelta"]).columns if len(timedeltas): obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat()) - # Convert PeriodIndex to datetimes before serialzing + # Convert PeriodIndex to datetimes before serializing if is_period_dtype(obj.index): obj.index = obj.index.to_timestamp() @@ -334,7 +331,7 @@ def _write( default_handler, indent, ): - table_obj = OrderedDict((("schema", self.schema), ("data", obj))) + table_obj = {"schema": self.schema, "data": obj} serialized = super()._write( table_obj, orient, @@ -349,6 +346,7 @@ def _write( return serialized +@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None) def read_json( path_or_buf=None, orient=None, @@ -462,6 +460,8 @@ def read_json( non-numeric column and index labels are supported. Note also that the JSON ordering MUST be the same for each term if numpy=True. + .. deprecated:: 1.0.0 + precise_float : bool, default False Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (False) is to use fast but @@ -482,7 +482,7 @@ def read_json( chunksize : int, optional Return JsonReader object for iteration. See the `line-delimited json docs - `_ + `_ for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. 
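# Illustrative sketch (not from this patch): chunksize requires lines=True and
# returns a JsonReader that yields DataFrames instead of reading the whole
# file at once; the file name is a placeholder.
import pandas as pd

reader = pd.read_json("records.jsonl", lines=True, chunksize=1000)
n_rows = 0
for chunk in reader:   # each chunk is a DataFrame with at most `chunksize` rows
    n_rows += len(chunk)
reader.close()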
@@ -577,8 +577,10 @@ def read_json( dtype = True if convert_axes is None and orient != "table": convert_axes = True + if encoding is None: + encoding = "utf-8" - compression = _infer_compression(path_or_buf, compression) + compression = infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression ) @@ -610,7 +612,7 @@ def read_json( return result -class JsonReader(BaseIterator): +class JsonReader(abc.Iterator): """ JsonReader provides an interface for reading in a JSON file. @@ -698,7 +700,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): pass if exists or self.compression is not None: - data, _ = _get_handle( + data, _ = get_handle( filepath_or_buffer, "r", encoding=self.encoding, @@ -709,7 +711,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): return data - def _combine_lines(self, lines): + def _combine_lines(self, lines) -> str: """ Combines a list of JSON objects into one JSON object. """ @@ -822,9 +824,7 @@ def __init__( if date_unit is not None: date_unit = date_unit.lower() if date_unit not in self._STAMP_UNITS: - raise ValueError( - "date_unit must be one of {units}".format(units=self._STAMP_UNITS) - ) + raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}") self.min_stamp = self._MIN_STAMPS[date_unit] else: self.min_stamp = self._MIN_STAMPS["s"] @@ -844,11 +844,7 @@ def check_keys_split(self, decoded): bad_keys = set(decoded.keys()).difference(set(self._split_keys)) if bad_keys: bad_keys = ", ".join(bad_keys) - raise ValueError( - "JSON data had unexpected key(s): {bad_keys}".format( - bad_keys=pprint_thing(bad_keys) - ) - ) + raise ValueError(f"JSON data had unexpected key(s): {bad_keys}") def parse(self): @@ -1002,44 +998,34 @@ class SeriesParser(Parser): _split_keys = ("name", "index", "data") def _parse_no_numpy(self): + data = loads(self.json, precise_float=self.precise_float) - json = self.json - orient = self.orient - if orient == "split": - decoded = { - str(k): v - for k, v in loads(json, precise_float=self.precise_float).items() - } + if self.orient == "split": + decoded = {str(k): v for k, v in data.items()} self.check_keys_split(decoded) - self.obj = Series(dtype=None, **decoded) + self.obj = create_series_with_explicit_dtype(**decoded) else: - self.obj = Series(loads(json, precise_float=self.precise_float), dtype=None) + self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object) def _parse_numpy(self): + load_kwargs = { + "dtype": None, + "numpy": True, + "precise_float": self.precise_float, + } + if self.orient in ["columns", "index"]: + load_kwargs["labelled"] = True + loads_ = functools.partial(loads, **load_kwargs) + data = loads_(self.json) - json = self.json - orient = self.orient - if orient == "split": - decoded = loads( - json, dtype=None, numpy=True, precise_float=self.precise_float - ) - decoded = {str(k): v for k, v in decoded.items()} + if self.orient == "split": + decoded = {str(k): v for k, v in data.items()} self.check_keys_split(decoded) - self.obj = Series(**decoded) - elif orient == "columns" or orient == "index": - self.obj = Series( - *loads( - json, - dtype=None, - numpy=True, - labelled=True, - precise_float=self.precise_float, - ) - ) + self.obj = create_series_with_explicit_dtype(**decoded) + elif self.orient in ["columns", "index"]: + self.obj = create_series_with_explicit_dtype(*data, dtype_if_empty=object) else: - self.obj = Series( - loads(json, dtype=None, numpy=True, 
precise_float=self.precise_float) - ) + self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object) def _try_convert_types(self): if self.obj is None: @@ -1115,8 +1101,6 @@ def _parse_no_numpy(self): dtype=None, orient="index", ) - if compat.PY35: - self.obj = self.obj.sort_index(axis="columns").sort_index(axis="index") elif orient == "table": self.obj = parse_table_schema(json, precise_float=self.precise_float) else: @@ -1169,7 +1153,7 @@ def _try_convert_dates(self): convert_dates = [] convert_dates = set(convert_dates) - def is_ok(col): + def is_ok(col) -> bool: """ Return if this col is ok to try for a date parse. """ diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 702241bde2b34..c0596c984575a 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -3,12 +3,14 @@ from collections import defaultdict import copy -from typing import DefaultDict, Dict, List, Optional, Union +from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union import numpy as np from pandas._libs.writers import convert_json_to_lines +from pandas.util._decorators import deprecate +import pandas as pd from pandas import DataFrame @@ -108,16 +110,16 @@ def nested_to_record( return new_ds -def json_normalize( +def _json_normalize( data: Union[Dict, List[Dict]], record_path: Optional[Union[str, List]] = None, - meta: Optional[Union[str, List]] = None, + meta: Optional[Union[str, List[Union[str, List[str]]]]] = None, meta_prefix: Optional[str] = None, record_prefix: Optional[str] = None, errors: Optional[str] = "raise", sep: str = ".", max_level: Optional[int] = None, -): +) -> "DataFrame": """ Normalize semi-structured JSON data into a flat table. @@ -228,14 +230,23 @@ def json_normalize( Returns normalized data with columns prefixed with the given string. """ - def _pull_field(js, spec): - result = js + def _pull_field(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + result = js # type: ignore if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] + if not isinstance(result, Iterable): + if pd.isnull(result): + result = [] # type: ignore + else: + raise TypeError( + f"{js} has non iterable value {result} for path {spec}. " + "Must be iterable or null." 
+ ) + return result if isinstance(data, list) and not data: @@ -264,21 +275,21 @@ def _pull_field(js, spec): elif not isinstance(meta, list): meta = [meta] - meta = [m if isinstance(m, list) else [m] for m in meta] + _meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now - records = [] # type: List + records: List = [] lengths = [] - meta_vals = defaultdict(list) # type: DefaultDict - meta_keys = [sep.join(val) for val in meta] + meta_vals: DefaultDict = defaultdict(list) + meta_keys = [sep.join(val) for val in _meta] def _recursive_extract(data, path, seen_meta, level=0): if isinstance(data, dict): data = [data] if len(path) > 1: for obj in data: - for val, key in zip(meta, meta_keys): + for val, key in zip(_meta, meta_keys): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) @@ -295,7 +306,7 @@ def _recursive_extract(data, path, seen_meta, level=0): # For repeating the metadata later lengths.append(len(recs)) - for val, key in zip(meta, meta_keys): + for val, key in zip(_meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] else: @@ -308,7 +319,7 @@ def _recursive_extract(data, path, seen_meta, level=0): raise KeyError( "Try running with " "errors='ignore' as key " - "{err} is not always present".format(err=e) + f"{e} is not always present" ) meta_vals[key].append(meta_val) records.extend(recs) @@ -318,7 +329,7 @@ def _recursive_extract(data, path, seen_meta, level=0): result = DataFrame(records) if record_prefix is not None: - result = result.rename(columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) + result = result.rename(columns=lambda x: f"{record_prefix}{x}") # Data types, a problem for k, v in meta_vals.items(): @@ -327,8 +338,12 @@ def _recursive_extract(data, path, seen_meta, level=0): if k in result: raise ValueError( - "Conflicting metadata name {name}, " - "need distinguishing prefix ".format(name=k) + f"Conflicting metadata name {k}, need distinguishing prefix " ) result[k] = np.array(v, dtype=object).repeat(lengths) return result + + +json_normalize = deprecate( + "pandas.io.json.json_normalize", _json_normalize, "1.0.0", "pandas.json_normalize" +) diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 1e27421a55499..5f23b95c10f8e 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -18,9 +18,9 @@ is_string_dtype, is_timedelta64_dtype, ) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import DataFrame -from pandas.api.types import CategoricalDtype import pandas.core.common as com loads = json.loads @@ -81,15 +81,13 @@ def set_default_names(data): if len(nms) == 1 and data.index.name == "index": warnings.warn("Index name of 'index' is not round-trippable") elif len(nms) > 1 and any(x.startswith("level_") for x in nms): - warnings.warn( - "Index names beginning with 'level_' are not " "round-trippable" - ) + warnings.warn("Index names beginning with 'level_' are not round-trippable") return data data = data.copy() if data.index.nlevels > 1: names = [ - name if name is not None else "level_{}".format(i) + name if name is not None else f"level_{i}" for i, name in enumerate(data.index.names) ] data.index.names = names @@ -175,7 +173,7 @@ def convert_json_field_to_pandas_type(field): return "timedelta64" elif typ == "datetime": if field.get("tz"): - return "datetime64[ns, {tz}]".format(tz=field["tz"]) + return f"datetime64[ns, {field['tz']}]" else: return "datetime64[ns]" elif typ == "any": @@ -186,7 +184,7 @@ def 
convert_json_field_to_pandas_type(field): else: return "object" - raise ValueError("Unsupported or invalid field type: {}".format(typ)) + raise ValueError(f"Unsupported or invalid field type: {typ}") def build_table_schema(data, index=True, primary_key=None, version=True): @@ -317,12 +315,12 @@ def parse_table_schema(json, precise_float): # Cannot directly use as_type with timezone data on object; raise for now if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()): - raise NotImplementedError('table="orient" can not yet read timezone ' "data") + raise NotImplementedError('table="orient" can not yet read timezone data') # No ISO constructor for Timedelta as of yet, so need to raise if "timedelta64" in dtypes.values(): raise NotImplementedError( - 'table="orient" can not yet read ' "ISO-formatted Timedelta data" + 'table="orient" can not yet read ISO-formatted Timedelta data' ) df = df.astype(dtypes) diff --git a/pandas/io/msgpack/__init__.py b/pandas/io/msgpack/__init__.py deleted file mode 100644 index 11407c8282660..0000000000000 --- a/pandas/io/msgpack/__init__.py +++ /dev/null @@ -1,56 +0,0 @@ -# coding: utf-8 - -from collections import namedtuple - -from pandas.io.msgpack.exceptions import * # noqa: F401,F403 isort:skip -from pandas.io.msgpack._version import version # noqa: F401 isort:skip - - -class ExtType(namedtuple("ExtType", "code data")): - """ExtType represents ext type in msgpack.""" - - def __new__(cls, code, data): - if not isinstance(code, int): - raise TypeError("code must be int") - if not isinstance(data, bytes): - raise TypeError("data must be bytes") - if not 0 <= code <= 127: - raise ValueError("code must be 0~127") - return super().__new__(cls, code, data) - - -import os # noqa: F401,E402 isort:skip - -from pandas.io.msgpack._unpacker import ( # noqa: F401,E402 isort:skip - Unpacker, - unpack, - unpackb, -) -from pandas.io.msgpack._packer import Packer # noqa: E402 isort:skip - - -def pack(o, stream, **kwargs): - """ - Pack object `o` and write it to `stream` - - See :class:`Packer` for options. - """ - packer = Packer(**kwargs) - stream.write(packer.pack(o)) - - -def packb(o, **kwargs): - """ - Pack object `o` and return packed bytes - - See :class:`Packer` for options. - """ - return Packer(**kwargs).pack(o) - - -# alias for compatibility to json/marshal/pickle. -load = unpack -loads = unpackb - -dump = pack -dumps = packb diff --git a/pandas/io/msgpack/_packer.pyi b/pandas/io/msgpack/_packer.pyi deleted file mode 100644 index e95a1622c5615..0000000000000 --- a/pandas/io/msgpack/_packer.pyi +++ /dev/null @@ -1,22 +0,0 @@ -# flake8: noqa - -class Packer: - def __cinit__(self): ... - def __init__( - self, - default=..., - encoding=..., - unicode_errors=..., - use_single_float=..., - autoreset: int = ..., - use_bin_type: int = ..., - ): ... - def __dealloc__(self): ... - def _pack(self, o, nest_limit: int = ...) -> int: ... - def pack(self, obj): ... - def pack_ext_type(self, typecode, data): ... - def pack_array_header(self, size): ... - def pack_map_header(self, size): ... - def pack_map_pairs(self, pairs): ... - def reset(self) -> None: ... - def bytes(self): ... 
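Note: the ``pandas.io.msgpack`` package (with the ``pack``/``packb``/``unpack``/``unpackb`` helpers deleted above) is removed by this change. The deprecation text preserved in the likewise-deleted ``pandas/io/packers.py`` further below points users at pyarrow for on-the-wire transmission of pandas objects. A minimal sketch of that recommended replacement, assuming a pyarrow version that still ships ``default_serialization_context`` (later deprecated within pyarrow itself); the deserialize step is the assumed counterpart and not shown in the original docstring::

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'A': [1, 2, 3]})
    context = pa.default_serialization_context()
    # serialize the frame to bytes suitable for wire transfer ...
    df_bytestring = context.serialize(df).to_buffer().to_pybytes()
    # ... and reconstruct it on the receiving side
    roundtripped = pa.deserialize(df_bytestring, context=context)
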
diff --git a/pandas/io/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx deleted file mode 100644 index 7c7c8f7b61e60..0000000000000 --- a/pandas/io/msgpack/_packer.pyx +++ /dev/null @@ -1,312 +0,0 @@ -# coding: utf-8 -# cython: embedsignature=True - -from cpython.bytes cimport (PyBytes_Check, PyBytes_AsString, - PyBytes_FromStringAndSize) -from cpython.dict cimport PyDict_Check, PyDict_CheckExact -from cpython.float cimport PyFloat_Check -from cpython.int cimport PyInt_Check -from cpython.list cimport PyList_Check -from cpython.long cimport PyLong_Check -from cpython.object cimport PyCallable_Check -from cpython.tuple cimport PyTuple_Check -from cpython.unicode cimport PyUnicode_Check, PyUnicode_AsEncodedString - -from libc.stdlib cimport free, malloc - -from pandas.io.msgpack.exceptions import PackValueError -from pandas.io.msgpack import ExtType -import numpy as np - - -cdef extern from "../../src/msgpack/pack.h": - struct msgpack_packer: - char* buf - size_t length - size_t buf_size - bint use_bin_type - - int msgpack_pack_int(msgpack_packer* pk, int d) - int msgpack_pack_nil(msgpack_packer* pk) - int msgpack_pack_true(msgpack_packer* pk) - int msgpack_pack_false(msgpack_packer* pk) - int msgpack_pack_long(msgpack_packer* pk, long d) - int msgpack_pack_long_long(msgpack_packer* pk, long long d) - int msgpack_pack_unsigned_long_long(msgpack_packer* pk, - unsigned long long d) - int msgpack_pack_float(msgpack_packer* pk, float d) - int msgpack_pack_double(msgpack_packer* pk, double d) - int msgpack_pack_array(msgpack_packer* pk, size_t l) - int msgpack_pack_map(msgpack_packer* pk, size_t l) - int msgpack_pack_raw(msgpack_packer* pk, size_t l) - int msgpack_pack_bin(msgpack_packer* pk, size_t l) - int msgpack_pack_raw_body(msgpack_packer* pk, char* body, size_t l) - int msgpack_pack_ext(msgpack_packer* pk, char typecode, size_t l) - -cdef int DEFAULT_RECURSE_LIMIT=511 - - -cdef class Packer: - """ - MessagePack Packer - - usage:: - - packer = Packer() - astream.write(packer.pack(a)) - astream.write(packer.pack(b)) - - Packer's constructor has some keyword arguments: - - :param callable default: - Convert user type to builtin type that Packer supports. - :param str encoding: - Convert unicode to bytes with this encoding. (default: 'utf-8') - :param str unicode_errors: - Error handler for encoding unicode. (default: 'strict') - :param bool use_single_float: - Use single precision float type for float. (default: False) - :param bool autoreset: - Reset buffer after each pack and return it's - content as `bytes`. (default: True). - If set this to false, use `bytes()` to get - content and `.reset()` to clear buffer. - :param bool use_bin_type: - Use bin type introduced in msgpack spec 2.0 for bytes. - It also enable str8 type for unicode. 
- """ - cdef: - msgpack_packer pk - object _default - object _bencoding - object _berrors - char *encoding - char *unicode_errors - bint use_float - bint autoreset - - def __cinit__(self): - cdef int buf_size = 1024 * 1024 - self.pk.buf = malloc(buf_size) - if self.pk.buf == NULL: - raise MemoryError("Unable to allocate internal buffer.") - self.pk.buf_size = buf_size - self.pk.length = 0 - - def __init__(self, default=None, encoding='utf-8', - unicode_errors='strict', use_single_float=False, - bint autoreset=1, bint use_bin_type=0): - """ - """ - self.use_float = use_single_float - self.autoreset = autoreset - self.pk.use_bin_type = use_bin_type - if default is not None: - if not PyCallable_Check(default): - raise TypeError("default must be a callable.") - self._default = default - if encoding is None: - self.encoding = NULL - self.unicode_errors = NULL - else: - if isinstance(encoding, unicode): - self._bencoding = encoding.encode('ascii') - else: - self._bencoding = encoding - self.encoding = PyBytes_AsString(self._bencoding) - if isinstance(unicode_errors, unicode): - self._berrors = unicode_errors.encode('ascii') - else: - self._berrors = unicode_errors - self.unicode_errors = PyBytes_AsString(self._berrors) - - def __dealloc__(self): - free(self.pk.buf); - - cdef int _pack(self, object o, - int nest_limit=DEFAULT_RECURSE_LIMIT) except -1: - cdef: - long long llval - unsigned long long ullval - long longval - float fval - double dval - char* rawval - int ret - dict d - size_t L - int default_used = 0 - - if nest_limit < 0: - raise PackValueError("recursion limit exceeded.") - - while True: - if o is None: - ret = msgpack_pack_nil(&self.pk) - elif isinstance(o, (bool, np.bool_)): - if o: - ret = msgpack_pack_true(&self.pk) - else: - ret = msgpack_pack_false(&self.pk) - elif PyLong_Check(o): - # PyInt_Check(long) is True for Python 3. - # Sow we should test long before int. 
- if o > 0: - ullval = o - ret = msgpack_pack_unsigned_long_long(&self.pk, ullval) - else: - llval = o - ret = msgpack_pack_long_long(&self.pk, llval) - elif PyInt_Check(o): - longval = o - ret = msgpack_pack_long(&self.pk, longval) - elif PyFloat_Check(o): - if self.use_float: - fval = o - ret = msgpack_pack_float(&self.pk, fval) - else: - dval = o - ret = msgpack_pack_double(&self.pk, dval) - elif PyBytes_Check(o): - L = len(o) - if L > (2**32) - 1: - raise ValueError("bytes is too large") - rawval = o - ret = msgpack_pack_bin(&self.pk, L) - if ret == 0: - ret = msgpack_pack_raw_body(&self.pk, rawval, L) - elif PyUnicode_Check(o): - if not self.encoding: - raise TypeError("Can't encode unicode string: " - "no encoding is specified") - o = PyUnicode_AsEncodedString(o, self.encoding, - self.unicode_errors) - L = len(o) - if L > (2**32) - 1: - raise ValueError("dict is too large") - rawval = o - ret = msgpack_pack_raw(&self.pk, len(o)) - if ret == 0: - ret = msgpack_pack_raw_body(&self.pk, rawval, len(o)) - elif PyDict_CheckExact(o): - d = o - L = len(d) - if L > (2**32) - 1: - raise ValueError("dict is too large") - ret = msgpack_pack_map(&self.pk, L) - if ret == 0: - for k, v in d.items(): - ret = self._pack(k, nest_limit - 1) - if ret != 0: break - ret = self._pack(v, nest_limit - 1) - if ret != 0: break - elif PyDict_Check(o): - L = len(o) - if L > (2**32) - 1: - raise ValueError("dict is too large") - ret = msgpack_pack_map(&self.pk, L) - if ret == 0: - for k, v in o.items(): - ret = self._pack(k, nest_limit - 1) - if ret != 0: break - ret = self._pack(v, nest_limit - 1) - if ret != 0: break - elif isinstance(o, ExtType): - # This should be before Tuple because ExtType is namedtuple. - longval = o.code - rawval = o.data - L = len(o.data) - if L > (2**32) - 1: - raise ValueError("EXT data is too large") - ret = msgpack_pack_ext(&self.pk, longval, L) - ret = msgpack_pack_raw_body(&self.pk, rawval, L) - elif PyTuple_Check(o) or PyList_Check(o): - L = len(o) - if L > (2**32) - 1: - raise ValueError("list is too large") - ret = msgpack_pack_array(&self.pk, L) - if ret == 0: - for v in o: - ret = self._pack(v, nest_limit - 1) - if ret != 0: break - elif not default_used and self._default: - o = self._default(o) - default_used = 1 - continue - else: - raise TypeError("can't serialize {thing!r}".format(thing=o)) - break - return ret - - cpdef pack(self, object obj): - cdef int ret - ret = self._pack(obj, DEFAULT_RECURSE_LIMIT) - if ret == -1: - raise MemoryError - elif ret: # should not happen. 
- raise TypeError - if self.autoreset: - buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) - self.pk.length = 0 - return buf - - def pack_ext_type(self, typecode, data): - msgpack_pack_ext(&self.pk, typecode, len(data)) - msgpack_pack_raw_body(&self.pk, data, len(data)) - - def pack_array_header(self, size_t size): - if size > (2**32) - 1: - raise ValueError - cdef int ret = msgpack_pack_array(&self.pk, size) - if ret == -1: - raise MemoryError - elif ret: # should not happen - raise TypeError - if self.autoreset: - buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) - self.pk.length = 0 - return buf - - def pack_map_header(self, size_t size): - if size > (2**32) - 1: - raise ValueError - cdef int ret = msgpack_pack_map(&self.pk, size) - if ret == -1: - raise MemoryError - elif ret: # should not happen - raise TypeError - if self.autoreset: - buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) - self.pk.length = 0 - return buf - - def pack_map_pairs(self, object pairs): - """ - Pack *pairs* as msgpack map type. - - *pairs* should sequence of pair. - (`len(pairs)` and `for k, v in pairs:` should be supported.) - """ - cdef int ret = msgpack_pack_map(&self.pk, len(pairs)) - if ret == 0: - for k, v in pairs: - ret = self._pack(k) - if ret != 0: break - ret = self._pack(v) - if ret != 0: break - if ret == -1: - raise MemoryError - elif ret: # should not happen - raise TypeError - if self.autoreset: - buf = PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) - self.pk.length = 0 - return buf - - def reset(self): - """Clear internal buffer.""" - self.pk.length = 0 - - def bytes(self): - """Return buffer content.""" - return PyBytes_FromStringAndSize(self.pk.buf, self.pk.length) diff --git a/pandas/io/msgpack/_unpacker.pyi b/pandas/io/msgpack/_unpacker.pyi deleted file mode 100644 index 9910895947fb6..0000000000000 --- a/pandas/io/msgpack/_unpacker.pyi +++ /dev/null @@ -1,59 +0,0 @@ -# flake8: noqa - -def unpackb( - packed, - object_hook=..., - list_hook=..., - use_list=..., - encoding=..., - unicode_errors=..., - object_pairs_hook=..., - ext_hook=..., - max_str_len=..., - max_bin_len=..., - max_array_len=..., - max_map_len=..., - max_ext_len=..., -): ... -def unpack( - stream, - object_hook=..., - list_hook=..., - use_list=..., - encoding=..., - unicode_errors=..., - object_pairs_hook=..., -): ... - -class Unpacker: - def __cinit__(self): ... - def __dealloc__(self): ... - def __init__( - self, - file_like=..., - read_size=..., - use_list=..., - object_hook=..., - object_pairs_hook=..., - list_hook=..., - encoding=..., - unicode_errors=..., - max_buffer_size: int = ..., - ext_hook=..., - max_str_len=..., - max_bin_len=..., - max_array_len=..., - max_map_len=..., - max_ext_len=..., - ): ... - def feed(self, next_bytes): ... - def append_buffer(self, _buf, _buf_len): ... - def read_from_file(self): ... - def _unpack(self, execute, write_bytes, iter=...): ... - def read_bytes(self, nbytes): ... - def unpack(self, write_bytes=...): ... - def skip(self, write_bytes=...): ... - def read_array_header(self, write_bytes=...): ... - def read_map_header(self, write_bytes=...): ... - def __iter__(self): ... - def __next__(self): ... 
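Note: the stub above and the Cython extension deleted below are the reader side of the msgpack support that backed the deprecated ``read_msgpack``/``to_msgpack`` API retired by this diff. For historical context only, the round-trip those functions used to provide looked like the following; this runs only on pandas < 1.0, where they still exist, and the file name is purely illustrative::

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3]})
    df.to_msgpack('frame.msg')          # deprecated since 0.25.0, removed by this change
    roundtripped = pd.read_msgpack('frame.msg')
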
diff --git a/pandas/io/msgpack/_unpacker.pyx b/pandas/io/msgpack/_unpacker.pyx deleted file mode 100644 index cf9b2c7c04d42..0000000000000 --- a/pandas/io/msgpack/_unpacker.pyx +++ /dev/null @@ -1,495 +0,0 @@ -# coding: utf-8 -# cython: embedsignature=True - -from cython cimport Py_ssize_t - -from cpython.buffer cimport (PyBUF_SIMPLE, PyObject_GetBuffer, - PyBuffer_Release, Py_buffer) -from cpython.bytes cimport (PyBytes_Size, PyBytes_AsString, - PyBytes_FromStringAndSize) -from cpython.object cimport PyCallable_Check - -cdef extern from "Python.h": - ctypedef struct PyObject - -from libc.stdlib cimport free, malloc -from libc.string cimport memcpy, memmove -from libc.limits cimport INT_MAX - -from pandas.io.msgpack.exceptions import (BufferFull, OutOfData, - UnpackValueError, ExtraData) -from pandas.io.msgpack import ExtType - - -cdef extern from "../../src/msgpack/unpack.h": - ctypedef struct msgpack_user: - bint use_list - PyObject* object_hook - bint has_pairs_hook # call object_hook with k-v pairs - PyObject* list_hook - PyObject* ext_hook - char *encoding - char *unicode_errors - Py_ssize_t max_str_len - Py_ssize_t max_bin_len - Py_ssize_t max_array_len - Py_ssize_t max_map_len - Py_ssize_t max_ext_len - - ctypedef struct unpack_context: - msgpack_user user - PyObject* obj - size_t count - - ctypedef int (*execute_fn)(unpack_context* ctx, const char* data, - size_t len, size_t* off) except? -1 - execute_fn unpack_construct - execute_fn unpack_skip - execute_fn read_array_header - execute_fn read_map_header - void unpack_init(unpack_context* ctx) - object unpack_data(unpack_context* ctx) - -cdef inline init_ctx(unpack_context *ctx, - object object_hook, object object_pairs_hook, - object list_hook, object ext_hook, - bint use_list, char* encoding, char* unicode_errors, - Py_ssize_t max_str_len, Py_ssize_t max_bin_len, - Py_ssize_t max_array_len, Py_ssize_t max_map_len, - Py_ssize_t max_ext_len): - unpack_init(ctx) - ctx.user.use_list = use_list - ctx.user.object_hook = ctx.user.list_hook = NULL - ctx.user.max_str_len = max_str_len - ctx.user.max_bin_len = max_bin_len - ctx.user.max_array_len = max_array_len - ctx.user.max_map_len = max_map_len - ctx.user.max_ext_len = max_ext_len - - if object_hook is not None and object_pairs_hook is not None: - raise TypeError("object_pairs_hook and object_hook " - "are mutually exclusive.") - - if object_hook is not None: - if not PyCallable_Check(object_hook): - raise TypeError("object_hook must be a callable.") - ctx.user.object_hook = object_hook - - if object_pairs_hook is None: - ctx.user.has_pairs_hook = False - else: - if not PyCallable_Check(object_pairs_hook): - raise TypeError("object_pairs_hook must be a callable.") - ctx.user.object_hook = object_pairs_hook - ctx.user.has_pairs_hook = True - - if list_hook is not None: - if not PyCallable_Check(list_hook): - raise TypeError("list_hook must be a callable.") - ctx.user.list_hook = list_hook - - if ext_hook is not None: - if not PyCallable_Check(ext_hook): - raise TypeError("ext_hook must be a callable.") - ctx.user.ext_hook = ext_hook - - ctx.user.encoding = encoding - ctx.user.unicode_errors = unicode_errors - - -def default_read_extended_type(typecode, data): - raise NotImplementedError("Cannot decode extended type " - "with typecode={code}".format(code=typecode)) - - -def unpackb(object packed, object object_hook=None, object list_hook=None, - bint use_list=1, encoding=None, unicode_errors="strict", - object_pairs_hook=None, ext_hook=ExtType, - Py_ssize_t max_str_len=2147483647, # 
2**32-1 - Py_ssize_t max_bin_len=2147483647, - Py_ssize_t max_array_len=2147483647, - Py_ssize_t max_map_len=2147483647, - Py_ssize_t max_ext_len=2147483647): - """ - Unpack packed_bytes to object. Returns an unpacked object. - - Raises `ValueError` when `packed` contains extra bytes. - - See :class:`Unpacker` for options. - """ - cdef: - unpack_context ctx - size_t off = 0 - int ret - - char* buf - Py_ssize_t buf_len - char* cenc = NULL - char* cerr = NULL - Py_buffer view - bytes extra_bytes - - # GH#26769 Effectively re-implement deprecated PyObject_AsReadBuffer; - # based on https://xpra.org/trac/ticket/1884 - PyObject_GetBuffer(packed, &view, PyBUF_SIMPLE) - buf = view.buf - buf_len = view.len - - if encoding is not None: - if isinstance(encoding, unicode): - encoding = encoding.encode('ascii') - cenc = PyBytes_AsString(encoding) - - if unicode_errors is not None: - if isinstance(unicode_errors, unicode): - unicode_errors = unicode_errors.encode('ascii') - cerr = PyBytes_AsString(unicode_errors) - - init_ctx(&ctx, object_hook, object_pairs_hook, list_hook, ext_hook, - use_list, cenc, cerr, - max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len) - ret = unpack_construct(&ctx, buf, buf_len, &off) - if ret == 1: - obj = unpack_data(&ctx) - if off < buf_len: - extra_bytes = PyBytes_FromStringAndSize(buf + off, buf_len - off) - PyBuffer_Release(&view) - raise ExtraData(obj, extra_bytes) - PyBuffer_Release(&view) - return obj - else: - PyBuffer_Release(&view) - raise UnpackValueError("Unpack failed: error = {ret}".format(ret=ret)) - - -def unpack(object stream, object object_hook=None, object list_hook=None, - bint use_list=1, encoding=None, unicode_errors="strict", - object_pairs_hook=None, - ): - """ - Unpack an object from `stream`. - - Raises `ValueError` when `stream` has extra bytes. - - See :class:`Unpacker` for options. - """ - return unpackb(stream.read(), use_list=use_list, - object_hook=object_hook, - object_pairs_hook=object_pairs_hook, list_hook=list_hook, - encoding=encoding, unicode_errors=unicode_errors) - - -cdef class Unpacker: - """Streaming unpacker. - - arguments: - - :param file_like: - File-like object having `.read(n)` method. - If specified, unpacker reads serialized data from it and - :meth:`feed()` is not usable. - - :param int read_size: - Used as `file_like.read(read_size)`. (default: - `min(1024**2, max_buffer_size)`) - - :param bool use_list: - If true, unpack msgpack array to Python list. - Otherwise, unpack to Python tuple. (default: True) - - :param callable object_hook: - When specified, it should be callable. - Unpacker calls it with a dict argument after unpacking msgpack map. - - :param callable object_pairs_hook: - When specified, it should be callable. Unpacker calls it with a list - of key-value pairs after unpacking msgpack map. - - :param str encoding: - Encoding used for decoding msgpack raw. - If it is None (default), msgpack raw is deserialized to Python bytes. - - :param str unicode_errors: - Used for decoding msgpack raw with *encoding*. - (default: `'strict'`) - - :param int max_buffer_size: - Limits size of data waiting unpacked. 0 means system's - INT_MAX (default). Raises `BufferFull` exception when it - is insufficient. You should set this parameter when unpacking - data from untrasted source. - - :param int max_str_len: - Limits max length of str. (default: 2**31-1) - - :param int max_bin_len: - Limits max length of bin. (default: 2**31-1) - - :param int max_array_len: - Limits max length of array. 
(default: 2**31-1) - - :param int max_map_len: - Limits max length of map. (default: 2**31-1) - - - example of streaming deserialize from file-like object:: - - unpacker = Unpacker(file_like) - for o in unpacker: - process(o) - - example of streaming deserialize from socket:: - - unpacker = Unpacker() - while True: - buf = sock.recv(1024**2) - if not buf: - break - unpacker.feed(buf) - for o in unpacker: - process(o) - """ - cdef: - unpack_context ctx - char* buf - size_t buf_size, buf_head, buf_tail - object file_like - object file_like_read - Py_ssize_t read_size - # To maintain refcnt. - object object_hook, object_pairs_hook, list_hook, ext_hook - object encoding, unicode_errors - size_t max_buffer_size - - def __cinit__(self): - self.buf = NULL - - def __dealloc__(self): - free(self.buf) - self.buf = NULL - - def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, - object object_hook=None, object object_pairs_hook=None, - object list_hook=None, encoding=None, unicode_errors='strict', - int max_buffer_size=0, object ext_hook=ExtType, - Py_ssize_t max_str_len=2147483647, # 2**32-1 - Py_ssize_t max_bin_len=2147483647, - Py_ssize_t max_array_len=2147483647, - Py_ssize_t max_map_len=2147483647, - Py_ssize_t max_ext_len=2147483647): - cdef: - char *cenc=NULL, - char *cerr=NULL - - self.object_hook = object_hook - self.object_pairs_hook = object_pairs_hook - self.list_hook = list_hook - self.ext_hook = ext_hook - - self.file_like = file_like - if file_like: - self.file_like_read = file_like.read - if not PyCallable_Check(self.file_like_read): - raise TypeError("`file_like.read` must be a callable.") - if not max_buffer_size: - max_buffer_size = INT_MAX - if read_size > max_buffer_size: - raise ValueError("read_size should be less or " - "equal to max_buffer_size") - if not read_size: - read_size = min(max_buffer_size, 1024**2) - self.max_buffer_size = max_buffer_size - self.read_size = read_size - self.buf = malloc(read_size) - if self.buf == NULL: - raise MemoryError("Unable to allocate internal buffer.") - self.buf_size = read_size - self.buf_head = 0 - self.buf_tail = 0 - - if encoding is not None: - if isinstance(encoding, unicode): - self.encoding = encoding.encode('ascii') - elif isinstance(encoding, bytes): - self.encoding = encoding - else: - raise TypeError("encoding should be bytes or unicode") - cenc = PyBytes_AsString(self.encoding) - - if unicode_errors is not None: - if isinstance(unicode_errors, unicode): - self.unicode_errors = unicode_errors.encode('ascii') - elif isinstance(unicode_errors, bytes): - self.unicode_errors = unicode_errors - else: - raise TypeError("unicode_errors should be bytes or unicode") - cerr = PyBytes_AsString(self.unicode_errors) - - init_ctx(&self.ctx, object_hook, object_pairs_hook, list_hook, - ext_hook, use_list, cenc, cerr, - max_str_len, max_bin_len, max_array_len, - max_map_len, max_ext_len) - - def feed(self, object next_bytes): - """Append `next_bytes` to internal buffer.""" - cdef Py_buffer pybuff - if self.file_like is not None: - raise AssertionError("unpacker.feed() is not be able " - "to use with `file_like`.") - PyObject_GetBuffer(next_bytes, &pybuff, PyBUF_SIMPLE) - try: - self.append_buffer(pybuff.buf, pybuff.len) - finally: - PyBuffer_Release(&pybuff) - - cdef append_buffer(self, void* _buf, Py_ssize_t _buf_len): - cdef: - char* buf = self.buf - char* new_buf - size_t head = self.buf_head - size_t tail = self.buf_tail - size_t buf_size = self.buf_size - size_t new_size - - if tail + _buf_len > buf_size: - if ((tail 
- head) + _buf_len) <= buf_size: - # move to front. - memmove(buf, buf + head, tail - head) - tail -= head - head = 0 - else: - # expand buffer. - new_size = (tail - head) + _buf_len - if new_size > self.max_buffer_size: - raise BufferFull - new_size = min(new_size * 2, self.max_buffer_size) - new_buf = malloc(new_size) - if new_buf == NULL: - # self.buf still holds old buffer and will be freed during - # obj destruction - raise MemoryError("Unable to enlarge internal buffer.") - memcpy(new_buf, buf + head, tail - head) - free(buf) - - buf = new_buf - buf_size = new_size - tail -= head - head = 0 - - memcpy(buf + tail, (_buf), _buf_len) - self.buf = buf - self.buf_head = head - self.buf_size = buf_size - self.buf_tail = tail + _buf_len - - cdef read_from_file(self): - # Assume self.max_buffer_size - (self.buf_tail - self.buf_head) >= 0 - next_bytes = self.file_like_read( - min(self.read_size, - (self.max_buffer_size - - (self.buf_tail - self.buf_head)))) - if next_bytes: - self.append_buffer(PyBytes_AsString(next_bytes), - PyBytes_Size(next_bytes)) - else: - self.file_like = None - - cdef object _unpack(self, execute_fn execute, - object write_bytes, bint iter=0): - cdef: - int ret - object obj - size_t prev_head - - if self.buf_head >= self.buf_tail and self.file_like is not None: - self.read_from_file() - - while 1: - prev_head = self.buf_head - if prev_head >= self.buf_tail: - if iter: - raise StopIteration("No more data to unpack.") - else: - raise OutOfData("No more data to unpack.") - - ret = execute(&self.ctx, self.buf, self.buf_tail, &self.buf_head) - if write_bytes is not None: - write_bytes(PyBytes_FromStringAndSize( - self.buf + prev_head, self.buf_head - prev_head)) - - if ret == 1: - obj = unpack_data(&self.ctx) - unpack_init(&self.ctx) - return obj - elif ret == 0: - if self.file_like is not None: - self.read_from_file() - continue - if iter: - raise StopIteration("No more data to unpack.") - else: - raise OutOfData("No more data to unpack.") - else: - raise ValueError("Unpack failed: error = {ret}" - .format(ret=ret)) - - def read_bytes(self, Py_ssize_t nbytes): - """Read a specified number of raw bytes from the stream""" - cdef size_t nread - - # Assume that self.buf_tail - self.buf_head >= 0 - nread = min((self.buf_tail - self.buf_head), nbytes) - ret = PyBytes_FromStringAndSize(self.buf + self.buf_head, nread) - self.buf_head += nread - if len(ret) < nbytes and self.file_like is not None: - ret += self.file_like.read(nbytes - len(ret)) - return ret - - def unpack(self, object write_bytes=None): - """Unpack one object - - If write_bytes is not None, it will be called with parts of the raw - message as it is unpacked. - - Raises `OutOfData` when there are no more bytes to unpack. - """ - return self._unpack(unpack_construct, write_bytes) - - def skip(self, object write_bytes=None): - """Read and ignore one object, returning None - - If write_bytes is not None, it will be called with parts of the raw - message as it is unpacked. - - Raises `OutOfData` when there are no more bytes to unpack. - """ - return self._unpack(unpack_skip, write_bytes) - - def read_array_header(self, object write_bytes=None): - """assuming the next object is an array, return its size n, such that - the next n unpack() calls will iterate over its contents. - - Raises `OutOfData` when there are no more bytes to unpack. 
- """ - return self._unpack(read_array_header, write_bytes) - - def read_map_header(self, object write_bytes=None): - """assuming the next object is a map, return its size n, such that the - next n * 2 unpack() calls will iterate over its key-value pairs. - - Raises `OutOfData` when there are no more bytes to unpack. - """ - return self._unpack(read_map_header, write_bytes) - - def __iter__(self): - return self - - def __next__(self): - return self._unpack(unpack_construct, None, 1) - - # for debug. - # def _buf(self): - # return PyString_FromStringAndSize(self.buf, self.buf_tail) - - # def _off(self): - # return self.buf_head diff --git a/pandas/io/msgpack/_version.py b/pandas/io/msgpack/_version.py deleted file mode 100644 index 2c1c96c0759a1..0000000000000 --- a/pandas/io/msgpack/_version.py +++ /dev/null @@ -1 +0,0 @@ -version = (0, 4, 6) diff --git a/pandas/io/msgpack/exceptions.py b/pandas/io/msgpack/exceptions.py deleted file mode 100644 index 40f5a8af8f583..0000000000000 --- a/pandas/io/msgpack/exceptions.py +++ /dev/null @@ -1,31 +0,0 @@ -class UnpackException(Exception): - pass - - -class BufferFull(UnpackException): - pass - - -class OutOfData(UnpackException): - pass - - -class UnpackValueError(UnpackException, ValueError): - pass - - -class ExtraData(ValueError): - def __init__(self, unpacked, extra): - self.unpacked = unpacked - self.extra = extra - - def __str__(self): - return "unpack(b) received extra data." - - -class PackException(Exception): - pass - - -class PackValueError(PackException, ValueError): - pass diff --git a/pandas/io/orc.py b/pandas/io/orc.py new file mode 100644 index 0000000000000..bbefe447cb7fe --- /dev/null +++ b/pandas/io/orc.py @@ -0,0 +1,57 @@ +""" orc compat """ + +import distutils +from typing import TYPE_CHECKING, List, Optional + +from pandas._typing import FilePathOrBuffer + +from pandas.io.common import get_filepath_or_buffer + +if TYPE_CHECKING: + from pandas import DataFrame + + +def read_orc( + path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs, +) -> "DataFrame": + """ + Load an ORC object from the file path, returning a DataFrame. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + path : str, path object or file-like object + Any valid string path is acceptable. The string could be a URL. Valid + URL schemes include http, ftp, s3, and file. For file URLs, a host is + expected. A local file could be: + ``file://localhost/path/to/table.orc``. + + If you want to pass in a path object, pandas accepts any + ``os.PathLike``. + + By file-like object, we refer to objects with a ``read()`` method, + such as a file handler (e.g. via builtin ``open`` function) + or ``StringIO``. + columns : list, default None + If not None, only these columns will be read from the file. + **kwargs + Any additional kwargs are passed to pyarrow. 
+ + Returns + ------- + DataFrame + """ + + # we require a newer version of pyarrow than we support for parquet + import pyarrow + + if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0": + raise ImportError("pyarrow must be >= 0.13.0 for read_orc") + + import pyarrow.orc + + path, _, _, _ = get_filepath_or_buffer(path) + orc_file = pyarrow.orc.ORCFile(path) + result = orc_file.read(columns=columns, **kwargs).to_pandas() + return result diff --git a/pandas/io/packers.py b/pandas/io/packers.py deleted file mode 100644 index c0ace7996e1b9..0000000000000 --- a/pandas/io/packers.py +++ /dev/null @@ -1,865 +0,0 @@ -""" -Msgpack serializer support for reading and writing pandas data structures -to disk - -portions of msgpack_numpy package, by Lev Givon were incorporated -into this module (and tests_packers.py) - -License -======= - -Copyright (c) 2013, Lev Givon. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - -* Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. -* Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. -* Neither the name of Lev Givon nor the names of any - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-""" - -from datetime import date, datetime, timedelta -from io import BytesIO -import os -import warnings - -from dateutil.parser import parse -import numpy as np - -from pandas.compat._optional import import_optional_dependency -from pandas.errors import PerformanceWarning -from pandas.util._move import ( - BadMove as _BadMove, - move_into_mutable_buffer as _move_into_mutable_buffer, -) - -from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_datetime64tz_dtype, - is_object_dtype, - needs_i8_conversion, - pandas_dtype, -) - -from pandas import ( # noqa:F401 - Categorical, - CategoricalIndex, - DataFrame, - DatetimeIndex, - Float64Index, - Index, - Int64Index, - Interval, - IntervalIndex, - MultiIndex, - NaT, - Period, - PeriodIndex, - RangeIndex, - Series, - TimedeltaIndex, - Timestamp, -) -from pandas.core import internals -from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray -from pandas.core.arrays.sparse import BlockIndex, IntIndex -from pandas.core.generic import NDFrame -from pandas.core.internals import BlockManager, _safe_reshape, make_block - -from pandas.io.common import _stringify_path, get_filepath_or_buffer -from pandas.io.msgpack import ExtType, Packer as _Packer, Unpacker as _Unpacker - -# until we can pass this into our conversion functions, -# this is pretty hacky -compressor = None - - -def to_msgpack(path_or_buf, *args, **kwargs): - """ - msgpack (serialize) object to input file path - - .. deprecated:: 0.25.0 - - to_msgpack is deprecated and will be removed in a future version. - It is recommended to use pyarrow for on-the-wire transmission of - pandas objects. - - Example pyarrow usage: - - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame({'A': [1, 2, 3]}) - >>> context = pa.default_serialization_context() - >>> df_bytestring = context.serialize(df).to_buffer().to_pybytes() - - For documentation on pyarrow, see `here - `__. - - Parameters - ---------- - path_or_buf : string File path, buffer-like, or None - if None, return generated bytes - args : an object or objects to serialize - encoding : encoding for unicode objects - append : boolean whether to append to an existing msgpack - (default is False) - compress : type of compressor (zlib or blosc), default to None (no - compression) - """ - warnings.warn( - "to_msgpack is deprecated and will be removed in a " - "future version.\n" - "It is recommended to use pyarrow for on-the-wire " - "transmission of pandas objects.\n" - "For a full example, check\n" - "https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_msgpack.html", # noqa: E501 - FutureWarning, - stacklevel=3, - ) - - global compressor - compressor = kwargs.pop("compress", None) - append = kwargs.pop("append", None) - if append: - mode = "a+b" - else: - mode = "wb" - - def writer(fh): - for a in args: - fh.write(pack(a, **kwargs)) - - path_or_buf = _stringify_path(path_or_buf) - if isinstance(path_or_buf, str): - try: - with open(path_or_buf, mode) as fh: - writer(fh) - except FileNotFoundError: - msg = "File b'{}' does not exist".format(path_or_buf) - raise FileNotFoundError(msg) - elif path_or_buf is None: - buf = BytesIO() - writer(buf) - return buf.getvalue() - else: - writer(path_or_buf) - - -def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs): - """ - Load msgpack pandas object from the specified - file path. - - .. deprecated:: 0.25.0 - - read_msgpack is deprecated and will be removed in a future version. 
- It is recommended to use pyarrow for on-the-wire transmission of - pandas objects. - - Parameters - ---------- - path_or_buf : str, path object or file-like object - Any valid string path is acceptable. The string could be a URL. Valid - URL schemes include http, ftp, s3, and file. For file URLs, a host is - expected. - - If you want to pass in a path object, pandas accepts any - ``os.PathLike``. - - By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) or - ``StringIO``. - encoding : Encoding for decoding msgpack str type - iterator : boolean, if True, return an iterator to the unpacker - (default is False) - - Returns - ------- - obj : same type as object stored in file - - Notes - ----- - read_msgpack is only guaranteed to be backwards compatible to pandas - 0.20.3. - """ - warnings.warn( - "The read_msgpack is deprecated and will be removed in a " - "future version.\n" - "It is recommended to use pyarrow for on-the-wire " - "transmission of pandas objects.", - FutureWarning, - stacklevel=3, - ) - - path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf) - if iterator: - return Iterator(path_or_buf) - - def read(fh): - unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs)) - if len(unpacked_obj) == 1: - return unpacked_obj[0] - - if should_close: - try: - path_or_buf.close() - except IOError: - pass - return unpacked_obj - - # see if we have an actual file - if isinstance(path_or_buf, str): - try: - with open(path_or_buf, "rb") as fh: - return read(fh) - except FileNotFoundError: - msg = "File b'{}' does not exist".format(path_or_buf) - raise FileNotFoundError(msg) - - if isinstance(path_or_buf, bytes): - # treat as a binary-like - fh = None - try: - fh = BytesIO(path_or_buf) - return read(fh) - finally: - if fh is not None: - fh.close() - elif hasattr(path_or_buf, "read") and callable(path_or_buf.read): - # treat as a buffer like - return read(path_or_buf) - - raise ValueError("path_or_buf needs to be a string file path or file-like") - - -dtype_dict = { - 21: np.dtype("M8[ns]"), - "datetime64[ns]": np.dtype("M8[ns]"), - "datetime64[us]": np.dtype("M8[us]"), - 22: np.dtype("m8[ns]"), - "timedelta64[ns]": np.dtype("m8[ns]"), - "timedelta64[us]": np.dtype("m8[us]"), - # this is platform int, which we need to remap to np.int64 - # for compat on windows platforms - 7: np.dtype("int64"), - "category": "category", -} - - -def dtype_for(t): - """ return my dtype mapping, whether number or name """ - if t in dtype_dict: - return dtype_dict[t] - return np.typeDict.get(t, t) - - -c2f_dict = {"complex": np.float64, "complex128": np.float64, "complex64": np.float32} - -# windows (32 bit) compat -if hasattr(np, "float128"): - c2f_dict["complex256"] = np.float128 - - -def c2f(r, i, ctype_name): - """ - Convert strings to complex number instance with specified numpy type. - """ - - ftype = c2f_dict[ctype_name] - return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i)) - - -def convert(values): - """ convert the numpy values to a list """ - - dtype = values.dtype - - if is_categorical_dtype(values): - return values - - elif is_object_dtype(dtype): - return values.ravel().tolist() - - if needs_i8_conversion(dtype): - values = values.view("i8") - v = values.ravel() - - if compressor == "zlib": - zlib = import_optional_dependency( - "zlib", extra="zlib is required when `compress='zlib'`." 
- ) - - # return string arrays like they are - if dtype == np.object_: - return v.tolist() - - # convert to a bytes array - v = v.tostring() - return ExtType(0, zlib.compress(v)) - - elif compressor == "blosc": - blosc = import_optional_dependency( - "blosc", extra="zlib is required when `compress='blosc'`." - ) - - # return string arrays like they are - if dtype == np.object_: - return v.tolist() - - # convert to a bytes array - v = v.tostring() - return ExtType(0, blosc.compress(v, typesize=dtype.itemsize)) - - # ndarray (on original dtype) - return ExtType(0, v.tostring()) - - -def unconvert(values, dtype, compress=None): - - as_is_ext = isinstance(values, ExtType) and values.code == 0 - - if as_is_ext: - values = values.data - - if is_categorical_dtype(dtype): - return values - - elif is_object_dtype(dtype): - return np.array(values, dtype=object) - - dtype = pandas_dtype(dtype).base - - if not as_is_ext: - values = values.encode("latin1") - - if compress: - if compress == "zlib": - zlib = import_optional_dependency( - "zlib", extra="zlib is required when `compress='zlib'`." - ) - decompress = zlib.decompress - elif compress == "blosc": - blosc = import_optional_dependency( - "blosc", extra="zlib is required when `compress='blosc'`." - ) - decompress = blosc.decompress - else: - raise ValueError("compress must be one of 'zlib' or 'blosc'") - - try: - return np.frombuffer( - _move_into_mutable_buffer(decompress(values)), dtype=dtype - ) - except _BadMove as e: - # Pull the decompressed data off of the `_BadMove` exception. - # We don't just store this in the locals because we want to - # minimize the risk of giving users access to a `bytes` object - # whose data is also given to a mutable buffer. - values = e.args[0] - if len(values) > 1: - # The empty string and single characters are memoized in many - # string creating functions in the capi. This case should not - # warn even though we need to make a copy because we are only - # copying at most 1 byte. - warnings.warn( - "copying data after decompressing; this may mean that" - " decompress is caching its result", - PerformanceWarning, - ) - # fall through to copying `np.fromstring` - - # Copy the bytes into a numpy array. 
- buf = np.frombuffer(values, dtype=dtype) - buf = buf.copy() # required to not mutate the original data - buf.flags.writeable = True - return buf - - -def encode(obj): - """ - Data encoder - """ - tobj = type(obj) - if isinstance(obj, Index): - if isinstance(obj, RangeIndex): - return { - "typ": "range_index", - "klass": obj.__class__.__name__, - "name": getattr(obj, "name", None), - "start": obj._range.start, - "stop": obj._range.stop, - "step": obj._range.step, - } - elif isinstance(obj, PeriodIndex): - return { - "typ": "period_index", - "klass": obj.__class__.__name__, - "name": getattr(obj, "name", None), - "freq": getattr(obj, "freqstr", None), - "dtype": obj.dtype.name, - "data": convert(obj.asi8), - "compress": compressor, - } - elif isinstance(obj, DatetimeIndex): - tz = getattr(obj, "tz", None) - - # store tz info and data as UTC - if tz is not None: - tz = tz.zone - obj = obj.tz_convert("UTC") - return { - "typ": "datetime_index", - "klass": obj.__class__.__name__, - "name": getattr(obj, "name", None), - "dtype": obj.dtype.name, - "data": convert(obj.asi8), - "freq": getattr(obj, "freqstr", None), - "tz": tz, - "compress": compressor, - } - elif isinstance(obj, (IntervalIndex, IntervalArray)): - if isinstance(obj, IntervalIndex): - typ = "interval_index" - else: - typ = "interval_array" - return { - "typ": typ, - "klass": obj.__class__.__name__, - "name": getattr(obj, "name", None), - "left": getattr(obj, "left", None), - "right": getattr(obj, "right", None), - "closed": getattr(obj, "closed", None), - } - elif isinstance(obj, MultiIndex): - return { - "typ": "multi_index", - "klass": obj.__class__.__name__, - "names": getattr(obj, "names", None), - "dtype": obj.dtype.name, - "data": convert(obj.values), - "compress": compressor, - } - else: - return { - "typ": "index", - "klass": obj.__class__.__name__, - "name": getattr(obj, "name", None), - "dtype": obj.dtype.name, - "data": convert(obj.values), - "compress": compressor, - } - - elif isinstance(obj, Categorical): - return { - "typ": "category", - "klass": obj.__class__.__name__, - "name": getattr(obj, "name", None), - "codes": obj.codes, - "categories": obj.categories, - "ordered": obj.ordered, - "compress": compressor, - } - - elif isinstance(obj, Series): - return { - "typ": "series", - "klass": obj.__class__.__name__, - "name": getattr(obj, "name", None), - "index": obj.index, - "dtype": obj.dtype.name, - "data": convert(obj.values), - "compress": compressor, - } - elif issubclass(tobj, NDFrame): - data = obj._data - if not data.is_consolidated(): - data = data.consolidate() - - # the block manager - return { - "typ": "block_manager", - "klass": obj.__class__.__name__, - "axes": data.axes, - "blocks": [ - { - "locs": b.mgr_locs.as_array, - "values": convert(b.values), - "shape": b.values.shape, - "dtype": b.dtype.name, - "klass": b.__class__.__name__, - "compress": compressor, - } - for b in data.blocks - ], - } - - elif ( - isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64)) - or obj is NaT - ): - if isinstance(obj, Timestamp): - tz = obj.tzinfo - if tz is not None: - tz = tz.zone - freq = obj.freq - if freq is not None: - freq = freq.freqstr - return {"typ": "timestamp", "value": obj.value, "freq": freq, "tz": tz} - if obj is NaT: - return {"typ": "nat"} - elif isinstance(obj, np.timedelta64): - return {"typ": "timedelta64", "data": obj.view("i8")} - elif isinstance(obj, timedelta): - return { - "typ": "timedelta", - "data": (obj.days, obj.seconds, obj.microseconds), - } - elif isinstance(obj, 
np.datetime64): - return {"typ": "datetime64", "data": str(obj)} - elif isinstance(obj, datetime): - return {"typ": "datetime", "data": obj.isoformat()} - elif isinstance(obj, date): - return {"typ": "date", "data": obj.isoformat()} - raise Exception("cannot encode this datetimelike object: {obj}".format(obj=obj)) - elif isinstance(obj, Period): - return {"typ": "period", "ordinal": obj.ordinal, "freq": obj.freqstr} - elif isinstance(obj, Interval): - return { - "typ": "interval", - "left": obj.left, - "right": obj.right, - "closed": obj.closed, - } - elif isinstance(obj, BlockIndex): - return { - "typ": "block_index", - "klass": obj.__class__.__name__, - "blocs": obj.blocs, - "blengths": obj.blengths, - "length": obj.length, - } - elif isinstance(obj, IntIndex): - return { - "typ": "int_index", - "klass": obj.__class__.__name__, - "indices": obj.indices, - "length": obj.length, - } - elif isinstance(obj, np.ndarray): - return { - "typ": "ndarray", - "shape": obj.shape, - "ndim": obj.ndim, - "dtype": obj.dtype.name, - "data": convert(obj), - "compress": compressor, - } - elif isinstance(obj, np.number): - if np.iscomplexobj(obj): - return { - "typ": "np_scalar", - "sub_typ": "np_complex", - "dtype": obj.dtype.name, - "real": np.real(obj).__repr__(), - "imag": np.imag(obj).__repr__(), - } - else: - return {"typ": "np_scalar", "dtype": obj.dtype.name, "data": obj.__repr__()} - elif isinstance(obj, complex): - return { - "typ": "np_complex", - "real": np.real(obj).__repr__(), - "imag": np.imag(obj).__repr__(), - } - - return obj - - -def decode(obj): - """ - Decoder for deserializing numpy data types. - """ - - typ = obj.get("typ") - if typ is None: - return obj - elif typ == "timestamp": - freq = obj["freq"] if "freq" in obj else obj["offset"] - return Timestamp(obj["value"], tz=obj["tz"], freq=freq) - elif typ == "nat": - return NaT - elif typ == "period": - return Period(ordinal=obj["ordinal"], freq=obj["freq"]) - elif typ == "index": - dtype = dtype_for(obj["dtype"]) - data = unconvert(obj["data"], dtype, obj.get("compress")) - return Index(data, dtype=dtype, name=obj["name"]) - elif typ == "range_index": - return RangeIndex(obj["start"], obj["stop"], obj["step"], name=obj["name"]) - elif typ == "multi_index": - dtype = dtype_for(obj["dtype"]) - data = unconvert(obj["data"], dtype, obj.get("compress")) - data = [tuple(x) for x in data] - return MultiIndex.from_tuples(data, names=obj["names"]) - elif typ == "period_index": - data = unconvert(obj["data"], np.int64, obj.get("compress")) - d = dict(name=obj["name"], freq=obj["freq"]) - freq = d.pop("freq", None) - return PeriodIndex(PeriodArray(data, freq), **d) - - elif typ == "datetime_index": - data = unconvert(obj["data"], np.int64, obj.get("compress")) - d = dict(name=obj["name"], freq=obj["freq"]) - result = DatetimeIndex(data, **d) - tz = obj["tz"] - - # reverse tz conversion - if tz is not None: - result = result.tz_localize("UTC").tz_convert(tz) - return result - - elif typ in ("interval_index", "interval_array"): - return globals()[obj["klass"]].from_arrays( - obj["left"], obj["right"], obj["closed"], name=obj["name"] - ) - elif typ == "category": - from_codes = globals()[obj["klass"]].from_codes - return from_codes( - codes=obj["codes"], categories=obj["categories"], ordered=obj["ordered"] - ) - - elif typ == "interval": - return Interval(obj["left"], obj["right"], obj["closed"]) - elif typ == "series": - dtype = dtype_for(obj["dtype"]) - index = obj["index"] - data = unconvert(obj["data"], dtype, obj["compress"]) - return 
Series(data, index=index, dtype=dtype, name=obj["name"]) - - elif typ == "block_manager": - axes = obj["axes"] - - def create_block(b): - values = _safe_reshape( - unconvert(b["values"], dtype_for(b["dtype"]), b["compress"]), b["shape"] - ) - - # locs handles duplicate column names, and should be used instead - # of items; see GH 9618 - if "locs" in b: - placement = b["locs"] - else: - placement = axes[0].get_indexer(b["items"]) - - if is_datetime64tz_dtype(b["dtype"]): - assert isinstance(values, np.ndarray), type(values) - assert values.dtype == "M8[ns]", values.dtype - values = DatetimeArray(values, dtype=b["dtype"]) - - return make_block( - values=values, - klass=getattr(internals, b["klass"]), - placement=placement, - dtype=b["dtype"], - ) - - blocks = [create_block(b) for b in obj["blocks"]] - return globals()[obj["klass"]](BlockManager(blocks, axes)) - elif typ == "datetime": - return parse(obj["data"]) - elif typ == "datetime64": - return np.datetime64(parse(obj["data"])) - elif typ == "date": - return parse(obj["data"]).date() - elif typ == "timedelta": - return timedelta(*obj["data"]) - elif typ == "timedelta64": - return np.timedelta64(int(obj["data"])) - elif typ == "block_index": - return globals()[obj["klass"]](obj["length"], obj["blocs"], obj["blengths"]) - elif typ == "int_index": - return globals()[obj["klass"]](obj["length"], obj["indices"]) - elif typ == "ndarray": - return unconvert( - obj["data"], np.typeDict[obj["dtype"]], obj.get("compress") - ).reshape(obj["shape"]) - elif typ == "np_scalar": - if obj.get("sub_typ") == "np_complex": - return c2f(obj["real"], obj["imag"], obj["dtype"]) - else: - dtype = dtype_for(obj["dtype"]) - try: - return dtype(obj["data"]) - except (ValueError, TypeError): - return dtype.type(obj["data"]) - elif typ == "np_complex": - return complex(obj["real"] + "+" + obj["imag"] + "j") - elif isinstance(obj, (dict, list, set)): - return obj - else: - return obj - - -def pack( - o, - default=encode, - encoding="utf-8", - unicode_errors="strict", - use_single_float=False, - autoreset=1, - use_bin_type=1, -): - """ - Pack an object and return the packed bytes. 
- """ - - return Packer( - default=default, - encoding=encoding, - unicode_errors=unicode_errors, - use_single_float=use_single_float, - autoreset=autoreset, - use_bin_type=use_bin_type, - ).pack(o) - - -def unpack( - packed, - object_hook=decode, - list_hook=None, - use_list=False, - encoding="utf-8", - unicode_errors="strict", - object_pairs_hook=None, - max_buffer_size=0, - ext_hook=ExtType, -): - """ - Unpack a packed object, return an iterator - Note: packed lists will be returned as tuples - """ - - return Unpacker( - packed, - object_hook=object_hook, - list_hook=list_hook, - use_list=use_list, - encoding=encoding, - unicode_errors=unicode_errors, - object_pairs_hook=object_pairs_hook, - max_buffer_size=max_buffer_size, - ext_hook=ext_hook, - ) - - -class Packer(_Packer): - def __init__( - self, - default=encode, - encoding="utf-8", - unicode_errors="strict", - use_single_float=False, - autoreset=1, - use_bin_type=1, - ): - super().__init__( - default=default, - encoding=encoding, - unicode_errors=unicode_errors, - use_single_float=use_single_float, - autoreset=autoreset, - use_bin_type=use_bin_type, - ) - - -class Unpacker(_Unpacker): - def __init__( - self, - file_like=None, - read_size=0, - use_list=False, - object_hook=decode, - object_pairs_hook=None, - list_hook=None, - encoding="utf-8", - unicode_errors="strict", - max_buffer_size=0, - ext_hook=ExtType, - ): - super().__init__( - file_like=file_like, - read_size=read_size, - use_list=use_list, - object_hook=object_hook, - object_pairs_hook=object_pairs_hook, - list_hook=list_hook, - encoding=encoding, - unicode_errors=unicode_errors, - max_buffer_size=max_buffer_size, - ext_hook=ext_hook, - ) - - -class Iterator: - """ manage the unpacking iteration, - close the file on completion """ - - def __init__(self, path, **kwargs): - self.path = path - self.kwargs = kwargs - - def __iter__(self): - - needs_closing = True - try: - - # see if we have an actual file - if isinstance(self.path, str): - - try: - path_exists = os.path.exists(self.path) - except TypeError: - path_exists = False - - if path_exists: - fh = open(self.path, "rb") - else: - fh = BytesIO(self.path) - - else: - - if not hasattr(self.path, "read"): - fh = BytesIO(self.path) - - else: - - # a file-like - needs_closing = False - fh = self.path - - unpacker = unpack(fh) - for o in unpacker: - yield o - finally: - if needs_closing: - fh.close() diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 69ee6583d12c8..3a686a1a3b122 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,5 +1,6 @@ """ parquet compat """ +from typing import Any, Dict, Optional from warnings import catch_warnings from pandas.compat._optional import import_optional_dependency @@ -10,7 +11,7 @@ from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url -def get_engine(engine): +def get_engine(engine: str) -> "BaseImpl": """ return our implementation """ if engine == "auto": @@ -35,21 +36,17 @@ def get_engine(engine): "support" ) - if engine not in ["pyarrow", "fastparquet"]: - raise ValueError("engine must be one of 'pyarrow', 'fastparquet'") - if engine == "pyarrow": return PyArrowImpl() elif engine == "fastparquet": return FastParquetImpl() + raise ValueError("engine must be one of 'pyarrow', 'fastparquet'") -class BaseImpl: - - api = None # module +class BaseImpl: @staticmethod - def validate_dataframe(df): + def validate_dataframe(df: DataFrame): if not isinstance(df, DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") @@ -65,7 +62,7 
@@ def validate_dataframe(df): if not valid_names: raise ValueError("Index level names must be strings") - def write(self, df, path, compression, **kwargs): + def write(self, df: DataFrame, path, compression, **kwargs): raise AbstractMethodError(self) def read(self, path, columns=None, **kwargs): @@ -74,30 +71,33 @@ def read(self, path, columns=None, **kwargs): class PyArrowImpl(BaseImpl): def __init__(self): - pyarrow = import_optional_dependency( + import_optional_dependency( "pyarrow", extra="pyarrow is required for parquet support." ) import pyarrow.parquet + # import utils to register the pyarrow extension types + import pandas.core.arrays._arrow_utils # noqa + self.api = pyarrow def write( self, - df, + df: DataFrame, path, compression="snappy", coerce_timestamps="ms", - index=None, + index: Optional[bool] = None, partition_cols=None, - **kwargs + **kwargs, ): self.validate_dataframe(df) path, _, _, _ = get_filepath_or_buffer(path, mode="wb") - if index is None: - from_pandas_kwargs = {} - else: - from_pandas_kwargs = {"preserve_index": index} + from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)} + if index is not None: + from_pandas_kwargs["preserve_index"] = index + table = self.api.Table.from_pandas(df, **from_pandas_kwargs) if partition_cols is not None: self.api.parquet.write_to_dataset( @@ -106,7 +106,7 @@ def write( compression=compression, coerce_timestamps=coerce_timestamps, partition_cols=partition_cols, - **kwargs + **kwargs, ) else: self.api.parquet.write_table( @@ -114,7 +114,7 @@ def write( path, compression=compression, coerce_timestamps=coerce_timestamps, - **kwargs + **kwargs, ) def read(self, path, columns=None, **kwargs): @@ -140,7 +140,13 @@ def __init__(self): self.api = fastparquet def write( - self, df, path, compression="snappy", index=None, partition_cols=None, **kwargs + self, + df: DataFrame, + path, + compression="snappy", + index=None, + partition_cols=None, + **kwargs, ): self.validate_dataframe(df) # thriftpy/protocol/compact.py:339: @@ -176,7 +182,7 @@ def write( compression=compression, write_index=index, partition_on=partition_cols, - **kwargs + **kwargs, ) def read(self, path, columns=None, **kwargs): @@ -199,19 +205,20 @@ def read(self, path, columns=None, **kwargs): def to_parquet( - df, + df: DataFrame, path, - engine="auto", + engine: str = "auto", compression="snappy", - index=None, + index: Optional[bool] = None, partition_cols=None, - **kwargs + **kwargs, ): """ Write a DataFrame to the parquet format. Parameters ---------- + df : DataFrame path : str File path or Root Directory path. Will be used as Root Directory path while writing a partitioned dataset. @@ -236,7 +243,7 @@ def to_parquet( .. versionadded:: 0.24.0 - partition_cols : list, optional, default None + partition_cols : str or list, optional, default None Column names by which to partition the dataset Columns are partitioned in the order they are given @@ -245,6 +252,8 @@ def to_parquet( kwargs Additional keyword arguments passed to the engine """ + if isinstance(partition_cols, str): + partition_cols = [partition_cols] impl = get_engine(engine) return impl.write( df, @@ -252,11 +261,11 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, - **kwargs + **kwargs, ) -def read_parquet(path, engine="auto", columns=None, **kwargs): +def read_parquet(path, engine: str = "auto", columns=None, **kwargs): """ Load a parquet object from the file path, returning a DataFrame. 
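
The parquet hunks above let `partition_cols` be passed as a single string (it is normalized to a one-element list before reaching the engine) and document that `read_parquet` also accepts a directory of partitioned files; the pyarrow writer additionally forwards an optional `schema=` keyword on to `Table.from_pandas`. A minimal sketch of the round trip, assuming pyarrow is installed and using placeholder paths:

```python
import pandas as pd

df = pd.DataFrame({"year": [2019, 2019, 2020], "value": [1.0, 2.0, 3.0]})

# partition_cols may now be a plain string; it is wrapped into ["year"] internally
df.to_parquet("/tmp/example_dataset", engine="pyarrow", partition_cols="year")

# the target is a directory of partitioned files, which read_parquet reads directly
result = pd.read_parquet("/tmp/example_dataset", engine="pyarrow")
```
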
@@ -269,6 +278,10 @@ def read_parquet(path, engine="auto", columns=None, **kwargs): URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.parquet``. + A file URL can also be a path to a directory that contains multiple + partitioned parquet files. Both pyarrow and fastparquet support + paths to directories as well as file URLs. A directory path could be: + ``file://localhost/path/to/tables`` If you want to pass in a path object, pandas accepts any ``os.PathLike``. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 058d65b9464ae..b4eb2fb1411d0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,10 +2,10 @@ Module contains tools for processing files into DataFrames or other objects """ -from collections import defaultdict +from collections import abc, defaultdict import csv import datetime -from io import StringIO +from io import BufferedIOBase, StringIO, TextIOWrapper import re import sys from textwrap import fill @@ -17,7 +17,9 @@ import pandas._libs.lib as lib import pandas._libs.ops as libops import pandas._libs.parsers as parsers +from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing +from pandas._typing import FilePathOrBuffer from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -34,6 +36,7 @@ is_categorical_dtype, is_dtype_equal, is_extension_array_dtype, + is_file_like, is_float, is_integer, is_integer_dtype, @@ -46,24 +49,23 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna -from pandas._typing import FilePathOrBuffer from pandas.core import algorithms from pandas.core.arrays import Categorical from pandas.core.frame import DataFrame -from pandas.core.index import Index, MultiIndex, RangeIndex, ensure_index_from_sequences +from pandas.core.indexes.api import ( + Index, + MultiIndex, + RangeIndex, + ensure_index_from_sequences, +) from pandas.core.series import Series from pandas.core.tools import datetimes as tools from pandas.io.common import ( - _NA_VALUES, - BaseIterator, - UnicodeReader, - UTF8Recoder, - _get_handle, - _infer_compression, - _validate_header_arg, get_filepath_or_buffer, - is_file_like, + get_handle, + infer_compression, + validate_header_arg, ) from pandas.io.date_converters import generic_parser @@ -81,7 +83,7 @@ into chunks. Additional help can be found in the online docs for -`IO Tools `_. +`IO Tools `_. Parameters ---------- @@ -119,9 +121,9 @@ ``skip_blank_lines=True``, so ``header=0`` denotes the first line of data rather than the first line of the file. names : array-like, optional - List of column names to use. If file contains no header row, then you - should explicitly pass ``header=None``. Duplicates in this list are not - allowed. + List of column names to use. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. index_col : int, str, sequence of int / str, or False, default ``None`` Column(s) to use as the row labels of the ``DataFrame``, either given as string name or column index. If a sequence of int / str is given, a @@ -190,7 +192,7 @@ Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: '""" - + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + """'. 
keep_default_na : bool, default True Whether or not to include the default NaN values when parsing the data. @@ -269,7 +271,7 @@ chunksize : int, optional Return TextFileReader object for iteration. See the `IO Tools docs - `_ + `_ for more information on ``iterator`` and ``chunksize``. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and @@ -378,9 +380,7 @@ def _validate_integer(name, val, min_val=0): min_val : int Minimum allowed value (val < min_val will result in a ValueError) """ - msg = "'{name:s}' must be an integer >={min_val:d}".format( - name=name, min_val=min_val - ) + msg = f"'{name:s}' must be an integer >={min_val:d}" if val is not None: if is_float(val): @@ -395,25 +395,22 @@ def _validate_integer(name, val, min_val=0): def _validate_names(names): """ - Check if the `names` parameter contains duplicates. - - If duplicates are found, we issue a warning before returning. + Raise ValueError if the `names` parameter contains duplicates. Parameters ---------- names : array-like or None An array containing a list of the names used for the output DataFrame. - Returns - ------- - names : array-like or None - The original `names` parameter. + Raises + ------ + ValueError + If names are not unique. """ if names is not None: if len(names) != len(set(names)): raise ValueError("Duplicate names are not allowed.") - return names def _read(filepath_or_buffer: FilePathOrBuffer, kwds): @@ -424,7 +421,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): kwds["encoding"] = encoding compression = kwds.get("compression", "infer") - compression = _infer_compression(filepath_or_buffer, compression) + compression = infer_compression(filepath_or_buffer, compression) # TODO: get_filepath_or_buffer could return # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] @@ -491,7 +488,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): "cache_dates": True, "thousands": None, "comment": None, - "decimal": b".", + "decimal": ".", # 'engine': 'c', "parse_dates": False, "keep_date_col": False, @@ -525,8 +522,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} -_deprecated_defaults = {} # type: Dict[str, Any] -_deprecated_args = set() # type: Set[str] +_deprecated_defaults: Dict[str, Any] = {} +_deprecated_args: Set[str] = set() def _make_parser_function(name, default_sep=","): @@ -571,7 +568,7 @@ def parser_f( # Quoting, Compression, and File Format compression="infer", thousands=None, - decimal=b".", + decimal: str = ".", lineterminator=None, quotechar='"', quoting=csv.QUOTE_MINIMAL, @@ -614,9 +611,9 @@ def parser_f( if delim_whitespace and delimiter != default_sep: raise ValueError( - "Specified a delimiter with both sep and" - " delim_whitespace=True; you can only" - " specify one." + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only " + "specify one." ) if engine is not None: @@ -687,7 +684,7 @@ def parser_f( read_csv = Appender( _doc_read_csv_and_table.format( func_name="read_csv", - summary=("Read a comma-separated values (csv) file into DataFrame."), + summary="Read a comma-separated values (csv) file into DataFrame.", _default_sep="','", ) )(read_csv) @@ -707,7 +704,7 @@ def read_fwf( colspecs="infer", widths=None, infer_nrows=100, - **kwds + **kwds, ): r""" @@ -717,7 +714,7 @@ def read_fwf( into chunks. Additional help can be found in the `online docs for IO Tools - `_. + `_. 
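
As an aside to the `read_csv` docstring hunks above: the `names` description now states that `header=0` must be passed explicitly when the file already has a header row, and `_validate_names` raises a ValueError on duplicates instead of returning the list. A small sketch with invented in-memory data:

```python
import io
import pandas as pd

data = io.StringIO("a,b\n1,2\n3,4\n")

# the file has a header row, so pass header=0 when overriding the column names
df = pd.read_csv(data, names=["x", "y"], header=0)

# duplicate entries in `names` raise ValueError ("Duplicate names are not allowed.")
try:
    pd.read_csv(io.StringIO("a,b\n1,2\n"), names=["x", "x"])
except ValueError as err:
    print(err)
```
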
Parameters ---------- @@ -785,7 +782,7 @@ def read_fwf( return _read(filepath_or_buffer, kwds) -class TextFileReader(BaseIterator): +class TextFileReader(abc.Iterator): """ Passed dialect overrides any of the related parser options @@ -822,11 +819,7 @@ def __init__(self, f, engine=None, **kwds): try: dialect_val = getattr(dialect, param) except AttributeError: - raise ValueError( - "Invalid dialect '{dialect}' provided".format( - dialect=kwds["dialect"] - ) - ) + raise ValueError(f"Invalid dialect {kwds['dialect']} provided") parser_default = _parser_defaults[param] provided = kwds.get(param, parser_default) @@ -838,11 +831,9 @@ def __init__(self, f, engine=None, **kwds): # even if it conflicts with the dialect (gh-23761). if provided != parser_default and provided != dialect_val: msg = ( - "Conflicting values for '{param}': '{val}' was " - "provided, but the dialect specifies '{diaval}'. " - "Using the dialect-specified value.".format( - param=param, val=provided, diaval=dialect_val - ) + f"Conflicting values for '{param}': '{provided}' was " + f"provided, but the dialect specifies '{dialect_val}'. " + "Using the dialect-specified value." ) # Annoying corner case for not warning about @@ -916,8 +907,8 @@ def _get_options_with_defaults(self, engine): pass else: raise ValueError( - "The %r option is not supported with the" - " %r engine" % (argname, engine) + f"The {repr(argname)} option is not supported with the" + f" {repr(engine)} engine" ) else: value = _deprecated_defaults.get(argname, default) @@ -964,8 +955,8 @@ def _clean_options(self, options, engine): if sep is None and not delim_whitespace: if engine == "c": fallback_reason = ( - "the 'c' engine does not support" - " sep=None with delim_whitespace=False" + "the 'c' engine does not support " + "sep=None with delim_whitespace=False" ) engine = "python" elif sep is not None and len(sep) > 1: @@ -975,10 +966,10 @@ def _clean_options(self, options, engine): elif engine not in ("python", "python-fwf"): # wait until regex engine integrated fallback_reason = ( - "the 'c' engine does not support" - " regex separators (separators > 1 char and" - r" different from '\s+' are" - " interpreted as regex)" + "the 'c' engine does not support " + "regex separators (separators > 1 char and " + r"different from '\s+' are " + "interpreted as regex)" ) engine = "python" elif delim_whitespace: @@ -993,9 +984,9 @@ def _clean_options(self, options, engine): encodeable = False if not encodeable and engine not in ("python", "python-fwf"): fallback_reason = ( - "the separator encoded in {encoding}" - " is > 1 char long, and the 'c' engine" - " does not support such separators".format(encoding=encoding) + f"the separator encoded in {encoding} " + "is > 1 char long, and the 'c' engine " + "does not support such separators" ) engine = "python" @@ -1024,22 +1015,20 @@ def _clean_options(self, options, engine): if "python" in engine: for arg in _python_unsupported: if fallback_reason and result[arg] != _c_parser_defaults[arg]: - msg = ( - "Falling back to the 'python' engine because" - " {reason}, but this causes {option!r} to be" - " ignored as it is not supported by the 'python'" - " engine." - ).format(reason=fallback_reason, option=arg) - raise ValueError(msg) + raise ValueError( + "Falling back to the 'python' engine because " + f"{fallback_reason}, but this causes {repr(arg)} to be " + "ignored as it is not supported by the 'python' engine." 
+ ) del result[arg] if fallback_reason: warnings.warn( ( - "Falling back to the 'python' engine because" - " {0}; you can avoid this warning by specifying" - " engine='python'." - ).format(fallback_reason), + "Falling back to the 'python' engine because " + f"{fallback_reason}; you can avoid this warning by specifying " + "engine='python'." + ), ParserWarning, stacklevel=5, ) @@ -1050,7 +1039,7 @@ def _clean_options(self, options, engine): na_values = options["na_values"] skiprows = options["skiprows"] - _validate_header_arg(options["header"]) + validate_header_arg(options["header"]) depr_warning = "" @@ -1059,8 +1048,8 @@ def _clean_options(self, options, engine): depr_default = _deprecated_defaults[arg] msg = ( - "The '{arg}' argument has been deprecated " - "and will be removed in a future version.".format(arg=arg) + f"The {repr(arg)} argument has been deprecated and will be " + "removed in a future version." ) if result.get(arg, depr_default) != depr_default: @@ -1084,9 +1073,8 @@ def _clean_options(self, options, engine): if converters is not None: if not isinstance(converters, dict): raise TypeError( - "Type converters must be a dict or" - " subclass, input was " - "a {0!r}".format(type(converters).__name__) + "Type converters must be a dict or subclass, " + f"input was a {type(converters).__name__}" ) else: converters = {} @@ -1131,9 +1119,9 @@ def _make_engine(self, engine="c"): klass = FixedWidthFieldParser else: raise ValueError( - "Unknown engine: {engine} (valid options are" - ' "c", "python", or' - ' "python-fwf")'.format(engine=engine) + f"Unknown engine: {engine} (valid options are " + '"c", "python", or ' + '"python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -1243,7 +1231,7 @@ def _validate_usecols_names(usecols, names): if len(missing) > 0: raise ValueError( "Usecols do not match columns, " - "columns expected but not found: {missing}".format(missing=missing) + f"columns expected but not found: {missing}" ) return usecols @@ -1544,11 +1532,9 @@ def _maybe_dedup_names(self, names): counts[col] = cur_count + 1 if is_potential_mi: - col = col[:-1] + ( - "{column}.{count}".format(column=col[-1], count=cur_count), - ) + col = col[:-1] + (f"{col[-1]}.{cur_count}",) else: - col = "{column}.{count}".format(column=col, count=cur_count) + col = f"{col}.{cur_count}" cur_count = counts[col] names[i] = col @@ -1594,7 +1580,7 @@ def _get_simple_index(self, data, columns): def ix(col): if not isinstance(col, str): return col - raise ValueError("Index {col} invalid".format(col=col)) + raise ValueError(f"Index {col} invalid") to_remove = [] index = [] @@ -1605,7 +1591,7 @@ def ix(col): # remove index items from content and columns, don't pop in # loop - for i in reversed(sorted(to_remove)): + for i in sorted(to_remove, reverse=True): data.pop(i) if not self._implicit_index: columns.pop(i) @@ -1618,11 +1604,7 @@ def _get_name(icol): return icol if col_names is None: - raise ValueError( - ("Must supply column order to use {icol!s} as index").format( - icol=icol - ) - ) + raise ValueError(f"Must supply column order to use {icol!s} as index") for i, c in enumerate(col_names): if i == icol: @@ -1637,7 +1619,7 @@ def _get_name(icol): # remove index items from content and columns, don't pop in # loop - for c in reversed(sorted(to_remove)): + for c in sorted(to_remove, reverse=True): data.pop(c) col_names.remove(c) @@ -1698,9 +1680,9 @@ def _convert_to_ndarrays( warnings.warn( ( "Both a converter and dtype were specified " - "for column {0} - only the converter will " + f"for column 
{c} - only the converter will " "be used" - ).format(c), + ), ParserWarning, stacklevel=7, ) @@ -1738,10 +1720,7 @@ def _convert_to_ndarrays( and not is_categorical_dtype(cast_type) and na_count > 0 ): - raise ValueError( - "Bool column has NA values in " - "column {column}".format(column=c) - ) + raise ValueError(f"Bool column has NA values in column {c}") except (AttributeError, TypeError): # invalid input to is_bool_dtype pass @@ -1749,11 +1728,7 @@ def _convert_to_ndarrays( result[c] = cvals if verbose and na_count: - print( - "Filled {count} NA values in column {c!s}".format( - count=na_count, c=c - ) - ) + print(f"Filled {na_count} NA values in column {c!s}") return result def _infer_types(self, values, na_values, try_num_bool=True): @@ -1850,9 +1825,9 @@ def _cast_types(self, values, cast_type, column): return array_type._from_sequence_of_strings(values, dtype=cast_type) except NotImplementedError: raise NotImplementedError( - "Extension Array: {ea} must implement " + f"Extension Array: {array_type} must implement " "_from_sequence_of_strings in order " - "to be used in parser methods".format(ea=array_type) + "to be used in parser methods" ) else: @@ -1860,8 +1835,7 @@ def _cast_types(self, values, cast_type, column): values = astype_nansafe(values, cast_type, copy=True, skipna=True) except ValueError: raise ValueError( - "Unable to convert column {column} to type " - "{cast_type}".format(column=column, cast_type=cast_type) + f"Unable to convert column {column} to type {cast_type}" ) return values @@ -1893,12 +1867,18 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""): - # if source is utf-16 plain text, convert source to utf-8 + encoding = kwds.get("encoding") + + if kwds.get("compression") is None and encoding: if isinstance(src, str): src = open(src, "rb") self.handles.append(src) - src = UTF8Recoder(src, kwds["encoding"]) + + # Handle the file object with universal line mode enabled. + # We will handle the newline character ourselves later on. + if isinstance(src, BufferedIOBase): + src = TextIOWrapper(src, encoding=encoding, newline="") + kwds["encoding"] = "utf-8" # #2442 @@ -1918,7 +1898,12 @@ def __init__(self, src, **kwds): else: if len(self._reader.header) > 1: # we have a multi index in the columns - self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( # noqa: E501 + ( + self.names, + self.index_names, + self.col_names, + passed_names, + ) = self._extract_multi_indexer_columns( self._reader.header, self.index_names, self.col_names, passed_names ) else: @@ -1927,8 +1912,7 @@ def __init__(self, src, **kwds): if self.names is None: if self.prefix: self.names = [ - "{prefix}{i}".format(prefix=self.prefix, i=i) - for i in range(self._reader.table_width) + f"{self.prefix}{i}" for i in range(self._reader.table_width) ] else: self.names = list(range(self._reader.table_width)) @@ -2225,8 +2209,6 @@ class PythonParser(ParserBase): def __init__(self, f, **kwds): """ Workhorse function for processing nested list into DataFrame - - Should be replaced by np.genfromtxt eventually? """ ParserBase.__init__(self, kwds) @@ -2279,7 +2261,7 @@ def __init__(self, f, **kwds): self.comment = kwds["comment"] self._comment_lines = [] - f, handles = _get_handle( + f, handles = get_handle( f, "r", encoding=self.encoding, @@ -2307,7 +2289,12 @@ def __init__(self, f, **kwds): # The original set is stored in self.original_columns. 
if len(self.columns) > 1: # we are processing a multi index column - self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( # noqa: E501 + ( + self.columns, + self.index_names, + self.col_names, + _, + ) = self._extract_multi_indexer_columns( self.columns, self.index_names, self.col_names ) # Update list of original names to include all indices. @@ -2338,15 +2325,9 @@ def __init__(self, f, **kwds): raise ValueError("Only length-1 decimal markers supported") if self.thousands is None: - self.nonnum = re.compile( - r"[^-^0-9^{decimal}]+".format(decimal=self.decimal) - ) + self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+") else: - self.nonnum = re.compile( - r"[^-^0-9^{thousands}^{decimal}]+".format( - thousands=self.thousands, decimal=self.decimal - ) - ) + self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+") def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands @@ -2422,23 +2403,13 @@ class MyDialect(csv.Dialect): self.line_pos += 1 sniffed = csv.Sniffer().sniff(line) dia.delimiter = sniffed.delimiter - if self.encoding is not None: - self.buf.extend( - list( - UnicodeReader( - StringIO(line), dialect=dia, encoding=self.encoding - ) - ) - ) - else: - self.buf.extend(list(csv.reader(StringIO(line), dialect=dia))) - if self.encoding is not None: - reader = UnicodeReader( - f, dialect=dia, encoding=self.encoding, strict=True - ) - else: - reader = csv.reader(f, dialect=dia, strict=True) + # Note: self.encoding is irrelevant here + line_rdr = csv.reader(StringIO(line), dialect=dia) + self.buf.extend(list(line_rdr)) + + # Note: self.encoding is irrelevant here + reader = csv.reader(f, dialect=dia, strict=True) else: @@ -2592,8 +2563,8 @@ def _infer_columns(self): except StopIteration: if self.line_pos < hr: raise ValueError( - "Passed header={hr} but only {pos} lines in " - "file".format(hr=hr, pos=(self.line_pos + 1)) + f"Passed header={hr} but only {self.line_pos + 1} lines in " + "file" ) # We have an empty file, so check @@ -2616,11 +2587,9 @@ def _infer_columns(self): for i, c in enumerate(line): if c == "": if have_mi_columns: - col_name = "Unnamed: {i}_level_{level}".format( - i=i, level=level - ) + col_name = f"Unnamed: {i}_level_{level}" else: - col_name = "Unnamed: {i}".format(i=i) + col_name = f"Unnamed: {i}" this_unnamed_cols.append(i) this_columns.append(col_name) @@ -2635,7 +2604,7 @@ def _infer_columns(self): while cur_count > 0: counts[col] = cur_count + 1 - col = "{column}.{count}".format(column=col, count=cur_count) + col = f"{col}.{cur_count}" cur_count = counts[col] this_columns[i] = col @@ -2700,12 +2669,7 @@ def _infer_columns(self): if not names: if self.prefix: - columns = [ - [ - "{prefix}{idx}".format(prefix=self.prefix, idx=i) - for i in range(ncols) - ] - ] + columns = [[f"{self.prefix}{i}" for i in range(ncols)]] else: columns = [list(range(ncols))] columns = self._handle_usecols(columns, columns[0]) @@ -2907,7 +2871,7 @@ def _alert_malformed(self, msg, row_num): if self.error_bad_lines: raise ParserError(msg) elif self.warn_bad_lines: - base = "Skipping line {row_num}: ".format(row_num=row_num) + base = f"Skipping line {row_num}: " sys.stderr.write(base + msg + "\n") def _next_iter_line(self, row_num): @@ -3131,10 +3095,8 @@ def _rows_to_cols(self, content): for row_num, actual_len in bad_lines: msg = ( - "Expected {col_len} fields in line {line}, saw " - "{length}".format( - col_len=col_len, line=(row_num + 1), length=actual_len - ) + f"Expected {col_len} 
fields in line {row_num + 1}, saw " + f"{actual_len}" ) if ( self.delimiter @@ -3332,9 +3294,7 @@ def _isindex(colspec): converter, colspec, data_dict, orig_names ) if new_name in data_dict: - raise ValueError( - "New date column already in dict {name}".format(name=new_name) - ) + raise ValueError(f"New date column already in dict {new_name}") new_data[new_name] = col new_cols.append(new_name) date_cols.update(old_names) @@ -3343,9 +3303,7 @@ def _isindex(colspec): # dict of new name to column list for new_name, colspec in parse_spec.items(): if new_name in data_dict: - raise ValueError( - "Date column {name} already in dict".format(name=new_name) - ) + raise ValueError(f"Date column {new_name} already in dict") _, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names @@ -3389,7 +3347,7 @@ def _clean_na_values(na_values, keep_default_na=True): if na_values is None: if keep_default_na: - na_values = _NA_VALUES + na_values = STR_NA_VALUES else: na_values = set() na_fvalues = set() @@ -3406,7 +3364,7 @@ def _clean_na_values(na_values, keep_default_na=True): v = [v] if keep_default_na: - v = set(v) | _NA_VALUES + v = set(v) | STR_NA_VALUES na_values[k] = v na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} @@ -3415,7 +3373,7 @@ def _clean_na_values(na_values, keep_default_na=True): na_values = [na_values] na_values = _stringify_na_values(na_values) if keep_default_na: - na_values = na_values | _NA_VALUES + na_values = na_values | STR_NA_VALUES na_fvalues = _floatify_na_values(na_values) @@ -3483,7 +3441,7 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): # 2) index_names (column names) # # Both must be non-null to ensure a successful construction. Otherwise, - # we have to create a generic emtpy Index. + # we have to create a generic empty Index. if (index_col is None or index_col is False) or index_names is None: index = Index([]) else: @@ -3524,7 +3482,7 @@ def _stringify_na_values(na_values): # we are like 999 here if v == int(v): v = int(v) - result.append("{value}.0".format(value=v)) + result.append(f"{v}.0") result.append(str(v)) result.append(v) @@ -3566,7 +3524,7 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na): return na_values[col], na_fvalues[col] else: if keep_default_na: - return _NA_VALUES, set() + return STR_NA_VALUES, set() return set(), set() else: @@ -3584,7 +3542,7 @@ def _get_col_names(colspec, columns): return colnames -class FixedWidthReader(BaseIterator): +class FixedWidthReader(abc.Iterator): """ A reader of fixed-width lines. 
""" @@ -3604,7 +3562,7 @@ def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=1 if not isinstance(self.colspecs, (tuple, list)): raise TypeError( "column specifications must be a list or tuple, " - "input was a %r" % type(colspecs).__name__ + f"input was a {type(colspecs).__name__}" ) for colspec in self.colspecs: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index adf0aa9613029..e51f24b551f31 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,16 +1,20 @@ """ pickle compat """ -from io import BytesIO import pickle +from typing import Any, Optional import warnings -from numpy.lib.format import read_array - +from pandas._typing import FilePathOrBuffer from pandas.compat import pickle_compat as pc -from pandas.io.common import _get_handle, _stringify_path +from pandas.io.common import get_filepath_or_buffer, get_handle -def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): +def to_pickle( + obj: Any, + filepath_or_buffer: FilePathOrBuffer, + compression: Optional[str] = "infer", + protocol: int = pickle.HIGHEST_PROTOCOL, +): """ Pickle (serialize) object to file. @@ -18,11 +22,17 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): ---------- obj : any object Any python object. - path : str - File path where the pickled object will be stored. + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be stored. + + .. versionchanged:: 1.0.0 + Accept URL. URL has to be of S3 or GCS. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - A string representing the compression to use in the output file. By - default, infers from the file extension in specified path. + If 'infer' and 'path_or_url' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression) If 'infer' and 'path_or_url' is not path-like, then use + None (= no decompression). protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible @@ -66,8 +76,12 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): >>> import os >>> os.remove("./dummy.pkl") """ - path = _stringify_path(path) - f, fh = _get_handle(path, "wb", compression=compression, is_text=False) + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression, mode="wb" + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: @@ -76,9 +90,16 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): f.close() for _f in fh: _f.close() + if should_close: + try: + fp_or_buf.close() + except ValueError: + pass -def read_pickle(path, compression="infer"): +def read_pickle( + filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer" +): """ Load pickled pandas object (or any object) from file. @@ -89,13 +110,17 @@ def read_pickle(path, compression="infer"): Parameters ---------- - path : str - File path where the pickled object will be loaded. + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be loaded from. + + .. versionchanged:: 1.0.0 + Accept URL. URL is not limited to S3 and GCS. 
+ compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', - or '.zip' respectively, and no decompression otherwise. - Set to None for no decompression. + If 'infer' and 'path_or_url' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression) If 'infer' and 'path_or_url' is not path-like, then use + None (= no decompression). Returns ------- @@ -137,33 +162,37 @@ def read_pickle(path, compression="infer"): >>> import os >>> os.remove("./dummy.pkl") """ - path = _stringify_path(path) - f, fh = _get_handle(path, "rb", compression=compression, is_text=False) + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False) # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes - # 3) try pickle_compat with latin1 encoding + + excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError) try: with warnings.catch_warnings(record=True): # We want to silence any warnings about, e.g. moved modules. warnings.simplefilter("ignore", Warning) return pickle.load(f) - except Exception: - try: - return pc.load(f, encoding=None) - except Exception: - return pc.load(f, encoding="latin1") + except excs_to_catch: + # e.g. + # "No module named 'pandas.core.sparse.series'" + # "Can't get attribute '__nat_unpickle' on bool: """Check if a given group is a metadata group for a given parent_group.""" if group._v_depth <= parent_group._v_depth: return False @@ -465,8 +497,19 @@ class HDFStore: >>> store.close() """ + _handle: Optional["File"] + _mode: str + _complevel: int + _fletcher32: bool + def __init__( - self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs + self, + path, + mode: str = "a", + complevel: Optional[int] = None, + complib=None, + fletcher32: bool = False, + **kwargs, ): if "format" in kwargs: @@ -476,15 +519,13 @@ def __init__( if complib is not None and complib not in tables.filters.all_complibs: raise ValueError( - "complib only supports {libs} compression.".format( - libs=tables.filters.all_complibs - ) + f"complib only supports {tables.filters.all_complibs} compression." 
) if complib is None and complevel is not None: complib = tables.filters.default_complib - self._path = _stringify_path(path) + self._path = stringify_path(path) if mode is None: mode = "a" self._mode = mode @@ -508,28 +549,26 @@ def root(self): def filename(self): return self._path - def __getitem__(self, key): + def __getitem__(self, key: str): return self.get(key) - def __setitem__(self, key, value): + def __setitem__(self, key: str, value): self.put(key, value) - def __delitem__(self, key): + def __delitem__(self, key: str): return self.remove(key) - def __getattr__(self, name): + def __getattr__(self, name: str): """ allow attribute access to get stores """ try: return self.get(name) except (KeyError, ClosedFileError): pass raise AttributeError( - "'{object}' object has no attribute '{name}'".format( - object=type(self).__name__, name=name - ) + f"'{type(self).__name__}' object has no attribute '{name}'" ) - def __contains__(self, key): + def __contains__(self, key: str) -> bool: """ check for existence of this key can match the exact pathname or the pathnm w/o the leading '/' """ @@ -540,13 +579,12 @@ def __contains__(self, key): return True return False - def __len__(self): + def __len__(self) -> int: return len(self.groups()) - def __repr__(self): - return "{type}\nFile path: {path}\n".format( - type=type(self), path=pprint_thing(self._path) - ) + def __repr__(self) -> str: + pstr = pprint_thing(self._path) + return f"{type(self)}\nFile path: {pstr}\n" def __enter__(self): return self @@ -554,7 +592,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self.close() - def keys(self): + def keys(self) -> List[str]: """ Return a list of keys corresponding to objects stored in HDFStore. @@ -577,7 +615,7 @@ def items(self): iteritems = items - def open(self, mode="a", **kwargs): + def open(self, mode: str = "a", **kwargs): """ Open the file in the specified mode @@ -598,8 +636,8 @@ def open(self, mode="a", **kwargs): # this would truncate, raise here if self.is_open: raise PossibleDataLossError( - "Re-opening the file [{0}] with mode [{1}] " - "will delete the current file!".format(self._path, self._mode) + f"Re-opening the file [{self._path}] with mode [{self._mode}] " + "will delete the current file!" ) self._mode = mode @@ -615,40 +653,38 @@ def open(self, mode="a", **kwargs): try: self._handle = tables.open_file(self._path, self._mode, **kwargs) - except (IOError) as e: # pragma: no cover - if "can not be written" in str(e): - print("Opening {path} in read-only mode".format(path=self._path)) + except IOError as err: # pragma: no cover + if "can not be written" in str(err): + print(f"Opening {self._path} in read-only mode") self._handle = tables.open_file(self._path, "r", **kwargs) else: raise - except (ValueError) as e: + except ValueError as err: # trap PyTables >= 3.1 FILE_OPEN_POLICY exception # to provide an updated message - if "FILE_OPEN_POLICY" in str(e): - e = ValueError( - "PyTables [{version}] no longer supports opening multiple " - "files\n" + if "FILE_OPEN_POLICY" in str(err): + hdf_version = tables.get_hdf5_version() + err = ValueError( + f"PyTables [{tables.__version__}] no longer supports " + "opening multiple files\n" "even in read-only mode on this HDF5 version " - "[{hdf_version}]. You can accept this\n" + f"[{hdf_version}]. 
You can accept this\n" "and not open the same file multiple times at once,\n" "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 " "which allows\n" - "files to be opened multiple times at once\n".format( - version=tables.__version__, - hdf_version=tables.get_hdf5_version(), - ) + "files to be opened multiple times at once\n" ) - raise e + raise err - except (Exception) as e: + except Exception as err: # trying to read from a non-existent file causes an error which # is not part of IOError, make it one - if self._mode == "r" and "Unable to open/create file" in str(e): - raise IOError(str(e)) + if self._mode == "r" and "Unable to open/create file" in str(err): + raise IOError(str(err)) raise def close(self): @@ -660,7 +696,7 @@ def close(self): self._handle = None @property - def is_open(self): + def is_open(self) -> bool: """ return a boolean indicating whether the file is open """ @@ -668,7 +704,7 @@ def is_open(self): return False return bool(self._handle.isopen) - def flush(self, fsync=False): + def flush(self, fsync: bool = False): """ Force all buffered modifications to be written to disk. @@ -692,13 +728,13 @@ def flush(self, fsync=False): except OSError: pass - def get(self, key): + def get(self, key: str): """ Retrieve pandas object stored in file. Parameters ---------- - key : object + key : str Returns ------- @@ -707,27 +743,26 @@ def get(self, key): """ group = self.get_node(key) if group is None: - raise KeyError("No object named {key} in the file".format(key=key)) + raise KeyError(f"No object named {key} in the file") return self._read_group(group) def select( self, - key, + key: str, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, - auto_close=False, - **kwargs + auto_close: bool = False, ): """ Retrieve pandas object stored in file, optionally based on where criteria. Parameters ---------- - key : object + key : str Object being retrieved from file. where : list, default None List of Term (or convertible) objects, optional. @@ -751,7 +786,7 @@ def select( """ group = self.get_node(key) if group is None: - raise KeyError("No object named {key} in the file".format(key=key)) + raise KeyError(f"No object named {key} in the file") # create the storer and axes where = _ensure_term(where, scope_level=1) @@ -778,31 +813,47 @@ def func(_start, _stop, _where): return it.get_result() - def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates( + self, + key: str, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): """ return the selection as an Index Parameters ---------- - key : object + key : str where : list of Term (or convertible) objects, optional start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection """ where = _ensure_term(where, scope_level=1) - return self.get_storer(key).read_coordinates( - where=where, start=start, stop=stop, **kwargs - ) + tbl = self.get_storer(key) + if not isinstance(tbl, Table): + raise TypeError("can only read_coordinates with a table") + return tbl.read_coordinates(where=where, start=start, stop=stop) - def select_column(self, key, column, **kwargs): + def select_column( + self, + key: str, + column: str, + start: Optional[int] = None, + stop: Optional[int] = None, + ): """ return a single column from the table. 
This is generally only useful to select an indexable Parameters ---------- - key : object - column: the column of interest + key : str + column : str + The column of interest. + start : int or None, default None + stop : int or None, default None Raises ------ @@ -812,7 +863,10 @@ def select_column(self, key, column, **kwargs): is part of a data block) """ - return self.get_storer(key).read_column(column=column, **kwargs) + tbl = self.get_storer(key) + if not isinstance(tbl, Table): + raise TypeError("can only read_column with a table") + return tbl.read_column(column=column, start=start, stop=stop) def select_as_multiple( self, @@ -824,10 +878,10 @@ def select_as_multiple( stop=None, iterator=False, chunksize=None, - auto_close=False, - **kwargs + auto_close: bool = False, ): - """ Retrieve pandas objects from multiple tables + """ + Retrieve pandas objects from multiple tables. Parameters ---------- @@ -839,6 +893,8 @@ def select_as_multiple( stop : integer (defaults to None), row number to stop selection iterator : boolean, return an iterator, default False chunksize : nrows to include in iteration, return an iterator + auto_close : bool, default False + Should automatically close the store when finished. Raises ------ @@ -860,7 +916,7 @@ def select_as_multiple( stop=stop, iterator=iterator, chunksize=chunksize, - **kwargs + auto_close=auto_close, ) if not isinstance(keys, (list, tuple)): @@ -880,11 +936,11 @@ def select_as_multiple( nrows = None for t, k in itertools.chain([(s, selector)], zip(tbls, keys)): if t is None: - raise KeyError("Invalid table [{key}]".format(key=k)) + raise KeyError(f"Invalid table [{k}]") if not t.is_table: raise TypeError( - "object [{obj}] is not a table, and cannot be used in all " - "select as multiple".format(obj=t.pathname) + f"object [{t.pathname}] is not a table, and cannot be used in all " + "select as multiple" ) if nrows is None: @@ -892,17 +948,19 @@ def select_as_multiple( elif t.nrows != nrows: raise ValueError("all tables must have exactly the same nrows!") + # The isinstance checks here are redundant with the check above, + # but necessary for mypy; see GH#29757 + _tbls = [x for x in tbls if isinstance(x, Table)] + # axis is the concentration axes - axis = list({t.non_index_axes[0][0] for t in tbls})[0] + axis = list({t.non_index_axes[0][0] for t in _tbls})[0] def func(_start, _stop, _where): # retrieve the objs, _where is always passed as a set of # coordinates here objs = [ - t.read( - where=_where, columns=columns, start=_start, stop=_stop, **kwargs - ) + t.read(where=_where, columns=columns, start=_start, stop=_stop) for t in tbls ] @@ -925,13 +983,27 @@ def func(_start, _stop, _where): return it.get_result(coordinates=True) - def put(self, key, value, format=None, append=False, **kwargs): + def put( + self, + key: str, + value: FrameOrSeries, + format=None, + index=True, + append=False, + complib=None, + complevel: Optional[int] = None, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + nan_rep=None, + data_columns: Optional[List[str]] = None, + encoding=None, + errors: str = "strict", + ): """ Store object in HDFStore. Parameters ---------- - key : object + key : str value : {Series, DataFrame} format : 'fixed(f)|table(t)', default is 'fixed' fixed(f) : Fixed format @@ -946,7 +1018,7 @@ def put(self, key, value, format=None, append=False, **kwargs): data_columns : list, default None List of columns to create as data columns, or True to use all columns. See `here - `__. + `__. 
encoding : str, default None Provide an encoding for strings. dropna : bool, default False, do not write an ALL nan row to @@ -954,10 +1026,23 @@ def put(self, key, value, format=None, append=False, **kwargs): """ if format is None: format = get_option("io.hdf.default_format") or "fixed" - kwargs = self._validate_format(format, kwargs) - self._write_to_group(key, value, append=append, **kwargs) + format = self._validate_format(format) + self._write_to_group( + key, + value, + format=format, + index=index, + append=append, + complib=complib, + complevel=complevel, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + data_columns=data_columns, + encoding=encoding, + errors=errors, + ) - def remove(self, key, where=None, start=None, stop=None): + def remove(self, key: str, where=None, start=None, stop=None): """ Remove pandas object partially by specifying the where condition @@ -984,7 +1069,12 @@ def remove(self, key, where=None, start=None, stop=None): except KeyError: # the key is not a valid store, re-raising KeyError raise + except AssertionError: + # surface any assertion errors for e.g. debugging + raise except Exception: + # In tests we get here with ClosedFileError, TypeError, and + # _table_mod.NoSuchNodeError. TODO: Catch only these? if where is not None: raise ValueError( @@ -992,9 +1082,9 @@ def remove(self, key, where=None, start=None, stop=None): ) # we are actually trying to remove a node (with children) - s = self.get_node(key) - if s is not None: - s._f_remove(recursive=True) + node = self.get_node(key) + if node is not None: + node._f_remove(recursive=True) return None # remove the node @@ -1010,7 +1100,24 @@ def remove(self, key, where=None, start=None, stop=None): return s.delete(where=where, start=start, stop=stop) def append( - self, key, value, format=None, append=True, columns=None, dropna=None, **kwargs + self, + key: str, + value: FrameOrSeries, + format=None, + axes=None, + index=True, + append=True, + complib=None, + complevel: Optional[int] = None, + columns=None, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + nan_rep=None, + chunksize=None, + expectedrows=None, + dropna: Optional[bool] = None, + data_columns: Optional[List[str]] = None, + encoding=None, + errors: str = "strict", ): """ Append to Table in file. Node must already exist and be Table @@ -1018,26 +1125,26 @@ def append( Parameters ---------- - key : object + key : str value : {Series, DataFrame} format : 'table' is the default table(t) : table format Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching - / selecting subsets of the data + / selecting subsets of the data. append : bool, default True Append the input data to the existing. data_columns : list of columns, or True, default None List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See `here - `__. + `__. min_itemsize : dict of columns that specify minimum string sizes nan_rep : string to use as string nan representation chunksize : size to chunk the writing expectedrows : expected TOTAL row size of this table encoding : default None, provide an encoding for strings - dropna : bool, default False + dropna : bool, default False Do not write an ALL nan row to the store settable by the option 'io.hdf.dropna_table'. 
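
With `put` and `append` now spelling out their keyword arguments instead of forwarding `**kwargs`, a typical store round trip looks like the following sketch; it assumes PyTables is installed and uses a throwaway file path:

```python
import pandas as pd

df = pd.DataFrame({"A": range(5), "B": list("abcde")})

with pd.HDFStore("/tmp/store.h5", mode="w") as store:
    # keyword arguments that used to hide behind **kwargs are now explicit
    store.put("df_fixed", df, format="fixed")
    store.append("df_table", df, format="table", data_columns=["B"], min_itemsize={"B": 10})

    # data columns created above can be used in where-clauses
    subset = store.select("df_table", where="B == 'c'")
```
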
@@ -1055,11 +1162,35 @@ def append( dropna = get_option("io.hdf.dropna_table") if format is None: format = get_option("io.hdf.default_format") or "table" - kwargs = self._validate_format(format, kwargs) - self._write_to_group(key, value, append=append, dropna=dropna, **kwargs) + format = self._validate_format(format) + self._write_to_group( + key, + value, + format=format, + axes=axes, + index=index, + append=append, + complib=complib, + complevel=complevel, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + chunksize=chunksize, + expectedrows=expectedrows, + dropna=dropna, + data_columns=data_columns, + encoding=encoding, + errors=errors, + ) def append_to_multiple( - self, d, value, selector, data_columns=None, axes=None, dropna=False, **kwargs + self, + d: Dict, + value, + selector, + data_columns=None, + axes=None, + dropna=False, + **kwargs, ): """ Append to multiple tables @@ -1084,9 +1215,8 @@ def append_to_multiple( """ if axes is not None: raise TypeError( - "axes is currently not accepted as a parameter to" - " append_to_multiple; you can create the " - "tables independently instead" + "axes is currently not accepted as a parameter to append_to_multiple; " + "you can create the tables independently instead" ) if not isinstance(d, dict): @@ -1105,7 +1235,7 @@ def append_to_multiple( # figure out how to split the value remain_key = None - remain_values = [] + remain_values: List = [] for k, v in d.items(): if v is None: if remain_key is not None: @@ -1143,16 +1273,35 @@ def append_to_multiple( self.append(k, val, data_columns=dc, **kwargs) - def create_table_index(self, key, **kwargs): - """ Create a pytables index on the table + def create_table_index( + self, + key: str, + columns=None, + optlevel: Optional[int] = None, + kind: Optional[str] = None, + ): + """ + Create a pytables index on the table. + Parameters ---------- - key : object (the node to index) + key : str + columns : None, bool, or listlike[str] + Indicate which columns to create an index on. + + * False : Do not create any indexes. + * True : Create indexes on all columns. + * None : Create indexes on all columns. + * listlike : Create indexes on the given columns. + + optlevel : int or None, default None + Optimization level, if None, pytables defaults to 6. + kind : str or None, default None + Kind of index, if None, pytables defaults to "medium". 
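
`create_table_index` now exposes `columns`, `optlevel`, and `kind` explicitly and raises `TypeError` when called on a Fixed format node. A short sketch under the same assumptions as above (PyTables installed, placeholder path):

```python
import pandas as pd

df = pd.DataFrame({"A": range(5), "B": list("abcde")})

with pd.HDFStore("/tmp/indexed.h5", mode="w") as store:
    # write without indexes, then build one explicitly with the new keyword arguments
    store.append("df_table", df, format="table", data_columns=["B"], index=False)
    store.create_table_index("df_table", columns=["B"], optlevel=9, kind="full")
```
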
Raises ------ - raises if the node is not a table - + TypeError: raises if the node is not a table """ # version requirements @@ -1161,9 +1310,9 @@ def create_table_index(self, key, **kwargs): if s is None: return - if not s.is_table: + if not isinstance(s, Table): raise TypeError("cannot create table index on a Fixed format store") - s.create_index(**kwargs) + s.create_index(columns=columns, optlevel=optlevel, kind=kind) def groups(self): """ @@ -1238,21 +1387,27 @@ def walk(self, where="/"): yield (g._v_pathname.rstrip("/"), groups, leaves) - def get_node(self, key): + def get_node(self, key: str) -> Optional["Node"]: """ return the node with the key or None if it does not exist """ self._check_if_open() + if not key.startswith("/"): + key = "/" + key + + assert self._handle is not None + assert _table_mod is not None # for mypy try: - if not key.startswith("/"): - key = "/" + key - return self._handle.get_node(self.root, key) + node = self._handle.get_node(self.root, key) except _table_mod.exceptions.NoSuchNodeError: return None - def get_storer(self, key): + assert isinstance(node, _table_mod.Node), type(node) + return node + + def get_storer(self, key: str) -> Union["GenericFixed", "Table"]: """ return the storer object for a key, raise if not in the file """ group = self.get_node(key) if group is None: - raise KeyError("No object named {key} in the file".format(key=key)) + raise KeyError(f"No object named {key} in the file") s = self._create_storer(group) s.infer_axes() @@ -1262,27 +1417,28 @@ def copy( self, file, mode="w", - propindexes=True, + propindexes: bool = True, keys=None, complib=None, - complevel=None, - fletcher32=False, + complevel: Optional[int] = None, + fletcher32: bool = False, overwrite=True, ): - """ copy the existing store to a new file, upgrading in place - - Parameters - ---------- - propindexes: restore indexes in copied file (defaults to True) - keys : list of keys to include in the copy (defaults to all) - overwrite : overwrite (remove and replace) existing nodes in the - new store (default is True) - mode, complib, complevel, fletcher32 same as in HDFStore.__init__ + """ + Copy the existing store to a new file, updating in place. - Returns - ------- - open file handle of the new store + Parameters + ---------- + propindexes: bool, default True + Restore indexes in copied file. + keys : list of keys to include in the copy (defaults to all) + overwrite : overwrite (remove and replace) existing nodes in the + new store (default is True) + mode, complib, complevel, fletcher32 same as in HDFStore.__init__ + Returns + ------- + open file handle of the new store """ new_store = HDFStore( file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 @@ -1300,9 +1456,9 @@ def copy( new_store.remove(k) data = self.select(k) - if s.is_table: + if isinstance(s, Table): - index = False + index: Union[bool, List[str]] = False if propindexes: index = [a.name for a in s.axes if a.is_indexed] new_store.append( @@ -1317,7 +1473,7 @@ def copy( return new_store - def info(self): + def info(self) -> str: """ Print detailed information on the store. 
@@ -1327,11 +1483,11 @@ def info(self): ------- str """ - output = "{type}\nFile path: {path}\n".format( - type=type(self), path=pprint_thing(self._path) - ) + path = pprint_thing(self._path) + output = f"{type(self)}\nFile path: {path}\n" + if self.is_open: - lkeys = sorted(list(self.keys())) + lkeys = sorted(self.keys()) if len(lkeys): keys = [] values = [] @@ -1342,13 +1498,13 @@ def info(self): if s is not None: keys.append(pprint_thing(s.pathname or k)) values.append(pprint_thing(s or "invalid_HDFStore node")) + except AssertionError: + # surface any assertion errors for e.g. debugging + raise except Exception as detail: keys.append(k) - values.append( - "[invalid_HDFStore node: {detail}]".format( - detail=pprint_thing(detail) - ) - ) + dstr = pprint_thing(detail) + values.append(f"[invalid_HDFStore node: {dstr}]") output += adjoin(12, keys, values) else: @@ -1358,38 +1514,44 @@ def info(self): return output - # private methods ###### + # ------------------------------------------------------------------------ + # private methods + def _check_if_open(self): if not self.is_open: - raise ClosedFileError("{0} file is not open!".format(self._path)) + raise ClosedFileError(f"{self._path} file is not open!") - def _validate_format(self, format, kwargs): - """ validate / deprecate formats; return the new kwargs """ - kwargs = kwargs.copy() + def _validate_format(self, format: str) -> str: + """ validate / deprecate formats """ # validate try: - kwargs["format"] = _FORMAT_MAP[format.lower()] + format = _FORMAT_MAP[format.lower()] except KeyError: - raise TypeError("invalid HDFStore format specified [{0}]".format(format)) + raise TypeError(f"invalid HDFStore format specified [{format}]") - return kwargs + return format - def _create_storer(self, group, format=None, value=None, append=False, **kwargs): + def _create_storer( + self, + group, + format=None, + value: Optional[FrameOrSeries] = None, + encoding: str = "UTF-8", + errors: str = "strict", + ) -> Union["GenericFixed", "Table"]: """ return a suitable class to operate """ + cls: Union[Type["GenericFixed"], Type["Table"]] + + if value is not None and not isinstance(value, (Series, DataFrame)): + raise TypeError("value must be None, Series, or DataFrame") + def error(t): - raise TypeError( - "cannot properly create the storer for: [{t}] [group->" - "{group},value->{value},format->{format},append->{append}," - "kwargs->{kwargs}]".format( - t=t, - group=group, - value=type(value), - format=format, - append=append, - kwargs=kwargs, - ) + # return instead of raising so mypy can tell where we are raising + return TypeError( + f"cannot properly create the storer for: [{t}] [group->" + f"{group},value->{type(value)},format->{format}" ) pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) @@ -1400,6 +1562,7 @@ def error(t): if value is None: _tables() + assert _table_mod is not None # for mypy if getattr(group, "table", None) or isinstance( group, _table_mod.table.Table ): @@ -1411,11 +1574,8 @@ def error(t): "nor a value are passed" ) else: - - try: - pt = _TYPE_MAP[type(value)] - except KeyError: - error("_TYPE_MAP") + _TYPE_MAP = {Series: "series", DataFrame: "frame"} + pt = _TYPE_MAP[type(value)] # we are actually a table if format == "table": @@ -1423,10 +1583,12 @@ def error(t): # a storer node if "table" not in pt: + _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed} try: - return globals()[_STORER_MAP[pt]](self, group, **kwargs) + cls = _STORER_MAP[pt] except KeyError: - error("_STORER_MAP") + raise 
error("_STORER_MAP") + return cls(self, group, encoding=encoding, errors=errors) # existing node (and must be a table) if tt is None: @@ -1448,46 +1610,54 @@ def error(t): tt = "appendable_frame" elif index.nlevels > 1: tt = "appendable_multiframe" - elif pt == "wide_table": - tt = "appendable_panel" - elif pt == "ndim_table": - tt = "appendable_ndim" - - else: - - # distinguish between a frame/table - tt = "legacy_panel" - try: - fields = group.table._v_attrs.fields - if len(fields) == 1 and fields[0] == "value": - tt = "legacy_frame" - except IndexError: - pass + _TABLE_MAP = { + "generic_table": GenericTable, + "appendable_series": AppendableSeriesTable, + "appendable_multiseries": AppendableMultiSeriesTable, + "appendable_frame": AppendableFrameTable, + "appendable_multiframe": AppendableMultiFrameTable, + "worm": WORMTable, + } try: - return globals()[_TABLE_MAP[tt]](self, group, **kwargs) + cls = _TABLE_MAP[tt] except KeyError: - error("_TABLE_MAP") + raise error("_TABLE_MAP") + + return cls(self, group, encoding=encoding, errors=errors) def _write_to_group( self, - key, - value, + key: str, + value: FrameOrSeries, format, + axes=None, index=True, append=False, complib=None, + complevel: Optional[int] = None, + fletcher32=None, + min_itemsize: Optional[Union[int, Dict[str, int]]] = None, + chunksize=None, + expectedrows=None, + dropna=False, + nan_rep=None, + data_columns=None, encoding=None, - **kwargs + errors: str = "strict", ): group = self.get_node(key) + # we make this assertion for mypy; the get_node call will already + # have raised if this is incorrect + assert self._handle is not None + # remove the node if we are not appending if group is not None and not append: self._handle.remove_node(group, recursive=True) group = None - # we don't want to store a table node at all if are object is 0-len + # we don't want to store a table node at all if our object is 0-len # as there are not dtypes if getattr(value, "empty", None) and (format == "table" or append): return @@ -1509,9 +1679,7 @@ def _write_to_group( group = self._handle.create_group(path, p) path = new_path - s = self._create_storer( - group, format, value, append=append, encoding=encoding, **kwargs - ) + s = self._create_storer(group, format, value, encoding=encoding, errors=errors) if append: # raise if we are trying to append to a Fixed format, # or a table that exists (and we are putting) @@ -1526,49 +1694,66 @@ def _write_to_group( raise ValueError("Compression not supported on Fixed format stores") # write the object - s.write(obj=value, append=append, complib=complib, **kwargs) + s.write( + obj=value, + axes=axes, + append=append, + complib=complib, + complevel=complevel, + fletcher32=fletcher32, + min_itemsize=min_itemsize, + chunksize=chunksize, + expectedrows=expectedrows, + dropna=dropna, + nan_rep=nan_rep, + data_columns=data_columns, + ) - if s.is_table and index: + if isinstance(s, Table) and index: s.create_index(columns=index) - def _read_group(self, group, **kwargs): + def _read_group(self, group: "Node"): s = self._create_storer(group) s.infer_axes() - return s.read(**kwargs) + return s.read() class TableIterator: - """ define the iteration interface on a table + """ + Define the iteration interface on a table - Parameters - ---------- + Parameters + ---------- + store : HDFStore + s : the referred storer + func : the function to execute the query + where : the where of the query + nrows : the rows to iterate on + start : the passed start value (default is None) + stop : the passed stop value (default 
is None) + iterator : bool, default False + Whether to use the default iterator. + chunksize : the passed chunking value (default is 100000) + auto_close : bool, default False + Whether to automatically close the store at the end of iteration. + """ - store : the reference store - s : the referred storer - func : the function to execute the query - where : the where of the query - nrows : the rows to iterate on - start : the passed start value (default is None) - stop : the passed stop value (default is None) - iterator : boolean, whether to use the default iterator - chunksize : the passed chunking value (default is 50000) - auto_close : boolean, automatically close the store at the end of - iteration, default is False - kwargs : the passed kwargs - """ + chunksize: Optional[int] + store: HDFStore + s: Union["GenericFixed", "Table"] def __init__( self, - store, - s, + store: HDFStore, + s: Union["GenericFixed", "Table"], func, where, nrows, start=None, stop=None, - iterator=False, - chunksize=None, - auto_close=False, + iterator: bool = False, + chunksize: Optional[int] = None, + auto_close: bool = False, ): self.store = store self.s = s @@ -1619,11 +1804,11 @@ def close(self): if self.auto_close: self.store.close() - def get_result(self, coordinates=False): + def get_result(self, coordinates: bool = False): # return the actual iterator if self.chunksize is not None: - if not self.s.is_table: + if not isinstance(self.s, Table): raise TypeError("can only use an iterator or chunksize on a table") self.coordinates = self.s.read_coordinates(where=self.where) @@ -1632,6 +1817,8 @@ def get_result(self, coordinates=False): # if specified read via coordinates (necessary for multiple selections if coordinates: + if not isinstance(self.s, Table): + raise TypeError("can only read_coordinates on a table") where = self.s.read_coordinates( where=self.where, start=self.start, stop=self.stop ) @@ -1662,144 +1849,134 @@ class IndexCol: is_data_indexable = True _info_fields = ["freq", "tz", "index_name"] + name: str + cname: str + def __init__( self, + name: str, values=None, kind=None, typ=None, - cname=None, - itemsize=None, - name=None, + cname: Optional[str] = None, axis=None, - kind_attr=None, pos=None, freq=None, tz=None, index_name=None, - **kwargs + ordered=None, + table=None, + meta=None, + metadata=None, ): + + if not isinstance(name, str): + raise ValueError("`name` must be a str.") + self.values = values self.kind = kind self.typ = typ - self.itemsize = itemsize self.name = name - self.cname = cname - self.kind_attr = kind_attr + self.cname = cname or name self.axis = axis self.pos = pos self.freq = freq self.tz = tz self.index_name = index_name - self.table = None - self.meta = None - self.metadata = None + self.ordered = ordered + self.table = table + self.meta = meta + self.metadata = metadata - if name is not None: - self.set_name(name, kind_attr) if pos is not None: self.set_pos(pos) - def set_name(self, name, kind_attr=None): - """ set the name of this indexer """ - self.name = name - self.kind_attr = kind_attr or "{name}_kind".format(name=name) - if self.cname is None: - self.cname = name - - return self + # These are ensured as long as the passed arguments match the + # constructor annotations. 
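
Illustrative aside (not part of the patch): `TableIterator` is what backs chunked reads, and, as the hunk above enforces, `chunksize`/`iterator` only make sense for Table storers. A short sketch of the public entry point; the file name, key, and numbers are illustrative.

import pandas as pd

df = pd.DataFrame({"a": range(100)})
with pd.HDFStore("demo_store.h5", mode="w") as store:
    store.put("df", df, format="table")   # chunked reads require table format
    # select(..., chunksize=...) returns a TableIterator; each step yields a DataFrame
    for chunk in store.select("df", chunksize=25):
        print(len(chunk))                 # 25, four times
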
+ assert isinstance(self.name, str) + assert isinstance(self.cname, str) - def set_axis(self, axis): - """ set the axis over which I index """ - self.axis = axis + @property + def itemsize(self) -> int: + # Assumes self.typ has already been initialized + return self.typ.itemsize - return self + @property + def kind_attr(self) -> str: + return f"{self.name}_kind" - def set_pos(self, pos): + def set_pos(self, pos: int): """ set the position of this column in the Table """ self.pos = pos if pos is not None and self.typ is not None: self.typ._v_pos = pos - return self - - def set_table(self, table): - self.table = table - return self - def __repr__(self): + def __repr__(self) -> str: temp = tuple( map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)) ) return ",".join( ( - "{key}->{value}".format(key=key, value=value) + f"{key}->{value}" for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) ) ) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: """ compare 2 col items """ return all( getattr(self, a, None) == getattr(other, a, None) for a in ["name", "cname", "axis", "pos"] ) - def __ne__(self, other): + def __ne__(self, other) -> bool: return not self.__eq__(other) @property - def is_indexed(self): + def is_indexed(self) -> bool: """ return whether I am an indexed column """ - try: - return getattr(self.table.cols, self.cname).is_indexed - except AttributeError: - False - - def copy(self): - new_self = copy.copy(self) - return new_self - - def infer(self, handler): - """infer this column from the table: create and return a new object""" - table = handler.table - new_self = self.copy() - new_self.set_table(table) - new_self.get_attr() - new_self.read_metadata(handler) - return new_self + if not hasattr(self.table, "cols"): + # e.g. if infer hasn't been called yet, self.table will be None. + return False + # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute + # 'error: "None" has no attribute "cols"' + return getattr(self.table.cols, self.cname).is_indexed # type: ignore - def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): - """ set the values from this selection: take = take ownership """ + def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): + """ + Convert the data from this selection to the appropriate pandas type. 
+ """ + assert isinstance(values, np.ndarray), type(values) # values is a recarray if values.dtype.fields is not None: values = values[self.cname] - values = _maybe_convert(values, self.kind, encoding, errors) + val_kind = _ensure_decoded(self.kind) + values = _maybe_convert(values, val_kind, encoding, errors) kwargs = dict() + kwargs["name"] = _ensure_decoded(self.index_name) + if self.freq is not None: kwargs["freq"] = _ensure_decoded(self.freq) - if self.index_name is not None: - kwargs["name"] = _ensure_decoded(self.index_name) + # making an Index instance could throw a number of different errors try: - self.values = Index(values, **kwargs) - except Exception: - + new_pd_index = Index(values, **kwargs) + except ValueError: # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') if "freq" in kwargs: kwargs["freq"] = None - self.values = Index(values, **kwargs) - - self.values = _set_tz(self.values, self.tz) + new_pd_index = Index(values, **kwargs) - return self + new_pd_index = _set_tz(new_pd_index, self.tz) + return new_pd_index, new_pd_index def take_data(self): - """ return the values & release the memory """ - self.values, values = None, self.values - return values + """ return the values""" + return self.values @property def attrs(self): @@ -1834,14 +2011,11 @@ def maybe_set_size(self, min_itemsize=None): if min_itemsize is not None and self.typ.itemsize < min_itemsize: self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos) - def validate(self, handler, append): - self.validate_names() - def validate_names(self): pass - def validate_and_set(self, handler, append): - self.set_table(handler.table) + def validate_and_set(self, handler: "AppendableTable", append: bool): + self.table = handler.table self.validate_col() self.validate_attr(append) self.validate_metadata(handler) @@ -1859,25 +2033,22 @@ def validate_col(self, itemsize=None): itemsize = self.itemsize if c.itemsize < itemsize: raise ValueError( - "Trying to store a string with len [{itemsize}] in " - "[{cname}] column but\nthis column has a limit of " - "[{c_itemsize}]!\nConsider using min_itemsize to " - "preset the sizes on these columns".format( - itemsize=itemsize, cname=self.cname, c_itemsize=c.itemsize - ) + f"Trying to store a string with len [{itemsize}] in " + f"[{self.cname}] column but\nthis column has a limit of " + f"[{c.itemsize}]!\nConsider using min_itemsize to " + "preset the sizes on these columns" ) return c.itemsize return None - def validate_attr(self, append): + def validate_attr(self, append: bool): # check for backwards incompatibility if append: existing_kind = getattr(self.attrs, self.kind_attr, None) if existing_kind is not None and existing_kind != self.kind: raise TypeError( - "incompatible kind in col [{existing} - " - "{self_kind}]".format(existing=existing_kind, self_kind=self.kind) + f"incompatible kind in col [{existing_kind} - {self.kind}]" ) def update_info(self, info): @@ -1887,7 +2058,7 @@ def update_info(self, info): for key in self._info_fields: value = getattr(self, key, None) - idx = _get_info(info, self.name) + idx = info.setdefault(self.name, {}) existing_value = idx.get(key) if key in idx and value is not None and existing_value != value: @@ -1903,40 +2074,25 @@ def update_info(self, info): else: raise ValueError( - "invalid info for [{name}] for [{key}], " - "existing_value [{existing_value}] conflicts with " - "new value [{value}]".format( - name=self.name, - key=key, - existing_value=existing_value, - 
value=value, - ) + f"invalid info for [{self.name}] for [{key}], " + f"existing_value [{existing_value}] conflicts with " + f"new value [{value}]" ) else: if value is not None or existing_value is not None: idx[key] = value - return self - def set_info(self, info): """ set my state from the passed info """ idx = info.get(self.name) if idx is not None: self.__dict__.update(idx) - def get_attr(self): - """ set the kind for this column """ - self.kind = getattr(self.attrs, self.kind_attr, None) - def set_attr(self): """ set the kind for this column """ setattr(self.attrs, self.kind_attr, self.kind) - def read_metadata(self, handler): - """ retrieve the metadata for this columns """ - self.metadata = handler.read_metadata(self.cname) - - def validate_metadata(self, handler): + def validate_metadata(self, handler: "AppendableTable"): """ validate that kind=category does not change the categories """ if self.meta == "category": new_metadata = self.metadata @@ -1951,7 +2107,7 @@ def validate_metadata(self, handler): "different categories to the existing" ) - def write_metadata(self, handler): + def write_metadata(self, handler: "AppendableTable"): """ set the meta data """ if self.metadata is not None: handler.write_metadata(self.cname, self.metadata) @@ -1961,34 +2117,24 @@ class GenericIndexCol(IndexCol): """ an index which is not represented in the data of the table """ @property - def is_indexed(self): + def is_indexed(self) -> bool: return False - def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): - """ set the values from this selection: take = take ownership + def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): + """ + Convert the data from this selection to the appropriate pandas type. Parameters ---------- - values : np.ndarray nan_rep : str encoding : str errors : str - start : int, optional - Table row number: the start of the sub-selection. - stop : int, optional - Table row number: the end of the sub-selection. Values larger than - the underlying table's row count are normalized to that. 
""" + assert isinstance(values, np.ndarray), type(values) - start = start if start is not None else 0 - stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows - self.values = Int64Index(np.arange(stop - start)) - - return self - - def get_attr(self): - pass + values = Int64Index(np.arange(len(values))) + return values, values def set_attr(self): pass @@ -2011,48 +2157,47 @@ class DataCol(IndexCol): is_data_indexable = False _info_fields = ["tz", "ordered"] - @classmethod - def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs): - """ return a new datacol with the block i """ - - if cname is None: - cname = name or "values_block_{idx}".format(idx=i) - if name is None: - name = cname - - # prior to 0.10.1, we named values blocks like: values_block_0 an the - # name values_0 - try: - if version[0] == 0 and version[1] <= 10 and version[2] == 0: - m = re.search(r"values_block_(\d+)", name) - if m: - name = "values_{group}".format(group=m.groups()[0]) - except IndexError: - pass - - return cls(name=name, cname=cname, **kwargs) - def __init__( self, + name: str, values=None, kind=None, typ=None, cname=None, - data=None, + pos=None, + tz=None, + ordered=None, + table=None, meta=None, metadata=None, - block=None, - **kwargs + dtype=None, + data=None, ): - super().__init__(values=values, kind=kind, typ=typ, cname=cname, **kwargs) - self.dtype = None - self.dtype_attr = "{name}_dtype".format(name=self.name) - self.meta = meta - self.meta_attr = "{name}_meta".format(name=self.name) - self.set_data(data) - self.set_metadata(metadata) + super().__init__( + name=name, + values=values, + kind=kind, + typ=typ, + pos=pos, + cname=cname, + tz=tz, + ordered=ordered, + table=table, + meta=meta, + metadata=metadata, + ) + self.dtype = dtype + self.data = data + + @property + def dtype_attr(self) -> str: + return f"{self.name}_dtype" + + @property + def meta_attr(self) -> str: + return f"{self.name}_meta" - def __repr__(self): + def __repr__(self) -> str: temp = tuple( map( pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape) @@ -2060,270 +2205,94 @@ def __repr__(self): ) return ",".join( ( - "{key}->{value}".format(key=key, value=value) + f"{key}->{value}" for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) ) ) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: """ compare 2 col items """ return all( getattr(self, a, None) == getattr(other, a, None) for a in ["name", "cname", "dtype", "pos"] ) - def set_data(self, data, dtype=None): + def set_data(self, data: Union[np.ndarray, ABCExtensionArray]): + assert data is not None + assert self.dtype is None + + data, dtype_name = _get_data_and_dtype_name(data) + self.data = data - if data is not None: - if dtype is not None: - self.dtype = dtype - self.set_kind() - elif self.dtype is None: - self.dtype = data.dtype.name - self.set_kind() + self.dtype = dtype_name + self.kind = _dtype_to_kind(dtype_name) def take_data(self): - """ return the data & release the memory """ - self.data, data = None, self.data - return data - - def set_metadata(self, metadata): - """ record the metadata """ - if metadata is not None: - metadata = np.array(metadata, copy=False).ravel() - self.metadata = metadata - - def set_kind(self): - # set my kind if we can - - if self.dtype is not None: - dtype = _ensure_decoded(self.dtype) - - if dtype.startswith("string") or dtype.startswith("bytes"): - self.kind = "string" - elif dtype.startswith("float"): - self.kind = "float" - elif 
dtype.startswith("complex"): - self.kind = "complex" - elif dtype.startswith("int") or dtype.startswith("uint"): - self.kind = "integer" - elif dtype.startswith("date"): - self.kind = "datetime" - elif dtype.startswith("timedelta"): - self.kind = "timedelta" - elif dtype.startswith("bool"): - self.kind = "bool" - else: - raise AssertionError( - "cannot interpret dtype of [{dtype}] in [{obj}]".format( - dtype=dtype, obj=self - ) - ) + """ return the data """ + return self.data - # set my typ if we need - if self.typ is None: - self.typ = getattr(self.description, self.cname, None) + @classmethod + def _get_atom(cls, values: Union[np.ndarray, ABCExtensionArray]) -> "Col": + """ + Get an appropriately typed and shaped pytables.Col object for values. + """ - def set_atom( - self, - block, - block_items, - existing_col, - min_itemsize, - nan_rep, - info, - encoding=None, - errors="strict", - ): - """ create and setup my atom from the block b """ - - self.values = list(block_items) - - # short-cut certain block types - if block.is_categorical: - return self.set_atom_categorical(block, items=block_items, info=info) - elif block.is_datetimetz: - return self.set_atom_datetime64tz(block, info=info) - elif block.is_datetime: - return self.set_atom_datetime64(block) - elif block.is_timedelta: - return self.set_atom_timedelta64(block) - elif block.is_complex: - return self.set_atom_complex(block) - - dtype = block.dtype.name - inferred_type = lib.infer_dtype(block.values, skipna=False) - - if inferred_type == "date": - raise TypeError("[date] is not implemented as a table column") - elif inferred_type == "datetime": - # after 8260 - # this only would be hit for a mutli-timezone dtype - # which is an error + dtype = values.dtype + itemsize = dtype.itemsize - raise TypeError( - "too many timezones in this block, create separate data columns" - ) - elif inferred_type == "unicode": - raise TypeError("[unicode] is not implemented as a table column") - - # this is basically a catchall; if say a datetime64 has nans then will - # end up here ### - elif inferred_type == "string" or dtype == "object": - self.set_atom_string( - block, - block_items, - existing_col, - min_itemsize, - nan_rep, - encoding, - errors, - ) + shape = values.shape + if values.ndim == 1: + # EA, use block shape pretending it is 2D + shape = (1, values.size) - # set as a data block - else: - self.set_atom_data(block) + if is_categorical_dtype(dtype): + codes = values.codes + atom = cls.get_atom_data(shape, kind=codes.dtype.name) + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): + atom = cls.get_atom_datetime64(shape) + elif is_timedelta64_dtype(dtype): + atom = cls.get_atom_timedelta64(shape) + elif is_complex_dtype(dtype): + atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0]) - def get_atom_string(self, block, itemsize): - return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) + elif is_string_dtype(dtype): + atom = cls.get_atom_string(shape, itemsize) - def set_atom_string( - self, block, block_items, existing_col, min_itemsize, nan_rep, encoding, errors - ): - # fill nan items with myself, don't disturb the blocks by - # trying to downcast - block = block.fillna(nan_rep, downcast=False) - if isinstance(block, list): - block = block[0] - data = block.values - - # see if we have a valid string type - inferred_type = lib.infer_dtype(data.ravel(), skipna=False) - if inferred_type != "string": - - # we cannot serialize this data, so report an exception on a column - # by column basis - for i, item in 
enumerate(block_items): - - col = block.iget(i) - inferred_type = lib.infer_dtype(col.ravel(), skipna=False) - if inferred_type != "string": - raise TypeError( - "Cannot serialize the column [{item}] because\n" - "its data contents are [{type}] object dtype".format( - item=item, type=inferred_type - ) - ) + else: + atom = cls.get_atom_data(shape, kind=dtype.name) - # itemsize is the maximum length of a string (along any dimension) - data_converted = _convert_string_array(data, encoding, errors) - itemsize = data_converted.itemsize + return atom - # specified min_itemsize? - if isinstance(min_itemsize, dict): - min_itemsize = int( - min_itemsize.get(self.name) or min_itemsize.get("values") or 0 - ) - itemsize = max(min_itemsize or 0, itemsize) - - # check for column in the values conflicts - if existing_col is not None: - eci = existing_col.validate_col(itemsize) - if eci > itemsize: - itemsize = eci - - self.itemsize = itemsize - self.kind = "string" - self.typ = self.get_atom_string(block, itemsize) - self.set_data( - data_converted.astype("|S{size}".format(size=itemsize), copy=False) - ) + @classmethod + def get_atom_string(cls, shape, itemsize): + return _tables().StringCol(itemsize=itemsize, shape=shape[0]) - def get_atom_coltype(self, kind=None): + @classmethod + def get_atom_coltype(cls, kind: str) -> Type["Col"]: """ return the PyTables column class for this column """ - if kind is None: - kind = self.kind - if self.kind.startswith("uint"): - col_name = "UInt{name}Col".format(name=kind[4:]) + if kind.startswith("uint"): + k4 = kind[4:] + col_name = f"UInt{k4}Col" + elif kind.startswith("period"): + # we store as integer + col_name = "Int64Col" else: - col_name = "{name}Col".format(name=kind.capitalize()) + kcap = kind.capitalize() + col_name = f"{kcap}Col" return getattr(_tables(), col_name) - def get_atom_data(self, block, kind=None): - return self.get_atom_coltype(kind=kind)(shape=block.shape[0]) - - def set_atom_complex(self, block): - self.kind = block.dtype.name - itemsize = int(self.kind.split("complex")[-1]) // 8 - self.typ = _tables().ComplexCol(itemsize=itemsize, shape=block.shape[0]) - self.set_data(block.values.astype(self.typ.type, copy=False)) - - def set_atom_data(self, block): - self.kind = block.dtype.name - self.typ = self.get_atom_data(block) - self.set_data(block.values.astype(self.typ.type, copy=False)) - - def set_atom_categorical(self, block, items, info=None, values=None): - # currently only supports a 1-D categorical - # in a 1-D block - - values = block.values - codes = values.codes - self.kind = "integer" - self.dtype = codes.dtype.name - if values.ndim > 1: - raise NotImplementedError("only support 1-d categoricals") - if len(items) > 1: - raise NotImplementedError("only support single block categoricals") - - # write the codes; must be in a block shape - self.ordered = values.ordered - self.typ = self.get_atom_data(block, kind=codes.dtype.name) - self.set_data(_block_shape(codes)) - - # write the categories - self.meta = "category" - self.set_metadata(block.values.categories) - - # update the info - self.update_info(info) - - def get_atom_datetime64(self, block): - return _tables().Int64Col(shape=block.shape[0]) - - def set_atom_datetime64(self, block, values=None): - self.kind = "datetime64" - self.typ = self.get_atom_datetime64(block) - if values is None: - values = block.values.view("i8") - self.set_data(values, "datetime64") - - def set_atom_datetime64tz(self, block, info, values=None): - - if values is None: - values = block.values - - # convert this 
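
Aside, not part of the patch: string columns are stored as fixed-width `StringCol` atoms, sized from the longest value unless `min_itemsize` pre-sets a width, which is exactly what the error message above points the user at. A sketch; the file name, key, and widths are illustrative.

import pandas as pd

df = pd.DataFrame({"s": ["short"]})
with pd.HDFStore("demo_store.h5", mode="w") as store:
    # reserve 30 bytes for the string value block so longer rows can be appended later
    store.append("strings", df, min_itemsize={"values": 30})
    store.append("strings", pd.DataFrame({"s": ["a much longer string value"]}))
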
column to i8 in UTC, and save the tz - values = values.asi8.reshape(block.shape) - - # store a converted timezone - self.tz = _get_tz(block.values.tz) - self.update_info(info) - - self.kind = "datetime64" - self.typ = self.get_atom_datetime64(block) - self.set_data(values, "datetime64") - - def get_atom_timedelta64(self, block): - return _tables().Int64Col(shape=block.shape[0]) - - def set_atom_timedelta64(self, block, values=None): - self.kind = "timedelta64" - self.typ = self.get_atom_timedelta64(block) - if values is None: - values = block.values.view("i8") - self.set_data(values, "timedelta64") + @classmethod + def get_atom_data(cls, shape, kind: str) -> "Col": + return cls.get_atom_coltype(kind=kind)(shape=shape[0]) + + @classmethod + def get_atom_datetime64(cls, shape): + return _tables().Int64Col(shape=shape[0]) + + @classmethod + def get_atom_timedelta64(cls, shape): + return _tables().Int64Col(shape=shape[0]) @property def shape(self): @@ -2348,99 +2317,115 @@ def validate_attr(self, append): "items dtype in table!" ) - def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): - """set the data from this selection (and convert to the correct dtype - if we can) + def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): + """ + Convert the data from this selection to the appropriate pandas type. + + Parameters + ---------- + values : np.ndarray + nan_rep : + encoding : str + errors : str + + Returns + ------- + index : listlike to become an Index + data : ndarraylike to become a column """ + assert isinstance(values, np.ndarray), type(values) # values is a recarray if values.dtype.fields is not None: values = values[self.cname] - self.set_data(values) + assert self.typ is not None + if self.dtype is None: + # Note: in tests we never have timedelta64 or datetime64, + # so the _get_data_and_dtype_name may be unnecessary + converted, dtype_name = _get_data_and_dtype_name(values) + kind = _dtype_to_kind(dtype_name) + else: + converted = values + dtype_name = self.dtype + kind = self.kind + + assert isinstance(converted, np.ndarray) # for mypy # use the meta if needed meta = _ensure_decoded(self.meta) + metadata = self.metadata + ordered = self.ordered + tz = self.tz + assert dtype_name is not None # convert to the correct dtype - if self.dtype is not None: - dtype = _ensure_decoded(self.dtype) + dtype = _ensure_decoded(dtype_name) - # reverse converts - if dtype == "datetime64": + # reverse converts + if dtype == "datetime64": - # recreate with tz if indicated - self.data = _set_tz(self.data, self.tz, coerce=True) + # recreate with tz if indicated + converted = _set_tz(converted, tz, coerce=True) - elif dtype == "timedelta64": - self.data = np.asarray(self.data, dtype="m8[ns]") - elif dtype == "date": - try: - self.data = np.asarray( - [date.fromordinal(v) for v in self.data], dtype=object - ) - except ValueError: - self.data = np.asarray( - [date.fromtimestamp(v) for v in self.data], dtype=object - ) - elif dtype == "datetime": - self.data = np.asarray( - [datetime.fromtimestamp(v) for v in self.data], dtype=object + elif dtype == "timedelta64": + converted = np.asarray(converted, dtype="m8[ns]") + elif dtype == "date": + try: + converted = np.asarray( + [date.fromordinal(v) for v in converted], dtype=object ) - - elif meta == "category": - - # we have a categorical - categories = self.metadata - codes = self.data.ravel() - - # if we have stored a NaN in the categories - # then strip it; in theory we could have BOTH - # -1s in the codes and nulls 
:< - if categories is None: - # Handle case of NaN-only categorical columns in which case - # the categories are an empty array; when this is stored, - # pytables cannot write a zero-len array, so on readback - # the categories would be None and `read_hdf()` would fail. - categories = Index([], dtype=np.float64) - else: - mask = isna(categories) - if mask.any(): - categories = categories[~mask] - codes[codes != -1] -= mask.astype(int).cumsum().values - - self.data = Categorical.from_codes( - codes, categories=categories, ordered=self.ordered + except ValueError: + converted = np.asarray( + [date.fromtimestamp(v) for v in converted], dtype=object ) + elif meta == "category": + + # we have a categorical + categories = metadata + codes = converted.ravel() + + # if we have stored a NaN in the categories + # then strip it; in theory we could have BOTH + # -1s in the codes and nulls :< + if categories is None: + # Handle case of NaN-only categorical columns in which case + # the categories are an empty array; when this is stored, + # pytables cannot write a zero-len array, so on readback + # the categories would be None and `read_hdf()` would fail. + categories = Index([], dtype=np.float64) else: + mask = isna(categories) + if mask.any(): + categories = categories[~mask] + codes[codes != -1] -= mask.astype(int).cumsum().values - try: - self.data = self.data.astype(dtype, copy=False) - except TypeError: - self.data = self.data.astype("O", copy=False) - - # convert nans / decode - if _ensure_decoded(self.kind) == "string": - self.data = _unconvert_string_array( - self.data, nan_rep=nan_rep, encoding=encoding, errors=errors + converted = Categorical.from_codes( + codes, categories=categories, ordered=ordered ) - return self + else: - def get_attr(self): - """ get the data for this column """ - self.values = getattr(self.attrs, self.kind_attr, None) - self.dtype = getattr(self.attrs, self.dtype_attr, None) - self.meta = getattr(self.attrs, self.meta_attr, None) - self.set_kind() + try: + converted = converted.astype(dtype, copy=False) + except TypeError: + converted = converted.astype("O", copy=False) + + # convert nans / decode + if _ensure_decoded(kind) == "string": + converted = _unconvert_string_array( + converted, nan_rep=nan_rep, encoding=encoding, errors=errors + ) + + return self.values, converted def set_attr(self): """ set the data for this column """ setattr(self.attrs, self.kind_attr, self.values) setattr(self.attrs, self.meta_attr, self.meta) - if self.dtype is not None: - setattr(self.attrs, self.dtype_attr, self.dtype) + assert self.dtype is not None + setattr(self.attrs, self.dtype_attr, self.dtype) class DataIndexableCol(DataCol): @@ -2450,26 +2435,30 @@ class DataIndexableCol(DataCol): def validate_names(self): if not Index(self.values).is_object(): + # TODO: should the message here be more specifically non-str? 
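
For context (illustrative aside, not part of the diff): categorical columns are written as integer codes plus a categories metadata array and are rebuilt with `Categorical.from_codes` on read, as the hunk above shows. The user-visible round trip looks like the sketch below; file name and key are illustrative.

import pandas as pd

df = pd.DataFrame({"c": pd.Categorical(["a", "b", "a"], ordered=True)})
with pd.HDFStore("demo_store.h5", mode="w") as store:
    store.append("cat_frame", df)
    out = store.select("cat_frame")
    print(out["c"].dtype)           # category
    print(out["c"].cat.ordered)     # True: orderedness is preserved
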
raise ValueError("cannot have non-object label DataIndexableCol") - def get_atom_string(self, block, itemsize): + @classmethod + def get_atom_string(cls, shape, itemsize): return _tables().StringCol(itemsize=itemsize) - def get_atom_data(self, block, kind=None): - return self.get_atom_coltype(kind=kind)() + @classmethod + def get_atom_data(cls, shape, kind: str) -> "Col": + return cls.get_atom_coltype(kind=kind)() - def get_atom_datetime64(self, block): + @classmethod + def get_atom_datetime64(cls, shape): return _tables().Int64Col() - def get_atom_timedelta64(self, block): + @classmethod + def get_atom_timedelta64(cls, shape): return _tables().Int64Col() class GenericDataIndexableCol(DataIndexableCol): """ represent a generic pytables data column """ - def get_attr(self): - pass + pass class Fixed: @@ -2479,71 +2468,75 @@ class Fixed: Parameters ---------- - - parent : my parent HDFStore - group : the group node where the table resides + parent : HDFStore + group : Node + The group node where the table resides. """ - pandas_kind = None # type: str - obj_type = None # type: Type[Union[DataFrame, Series]] - ndim = None # type: int + pandas_kind: str + obj_type: Type[Union[DataFrame, Series]] + ndim: int + encoding: str + parent: HDFStore + group: "Node" + errors: str is_table = False - def __init__(self, parent, group, encoding=None, errors="strict", **kwargs): + def __init__( + self, + parent: HDFStore, + group: "Node", + encoding: str = "UTF-8", + errors: str = "strict", + ): + assert isinstance(parent, HDFStore), type(parent) + assert _table_mod is not None # needed for mypy + assert isinstance(group, _table_mod.Node), type(group) self.parent = parent self.group = group self.encoding = _ensure_encoding(encoding) self.errors = errors - self.set_version() @property - def is_old_version(self): + def is_old_version(self) -> bool: return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 - def set_version(self): + @property + def version(self) -> Tuple[int, int, int]: """ compute and set our version """ version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) try: - self.version = tuple(int(x) for x in version.split(".")) - if len(self.version) == 2: - self.version = self.version + (0,) + version = tuple(int(x) for x in version.split(".")) + if len(version) == 2: + version = version + (0,) except AttributeError: - self.version = (0, 0, 0) + version = (0, 0, 0) + return version @property def pandas_type(self): return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) - @property - def format_type(self): - return "fixed" - - def __repr__(self): + def __repr__(self) -> str: """ return a pretty representation of myself """ self.infer_axes() s = self.shape if s is not None: if isinstance(s, (list, tuple)): - s = "[{shape}]".format(shape=",".join(pprint_thing(x) for x in s)) - return "{type:12.12} (shape->{shape})".format( - type=self.pandas_type, shape=s - ) + jshape = ",".join(pprint_thing(x) for x in s) + s = f"[{jshape}]" + return f"{self.pandas_type:12.12} (shape->{s})" return self.pandas_type def set_object_info(self): """ set my pandas type & version """ self.attrs.pandas_type = str(self.pandas_kind) self.attrs.pandas_version = str(_version) - self.set_version() def copy(self): new_self = copy.copy(self) return new_self - @property - def storage_obj_type(self): - return self.obj_type - @property def shape(self): return self.nrows @@ -2561,17 +2554,13 @@ def _filters(self): return self.parent._filters @property - def _complevel(self): 
+ def _complevel(self) -> int: return self.parent._complevel @property - def _fletcher32(self): + def _fletcher32(self) -> bool: return self.parent._fletcher32 - @property - def _complib(self): - return self.parent._complib - @property def attrs(self): return self.group._v_attrs @@ -2590,7 +2579,7 @@ def storable(self): return self.group @property - def is_exists(self): + def is_exists(self) -> bool: return False @property @@ -2617,17 +2606,25 @@ def infer_axes(self): self.get_attrs() return True - def read(self, **kwargs): + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): raise NotImplementedError( "cannot read on an abstract storer: subclasses should implement" ) def write(self, **kwargs): raise NotImplementedError( - "cannot write on an abstract storer: sublcasses should implement" + "cannot write on an abstract storer: subclasses should implement" ) - def delete(self, where=None, start=None, stop=None, **kwargs): + def delete( + self, where=None, start: Optional[int] = None, stop: Optional[int] = None + ): """ support fully deleting the node in its entirety (only) - where specification must be None @@ -2644,10 +2641,10 @@ class GenericFixed(Fixed): _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} _reverse_index_map = {v: k for k, v in _index_type_map.items()} - attributes = [] # type: List[str] + attributes: List[str] = [] # indexer helpders - def _class_to_alias(self, cls): + def _class_to_alias(self, cls) -> str: return self._index_type_map.get(cls, "") def _alias_to_class(self, alias): @@ -2676,31 +2673,25 @@ def f(values, freq=None, tz=None): return klass - def validate_read(self, kwargs): + def validate_read(self, columns, where): """ - remove table keywords from kwargs and return raise if any keywords are passed which are not-None """ - kwargs = copy.copy(kwargs) - - columns = kwargs.pop("columns", None) if columns is not None: raise TypeError( "cannot pass a column specification when reading " "a Fixed format store. this store must be " "selected in its entirety" ) - where = kwargs.pop("where", None) if where is not None: raise TypeError( "cannot pass a where specification when reading " "from a Fixed format store. 
this store must be " "selected in its entirety" ) - return kwargs @property - def is_exists(self): + def is_exists(self) -> bool: return True def set_attrs(self): @@ -2718,7 +2709,9 @@ def get_attrs(self): def write(self, obj, **kwargs): self.set_attrs() - def read_array(self, key, start=None, stop=None): + def read_array( + self, key: str, start: Optional[int] = None, stop: Optional[int] = None + ): """ read an array for the specified node (off of group """ import tables @@ -2742,7 +2735,8 @@ def read_array(self, key, start=None, stop=None): if dtype == "datetime64": # reconstruct a timezone if indicated - ret = _set_tz(ret, getattr(attrs, "tz", None), coerce=True) + tz = getattr(attrs, "tz", None) + ret = _set_tz(ret, tz, coerce=True) elif dtype == "timedelta64": ret = np.asarray(ret, dtype="m8[ns]") @@ -2752,38 +2746,27 @@ def read_array(self, key, start=None, stop=None): else: return ret - def read_index(self, key, **kwargs): - variety = _ensure_decoded(getattr(self.attrs, "{key}_variety".format(key=key))) + def read_index( + self, key: str, start: Optional[int] = None, stop: Optional[int] = None + ) -> Index: + variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) if variety == "multi": - return self.read_multi_index(key, **kwargs) - elif variety == "block": - return self.read_block_index(key, **kwargs) - elif variety == "sparseint": - return self.read_sparse_intindex(key, **kwargs) + return self.read_multi_index(key, start=start, stop=stop) elif variety == "regular": - _, index = self.read_index_node(getattr(self.group, key), **kwargs) + node = getattr(self.group, key) + index = self.read_index_node(node, start=start, stop=stop) return index else: # pragma: no cover - raise TypeError( - "unrecognized index variety: {variety}".format(variety=variety) - ) + raise TypeError(f"unrecognized index variety: {variety}") - def write_index(self, key, index): + def write_index(self, key: str, index: Index): if isinstance(index, MultiIndex): - setattr(self.attrs, "{key}_variety".format(key=key), "multi") + setattr(self.attrs, f"{key}_variety", "multi") self.write_multi_index(key, index) - elif isinstance(index, BlockIndex): - setattr(self.attrs, "{key}_variety".format(key=key), "block") - self.write_block_index(key, index) - elif isinstance(index, IntIndex): - setattr(self.attrs, "{key}_variety".format(key=key), "sparseint") - self.write_sparse_intindex(key, index) else: - setattr(self.attrs, "{key}_variety".format(key=key), "regular") - converted = _convert_index( - index, self.encoding, self.errors, self.format_type - ).set_name("index") + setattr(self.attrs, f"{key}_variety", "regular") + converted = _convert_index("index", index, self.encoding, self.errors) self.write_array(key, converted.values) @@ -2794,90 +2777,68 @@ def write_index(self, key, index): if isinstance(index, (DatetimeIndex, PeriodIndex)): node._v_attrs.index_class = self._class_to_alias(type(index)) - if hasattr(index, "freq"): + if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): node._v_attrs.freq = index.freq - if hasattr(index, "tz") and index.tz is not None: + if isinstance(index, DatetimeIndex) and index.tz is not None: node._v_attrs.tz = _get_tz(index.tz) - def write_block_index(self, key, index): - self.write_array("{key}_blocs".format(key=key), index.blocs) - self.write_array("{key}_blengths".format(key=key), index.blengths) - setattr(self.attrs, "{key}_length".format(key=key), index.length) - - def read_block_index(self, key, **kwargs): - length = getattr(self.attrs, 
"{key}_length".format(key=key)) - blocs = self.read_array("{key}_blocs".format(key=key), **kwargs) - blengths = self.read_array("{key}_blengths".format(key=key), **kwargs) - return BlockIndex(length, blocs, blengths) - - def write_sparse_intindex(self, key, index): - self.write_array("{key}_indices".format(key=key), index.indices) - setattr(self.attrs, "{key}_length".format(key=key), index.length) - - def read_sparse_intindex(self, key, **kwargs): - length = getattr(self.attrs, "{key}_length".format(key=key)) - indices = self.read_array("{key}_indices".format(key=key), **kwargs) - return IntIndex(length, indices) - - def write_multi_index(self, key, index): - setattr(self.attrs, "{key}_nlevels".format(key=key), index.nlevels) + def write_multi_index(self, key: str, index: MultiIndex): + setattr(self.attrs, f"{key}_nlevels", index.nlevels) for i, (lev, level_codes, name) in enumerate( zip(index.levels, index.codes, index.names) ): # write the level - if is_extension_type(lev): + if is_extension_array_dtype(lev): raise NotImplementedError( "Saving a MultiIndex with an extension dtype is not supported." ) - level_key = "{key}_level{idx}".format(key=key, idx=i) - conv_level = _convert_index( - lev, self.encoding, self.errors, self.format_type - ).set_name(level_key) + level_key = f"{key}_level{i}" + conv_level = _convert_index(level_key, lev, self.encoding, self.errors) self.write_array(level_key, conv_level.values) node = getattr(self.group, level_key) node._v_attrs.kind = conv_level.kind node._v_attrs.name = name # write the name - setattr(node._v_attrs, "{key}_name{name}".format(key=key, name=name), name) + setattr(node._v_attrs, f"{key}_name{name}", name) # write the labels - label_key = "{key}_label{idx}".format(key=key, idx=i) + label_key = f"{key}_label{i}" self.write_array(label_key, level_codes) - def read_multi_index(self, key, **kwargs): - nlevels = getattr(self.attrs, "{key}_nlevels".format(key=key)) + def read_multi_index( + self, key: str, start: Optional[int] = None, stop: Optional[int] = None + ) -> MultiIndex: + nlevels = getattr(self.attrs, f"{key}_nlevels") levels = [] codes = [] - names = [] + names: List[Optional[Hashable]] = [] for i in range(nlevels): - level_key = "{key}_level{idx}".format(key=key, idx=i) - name, lev = self.read_index_node(getattr(self.group, level_key), **kwargs) + level_key = f"{key}_level{i}" + node = getattr(self.group, level_key) + lev = self.read_index_node(node, start=start, stop=stop) levels.append(lev) - names.append(name) + names.append(lev.name) - label_key = "{key}_label{idx}".format(key=key, idx=i) - level_codes = self.read_array(label_key, **kwargs) + label_key = f"{key}_label{i}" + level_codes = self.read_array(label_key, start=start, stop=stop) codes.append(level_codes) return MultiIndex( levels=levels, codes=codes, names=names, verify_integrity=True ) - def read_index_node(self, node, start=None, stop=None): + def read_index_node( + self, node: "Node", start: Optional[int] = None, stop: Optional[int] = None + ) -> Index: data = node[start:stop] # If the index was an empty array write_array_empty() will # have written a sentinel. Here we relace it with the original. 
- if "shape" in node._v_attrs and self._is_empty_array( - getattr(node._v_attrs, "shape") - ): - data = np.empty( - getattr(node._v_attrs, "shape"), - dtype=getattr(node._v_attrs, "value_type"), - ) + if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: + data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,) kind = _ensure_decoded(node._v_attrs.kind) name = None @@ -2902,45 +2863,47 @@ def read_index_node(self, node, start=None, stop=None): # created by python3 kwargs["tz"] = node._v_attrs["tz"] - if kind in ("date", "datetime"): + if kind == "date": index = factory( _unconvert_index( data, kind, encoding=self.encoding, errors=self.errors ), dtype=object, - **kwargs + **kwargs, ) else: index = factory( _unconvert_index( data, kind, encoding=self.encoding, errors=self.errors ), - **kwargs + **kwargs, ) index.name = name - return name, index + return index - def write_array_empty(self, key, value): + def write_array_empty(self, key: str, value: ArrayLike): """ write a 0-len array """ # ugly hack for length 0 axes arr = np.empty((1,) * value.ndim) self._handle.create_array(self.group, key, arr) - getattr(self.group, key)._v_attrs.value_type = str(value.dtype) - getattr(self.group, key)._v_attrs.shape = value.shape + node = getattr(self.group, key) + node._v_attrs.value_type = str(value.dtype) + node._v_attrs.shape = value.shape - def _is_empty_array(self, shape): - """Returns true if any axis is zero length.""" - return any(x == 0 for x in shape) + def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None): + # TODO: we only have one test that gets here, the only EA + # that gets passed is DatetimeArray, and we never have + # both self._filters and EA + assert isinstance(value, (np.ndarray, ABCExtensionArray)), type(value) - def write_array(self, key, value, items=None): if key in self.group: self._handle.remove_node(self.group, key) # Transform needed to interface with pytables row/col notation - empty_array = self._is_empty_array(value.shape) + empty_array = value.size == 0 transposed = False if is_categorical_dtype(value): @@ -2955,29 +2918,29 @@ def write_array(self, key, value, items=None): value = value.T transposed = True + atom = None if self._filters is not None: - atom = None try: # get the atom for this datatype atom = _tables().Atom.from_dtype(value.dtype) except ValueError: pass - if atom is not None: - # create an empty chunked array and fill it from value - if not empty_array: - ca = self._handle.create_carray( - self.group, key, atom, value.shape, filters=self._filters - ) - ca[:] = value - getattr(self.group, key)._v_attrs.transposed = transposed + if atom is not None: + # We only get here if self._filters is non-None and + # the Atom.from_dtype call succeeded - else: - self.write_array_empty(key, value) + # create an empty chunked array and fill it from value + if not empty_array: + ca = self._handle.create_carray( + self.group, key, atom, value.shape, filters=self._filters + ) + ca[:] = value - return + else: + self.write_array_empty(key, value) - if value.dtype.type == np.object_: + elif value.dtype.type == np.object_: # infer the type, warn if we have a non-string type here (for # performance) @@ -2987,81 +2950,57 @@ def write_array(self, key, value, items=None): elif inferred_type == "string": pass else: - try: - items = list(items) - except TypeError: - pass ws = performance_doc % (inferred_type, key, items) warnings.warn(ws, PerformanceWarning, stacklevel=7) vlarr = self._handle.create_vlarray(self.group, key, 
_tables().ObjectAtom()) vlarr.append(value) - else: - if empty_array: - self.write_array_empty(key, value) - else: - if is_datetime64_dtype(value.dtype): - self._handle.create_array(self.group, key, value.view("i8")) - getattr(self.group, key)._v_attrs.value_type = "datetime64" - elif is_datetime64tz_dtype(value.dtype): - # store as UTC - # with a zone - self._handle.create_array(self.group, key, value.asi8) - - node = getattr(self.group, key) - node._v_attrs.tz = _get_tz(value.tz) - node._v_attrs.value_type = "datetime64" - elif is_timedelta64_dtype(value.dtype): - self._handle.create_array(self.group, key, value.view("i8")) - getattr(self.group, key)._v_attrs.value_type = "timedelta64" - else: - self._handle.create_array(self.group, key, value) - - getattr(self.group, key)._v_attrs.transposed = transposed + elif empty_array: + self.write_array_empty(key, value) + elif is_datetime64_dtype(value.dtype): + self._handle.create_array(self.group, key, value.view("i8")) + getattr(self.group, key)._v_attrs.value_type = "datetime64" + elif is_datetime64tz_dtype(value.dtype): + # store as UTC + # with a zone + self._handle.create_array(self.group, key, value.asi8) -class LegacyFixed(GenericFixed): - def read_index_legacy(self, key, start=None, stop=None): - node = getattr(self.group, key) - data = node[start:stop] - kind = node._v_attrs.kind - return _unconvert_index_legacy( - data, kind, encoding=self.encoding, errors=self.errors - ) - - -class LegacySeriesFixed(LegacyFixed): - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - index = self.read_index_legacy("index") - values = self.read_array("values") - return Series(values, index=index) - + node = getattr(self.group, key) + node._v_attrs.tz = _get_tz(value.tz) + node._v_attrs.value_type = "datetime64" + elif is_timedelta64_dtype(value.dtype): + self._handle.create_array(self.group, key, value.view("i8")) + getattr(self.group, key)._v_attrs.value_type = "timedelta64" + else: + self._handle.create_array(self.group, key, value) -class LegacyFrameFixed(LegacyFixed): - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - index = self.read_index_legacy("index") - columns = self.read_index_legacy("columns") - values = self.read_array("values") - return DataFrame(values, index=index, columns=columns) + getattr(self.group, key)._v_attrs.transposed = transposed class SeriesFixed(GenericFixed): pandas_kind = "series" attributes = ["name"] + name: Optional[Hashable] + @property def shape(self): try: - return (len(getattr(self.group, "values")),) + return (len(self.group.values),) except (TypeError, AttributeError): return None - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - index = self.read_index("index", **kwargs) - values = self.read_array("values", **kwargs) + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): + self.validate_read(columns, where) + index = self.read_index("index", start=start, stop=stop) + values = self.read_array("values", start=start, stop=stop) return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): @@ -3073,7 +3012,8 @@ def write(self, obj, **kwargs): class BlockManagerFixed(GenericFixed): attributes = ["ndim", "nblocks"] - is_shape_reversed = False + + nblocks: int @property def shape(self): @@ -3083,13 +3023,13 @@ def shape(self): # items items = 0 for i in range(self.nblocks): - node = getattr(self.group, "block{idx}_items".format(idx=i)) + node = getattr(self.group, f"block{i}_items") shape 
= getattr(node, "shape", None) if shape is not None: items += shape[0] # data shape - node = getattr(self.group, "block0_values") + node = self.group.block0_values shape = getattr(node, "shape", None) if shape is not None: shape = list(shape[0 : (ndim - 1)]) @@ -3098,41 +3038,46 @@ def shape(self): shape.append(items) - # hacky - this works for frames, but is reversed for panels - if self.is_shape_reversed: - shape = shape[::-1] - return shape except AttributeError: return None - def read(self, start=None, stop=None, **kwargs): + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): # start, stop applied to rows, so 0th axis only - - kwargs = self.validate_read(kwargs) + self.validate_read(columns, where) select_axis = self.obj_type()._get_block_manager_axis(0) axes = [] for i in range(self.ndim): _start, _stop = (start, stop) if i == select_axis else (None, None) - ax = self.read_index("axis{idx}".format(idx=i), start=_start, stop=_stop) + ax = self.read_index(f"axis{i}", start=_start, stop=_stop) axes.append(ax) items = axes[0] - blocks = [] + dfs = [] + for i in range(self.nblocks): - blk_items = self.read_index("block{idx}_items".format(idx=i)) - values = self.read_array( - "block{idx}_values".format(idx=i), start=_start, stop=_stop - ) - blk = make_block( - values, placement=items.get_indexer(blk_items), ndim=len(axes) - ) - blocks.append(blk) + blk_items = self.read_index(f"block{i}_items") + values = self.read_array(f"block{i}_values", start=_start, stop=_stop) + + columns = items[items.get_indexer(blk_items)] + df = DataFrame(values.T, columns=columns, index=axes[1]) + dfs.append(df) + + if len(dfs) > 0: + out = concat(dfs, axis=1) + out = out.reindex(columns=items, copy=False) + return out - return self.obj_type(BlockManager(blocks, axes)) + return DataFrame(columns=axes[0], index=axes[1]) def write(self, obj, **kwargs): super().write(obj, **kwargs) @@ -3145,17 +3090,15 @@ def write(self, obj, **kwargs): if i == 0: if not ax.is_unique: raise ValueError("Columns index has to be unique for fixed format") - self.write_index("axis{idx}".format(idx=i), ax) + self.write_index(f"axis{i}", ax) # Supporting mixed-type DataFrame objects...nontrivial self.attrs.nblocks = len(data.blocks) for i, blk in enumerate(data.blocks): # I have no idea why, but writing values before items fixed #2299 blk_items = data.items.take(blk.mgr_locs) - self.write_array( - "block{idx}_values".format(idx=i), blk.values, items=blk_items - ) - self.write_index("block{idx}_items".format(idx=i), blk_items) + self.write_array(f"block{i}_values", blk.values, items=blk_items) + self.write_index(f"block{i}_items", blk_items) class FrameFixed(BlockManagerFixed): @@ -3189,55 +3132,61 @@ class Table(Fixed): """ pandas_kind = "wide_table" - table_type = None # type: str + table_type: str levels = 1 is_table = True - is_shape_reversed = False - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.index_axes = [] - self.non_index_axes = [] - self.values_axes = [] - self.data_columns = [] - self.metadata = [] - self.info = dict() - self.nan_rep = None - self.selection = None + index_axes: List[IndexCol] + non_index_axes: List[Tuple[int, Any]] + values_axes: List[DataCol] + data_columns: List + metadata: List + info: Dict - @property - def table_type_short(self): - return self.table_type.split("_")[0] + def __init__( + self, + parent: HDFStore, + group: "Node", + encoding=None, + errors: str = "strict", + index_axes=None, + 
non_index_axes=None, + values_axes=None, + data_columns=None, + info=None, + nan_rep=None, + ): + super().__init__(parent, group, encoding=encoding, errors=errors) + self.index_axes = index_axes or [] + self.non_index_axes = non_index_axes or [] + self.values_axes = values_axes or [] + self.data_columns = data_columns or [] + self.info = info or dict() + self.nan_rep = nan_rep @property - def format_type(self): - return "table" + def table_type_short(self) -> str: + return self.table_type.split("_")[0] - def __repr__(self): + def __repr__(self) -> str: """ return a pretty representation of myself """ self.infer_axes() - dc = ",dc->[{columns}]".format( - columns=(",".join(self.data_columns) if len(self.data_columns) else "") - ) + jdc = ",".join(self.data_columns) if len(self.data_columns) else "" + dc = f",dc->[{jdc}]" ver = "" if self.is_old_version: - ver = "[{version}]".format(version=".".join(str(x) for x in self.version)) + jver = ".".join(str(x) for x in self.version) + ver = f"[{jver}]" + jindex_axes = ",".join(a.name for a in self.index_axes) return ( - "{pandas_type:12.12}{ver} (typ->{table_type},nrows->{nrows}," - "ncols->{ncols},indexers->[{index_axes}]{dc})".format( - pandas_type=self.pandas_type, - ver=ver, - table_type=self.table_type_short, - nrows=self.nrows, - ncols=self.ncols, - index_axes=(",".join(a.name for a in self.index_axes)), - dc=dc, - ) + f"{self.pandas_type:12.12}{ver} " + f"(typ->{self.table_type_short},nrows->{self.nrows}," + f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})" ) - def __getitem__(self, c): + def __getitem__(self, c: str): """ return the axis for c """ for a in self.axes: if c == a.name: @@ -3252,9 +3201,7 @@ def validate(self, other): if other.table_type != self.table_type: raise TypeError( "incompatible table_type with existing " - "[{other} - {self}]".format( - other=other.table_type, self=self.table_type - ) + f"[{other.table_type} - {self.table_type}]" ) for c in ["index_axes", "non_index_axes", "values_axes"]: @@ -3267,34 +3214,27 @@ def validate(self, other): oax = ov[i] if sax != oax: raise ValueError( - "invalid combinate of [{c}] on appending data " - "[{sax}] vs current table [{oax}]".format( - c=c, sax=sax, oax=oax - ) + f"invalid combination of [{c}] on appending data " + f"[{sax}] vs current table [{oax}]" ) # should never get here raise Exception( - "invalid combinate of [{c}] on appending data [{sv}] vs " - "current table [{ov}]".format(c=c, sv=sv, ov=ov) + f"invalid combination of [{c}] on appending data [{sv}] vs " + f"current table [{ov}]" ) @property - def is_multi_index(self): + def is_multi_index(self) -> bool: """the levels attribute is 1 or a list in the case of a multi-index""" return isinstance(self.levels, list) - def validate_metadata(self, existing): - """ create / validate metadata """ - self.metadata = [c.name for c in self.values_axes if c.metadata is not None] - def validate_multiindex(self, obj): """validate that we can store the multi-index; reset and return the new object """ levels = [ - l if l is not None else "level_{0}".format(i) - for i, l in enumerate(obj.index.names) + l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names) ] try: return obj.reset_index(), levels @@ -3304,12 +3244,12 @@ def validate_multiindex(self, obj): ) @property - def nrows_expected(self): + def nrows_expected(self) -> int: """ based on our axes, compute the expected nrows """ return np.prod([i.cvalues.shape[0] for i in self.index_axes]) @property - def is_exists(self): + def is_exists(self) -> bool: """ has 
this table been created """ return "table" in self.group @@ -3335,12 +3275,12 @@ def axes(self): return itertools.chain(self.index_axes, self.values_axes) @property - def ncols(self): + def ncols(self) -> int: """ the number of total columns in the values axes """ return sum(len(a.values) for a in self.values_axes) @property - def is_transposed(self): + def is_transposed(self) -> bool: return False @property @@ -3353,44 +3293,45 @@ def data_orientation(self): ) ) - def queryables(self): + def queryables(self) -> Dict[str, Any]: """ return a dict of the kinds allowable columns for this object """ + # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here + axis_names = {0: "index", 1: "columns"} + # compute the values_axes queryables - return dict( - [(a.cname, a) for a in self.index_axes] - + [ - (self.storage_obj_type._AXIS_NAMES[axis], None) - for axis, values in self.non_index_axes - ] - + [ - (v.cname, v) - for v in self.values_axes - if v.name in set(self.data_columns) - ] - ) + d1 = [(a.cname, a) for a in self.index_axes] + d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes] + d3 = [ + (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns) + ] + + return dict(d1 + d2 + d3) # type: ignore + # error: List comprehension has incompatible type + # List[Tuple[Any, None]]; expected List[Tuple[str, IndexCol]] def index_cols(self): """ return a list of my index cols """ + # Note: each `i.cname` below is assured to be a str. return [(i.axis, i.cname) for i in self.index_axes] - def values_cols(self): + def values_cols(self) -> List[str]: """ return a list of my values cols """ return [i.cname for i in self.values_axes] - def _get_metadata_path(self, key): + def _get_metadata_path(self, key: str) -> str: """ return the metadata pathname for this key """ - return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, key=key) + group = self.group._v_pathname + return f"{group}/meta/{key}/meta" - def write_metadata(self, key, values): + def write_metadata(self, key: str, values: np.ndarray): """ - write out a meta data array to the key as a fixed-format Series + Write out a metadata array to the key as a fixed-format Series. 
Parameters ---------- - key : string + key : str values : ndarray - """ values = Series(values) self.parent.put( @@ -3402,16 +3343,12 @@ def write_metadata(self, key, values): nan_rep=self.nan_rep, ) - def read_metadata(self, key): + def read_metadata(self, key: str): """ return the meta data array for this key """ if getattr(getattr(self.group, "meta", None), key, None) is not None: return self.parent.select(self._get_metadata_path(key)) return None - def set_info(self): - """ update our table index info """ - self.attrs.info = self.info - def set_attrs(self): """ set our table type & indexables """ self.attrs.table_type = str(self.table_type) @@ -3423,8 +3360,7 @@ def set_attrs(self): self.attrs.encoding = self.encoding self.attrs.errors = self.errors self.attrs.levels = self.levels - self.attrs.metadata = self.metadata - self.set_info() + self.attrs.info = self.info def get_attrs(self): """ retrieve our attributes """ @@ -3435,11 +3371,8 @@ def get_attrs(self): self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) self.levels = getattr(self.attrs, "levels", None) or [] - self.index_axes = [a.infer(self) for a in self.indexables if a.is_an_indexable] - self.values_axes = [ - a.infer(self) for a in self.indexables if not a.is_an_indexable - ] - self.metadata = getattr(self.attrs, "metadata", None) or [] + self.index_axes = [a for a in self.indexables if a.is_an_indexable] + self.values_axes = [a for a in self.indexables if not a.is_an_indexable] def validate_version(self, where=None): """ are we trying to operate on an old version? """ @@ -3465,60 +3398,111 @@ def validate_min_itemsize(self, min_itemsize): continue if k not in q: raise ValueError( - "min_itemsize has the key [{key}] which is not an axis or " - "data_column".format(key=k) + f"min_itemsize has the key [{k}] which is not an axis or " + "data_column" ) - @property + @cache_readonly def indexables(self): """ create/cache the indexables if they don't exist """ - if self._indexables is None: - - self._indexables = [] - - # index columns - self._indexables.extend( - [ - IndexCol(name=name, axis=axis, pos=i) - for i, (axis, name) in enumerate(self.attrs.index_cols) - ] + _indexables = [] + + desc = self.description + table_attrs = self.table.attrs + + # Note: each of the `name` kwargs below are str, ensured + # by the definition in index_cols. + # index columns + for i, (axis, name) in enumerate(self.attrs.index_cols): + atom = getattr(desc, name) + md = self.read_metadata(name) + meta = "category" if md is not None else None + + kind_attr = f"{name}_kind" + kind = getattr(table_attrs, kind_attr, None) + + index_col = IndexCol( + name=name, + axis=axis, + pos=i, + kind=kind, + typ=atom, + table=self.table, + meta=meta, + metadata=md, ) + _indexables.append(index_col) - # values columns - dc = set(self.data_columns) - base_pos = len(self._indexables) + # values columns + dc = set(self.data_columns) + base_pos = len(_indexables) - def f(i, c): - klass = DataCol - if c in dc: - klass = DataIndexableCol - return klass.create_for_block( - i=i, name=c, pos=base_pos + i, version=self.version - ) + def f(i, c): + assert isinstance(c, str) + klass = DataCol + if c in dc: + klass = DataIndexableCol - self._indexables.extend( - [f(i, c) for i, c in enumerate(self.attrs.values_cols)] + atom = getattr(desc, c) + adj_name = _maybe_adjust_name(c, self.version) + + # TODO: why kind_attr here? 
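+ # Note: the "{adj_name}_kind" node attribute holds the list of column names
+ # that was stored for this block (read back below as `values`), and
+ # "{adj_name}_dtype" holds the stored dtype name from which `kind` is derived.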
+ values = getattr(table_attrs, f"{adj_name}_kind", None) + dtype = getattr(table_attrs, f"{adj_name}_dtype", None) + kind = _dtype_to_kind(dtype) + + md = self.read_metadata(c) + # TODO: figure out why these two versions of `meta` dont always match. + # meta = "category" if md is not None else None + meta = getattr(table_attrs, f"{adj_name}_meta", None) + + obj = klass( + name=adj_name, + cname=c, + values=values, + kind=kind, + pos=base_pos + i, + typ=atom, + table=self.table, + meta=meta, + metadata=md, + dtype=dtype, ) + return obj - return self._indexables + # Note: the definition of `values_cols` ensures that each + # `c` below is a str. + _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)]) - def create_index(self, columns=None, optlevel=None, kind=None): + return _indexables + + def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): """ - Create a pytables index on the specified columns - note: cannot index Time64Col() or ComplexCol currently; - PyTables must be >= 3.0 + Create a pytables index on the specified columns. Parameters ---------- - columns : False (don't create an index), True (create all columns - index), None or list_like (the indexers to index) - optlevel: optimization level (defaults to 6) - kind : kind of index (defaults to 'medium') + columns : None, bool, or listlike[str] + Indicate which columns to create an index on. + + * False : Do not create any indexes. + * True : Create indexes on all columns. + * None : Create indexes on all columns. + * listlike : Create indexes on the given columns. + + optlevel : int or None, default None + Optimization level, if None, pytables defaults to 6. + kind : str or None, default None + Kind of index, if None, pytables defaults to "medium". Raises ------ - raises if the node is not a table + TypeError if trying to create an index on a complex-type column. + Notes + ----- + Cannot index Time64Col or ComplexCol. + Pytables must be >= 3.0. """ if not self.infer_axes(): @@ -3563,65 +3547,68 @@ def create_index(self, columns=None, optlevel=None, kind=None): if not v.is_indexed: if v.type.startswith("complex"): raise TypeError( - "Columns containing complex values can be stored " - "but cannot" - " be indexed when using table format. Either use " + "Columns containing complex values can be stored but " + "cannot be indexed when using table format. Either use " "fixed format, set index=False, or do not include " "the columns containing complex values to " "data_columns when initializing the table." ) v.create_index(**kw) - def read_axes(self, where, **kwargs): - """create and return the axes sniffed from the table: return boolean - for success + def _read_axes( + self, where, start: Optional[int] = None, stop: Optional[int] = None + ) -> List[Tuple[ArrayLike, ArrayLike]]: """ + Create the axes sniffed from the table. - # validate the version - self.validate_version(where) + Parameters + ---------- + where : ??? + start : int or None, default None + stop : int or None, default None - # infer the data kind - if not self.infer_axes(): - return False + Returns + ------- + List[Tuple[index_values, column_values]] + """ # create the selection - self.selection = Selection(self, where=where, **kwargs) - values = self.selection.select() + selection = Selection(self, where=where, start=start, stop=stop) + values = selection.select() + results = [] # convert the data for a in self.axes: a.set_info(self.info) - # `kwargs` may contain `start` and `stop` arguments if passed to - # `store.select()`. 
If set they determine the index size. - a.convert( + res = a.convert( values, nan_rep=self.nan_rep, encoding=self.encoding, errors=self.errors, - start=kwargs.get("start"), - stop=kwargs.get("stop"), ) + results.append(res) - return True + return results - def get_object(self, obj): + @classmethod + def get_object(cls, obj, transposed: bool): """ return the data for this obj """ return obj - def validate_data_columns(self, data_columns, min_itemsize): + def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): """take the input data_columns and min_itemize and create a data columns spec """ - if not len(self.non_index_axes): + if not len(non_index_axes): return [] - axis, axis_labels = self.non_index_axes[0] + axis, axis_labels = non_index_axes[0] info = self.info.get(axis, dict()) if info.get("type") == "MultiIndex" and data_columns: raise ValueError( - "cannot use a multi-index on axis [{0}] with " - "data_columns {1}".format(axis, data_columns) + f"cannot use a multi-index on axis [{axis}] with " + f"data_columns {data_columns}" ) # evaluate the passed data_columns, True == use all columns @@ -3635,6 +3622,7 @@ def validate_data_columns(self, data_columns, min_itemsize): if isinstance(min_itemsize, dict): existing_data_columns = set(data_columns) + data_columns = list(data_columns) # ensure we do not modify data_columns.extend( [ k @@ -3646,61 +3634,66 @@ def validate_data_columns(self, data_columns, min_itemsize): # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes( + def _create_axes( self, axes, - obj, - validate=True, + obj: DataFrame, + validate: bool = True, nan_rep=None, data_columns=None, min_itemsize=None, - **kwargs ): - """ create and return the axes - legacy tables create an indexable column, indexable index, - non-indexable fields - - Parameters - ---------- - axes: a list of the axes in order to create (names or numbers of - the axes) - obj : the object to create axes on - validate: validate the obj against an existing object already - written - min_itemsize: a dict of the min size for a column in bytes - nan_rep : a values to use for string column nan_rep - encoding : the encoding for string values - data_columns : a list of columns that we want to create separate to - allow indexing (or True will force all columns) + """ + Create and return the axes. + Parameters + ---------- + axes: list or None + The names or numbers of the axes to create. + obj : DataFrame + The object to create axes on. + validate: bool, default True + Whether to validate the obj against an existing object already written. + nan_rep : + A value to use for string column nan_rep. + data_columns : List[str], True, or None, default None + Specify the columns that we want to create to allow indexing on. + + * True : Use all available columns. + * None : Use no columns. + * List[str] : Use the specified columns. + + min_itemsize: Dict[str, int] or None, default None + The min itemsize for a column in bytes. 
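+
+ Returns
+ -------
+ Table
+     A new Table object of the same subclass, holding the newly created
+     index, non-index and values axes.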
""" + if not isinstance(obj, DataFrame): + group = self.group._v_name + raise TypeError( + f"cannot properly create the storer for: [group->{group}," + f"value->{type(obj)}]" + ) + # set the default axes if needed if axes is None: - try: - axes = _AXES_MAP[type(obj)] - except KeyError: - raise TypeError( - "cannot properly create the storer for: [group->{group}," - "value->{value}]".format(group=self.group._v_name, value=type(obj)) - ) + axes = [0] # map axes to numbers axes = [obj._get_axis_number(a) for a in axes] # do we have an existing table (if so, use its axes & data_columns) if self.infer_axes(): - existing_table = self.copy() - existing_table.infer_axes() - axes = [a.axis for a in existing_table.index_axes] - data_columns = existing_table.data_columns - nan_rep = existing_table.nan_rep - self.encoding = existing_table.encoding - self.errors = existing_table.errors - self.info = copy.copy(existing_table.info) + table_exists = True + axes = [a.axis for a in self.index_axes] + data_columns = list(self.data_columns) + nan_rep = self.nan_rep + # TODO: do we always have validate=True here? else: - existing_table = None + table_exists = False + + new_info = self.info + assert self.ndim == 2 # with next check, we must have len(axes) == 1 # currently support on ndim-1 axes if len(axes) != self.ndim - 1: raise ValueError( @@ -3708,114 +3701,76 @@ def create_axes( ) # create according to the new data - self.non_index_axes = [] - self.data_columns = [] + new_non_index_axes: List = [] # nan_representation if nan_rep is None: nan_rep = "nan" - self.nan_rep = nan_rep + # We construct the non-index-axis first, since that alters new_info + idx = [x for x in [0, 1] if x not in axes][0] - # create axes to index and non_index - index_axes_map = dict() - for i, a in enumerate(obj.axes): + a = obj.axes[idx] + # we might be able to change the axes on the appending data if necessary + append_axis = list(a) + if table_exists: + indexer = len(new_non_index_axes) # i.e. 0 + exist_axis = self.non_index_axes[indexer][1] + if not array_equivalent(np.array(append_axis), np.array(exist_axis)): - if i in axes: - name = obj._AXIS_NAMES[i] - index_axes_map[i] = ( - _convert_index(a, self.encoding, self.errors, self.format_type) - .set_name(name) - .set_axis(i) - ) - else: + # ahah! -> reindex + if array_equivalent( + np.array(sorted(append_axis)), np.array(sorted(exist_axis)) + ): + append_axis = exist_axis - # we might be able to change the axes on the appending data if - # necessary - append_axis = list(a) - if existing_table is not None: - indexer = len(self.non_index_axes) - exist_axis = existing_table.non_index_axes[indexer][1] - if not array_equivalent( - np.array(append_axis), np.array(exist_axis) - ): - - # ahah! 
-> reindex - if array_equivalent( - np.array(sorted(append_axis)), np.array(sorted(exist_axis)) - ): - append_axis = exist_axis + # the non_index_axes info + info = new_info.setdefault(idx, {}) + info["names"] = list(a.names) + info["type"] = type(a).__name__ - # the non_index_axes info - info = _get_info(self.info, i) - info["names"] = list(a.names) - info["type"] = a.__class__.__name__ + new_non_index_axes.append((idx, append_axis)) - self.non_index_axes.append((i, append_axis)) + # Now we can construct our new index axis + idx = axes[0] + a = obj.axes[idx] + axis_name = obj._AXIS_NAMES[idx] + new_index = _convert_index(axis_name, a, self.encoding, self.errors) + new_index.axis = idx - # set axis positions (based on the axes) - self.index_axes = [ - index_axes_map[a].set_pos(j).update_info(self.info) - for j, a in enumerate(axes) - ] - j = len(self.index_axes) + # Because we are always 2D, there is only one new_index, so + # we know it will have pos=0 + new_index.set_pos(0) + new_index.update_info(new_info) + new_index.maybe_set_size(min_itemsize) # check for column conflicts - # check for column conflicts - for a in self.axes: - a.maybe_set_size(min_itemsize=min_itemsize) + new_index_axes = [new_index] + j = len(new_index_axes) # i.e. 1 + assert j == 1 # reindex by our non_index_axes & compute data_columns - for a in self.non_index_axes: + assert len(new_non_index_axes) == 1 + for a in new_non_index_axes: obj = _reindex_axis(obj, a[0], a[1]) def get_blk_items(mgr, blocks): return [mgr.items.take(blk.mgr_locs) for blk in blocks] + transposed = new_index.axis == 1 + # figure out data_columns and get out blocks - block_obj = self.get_object(obj)._consolidate() - blocks = block_obj._data.blocks - blk_items = get_blk_items(block_obj._data, blocks) - if len(self.non_index_axes): - axis, axis_labels = self.non_index_axes[0] - data_columns = self.validate_data_columns(data_columns, min_itemsize) - if len(data_columns): - mgr = block_obj.reindex( - Index(axis_labels).difference(Index(data_columns)), axis=axis - )._data - - blocks = list(mgr.blocks) - blk_items = get_blk_items(mgr, blocks) - for c in data_columns: - mgr = block_obj.reindex([c], axis=axis)._data - blocks.extend(mgr.blocks) - blk_items.extend(get_blk_items(mgr, mgr.blocks)) - - # reorder the blocks in the same order as the existing_table if we can - if existing_table is not None: - by_items = { - tuple(b_items.tolist()): (b, b_items) - for b, b_items in zip(blocks, blk_items) - } - new_blocks = [] - new_blk_items = [] - for ea in existing_table.values_axes: - items = tuple(ea.values) - try: - b, b_items = by_items.pop(items) - new_blocks.append(b) - new_blk_items.append(b_items) - except (IndexError, KeyError): - raise ValueError( - "cannot match existing table structure for [{items}] " - "on appending data".format( - items=(",".join(pprint_thing(item) for item in items)) - ) - ) - blocks = new_blocks - blk_items = new_blk_items + data_columns = self.validate_data_columns( + data_columns, min_itemsize, new_non_index_axes + ) + + block_obj = self.get_object(obj, transposed)._consolidate() + + blocks, blk_items = self._get_blocks_and_items( + block_obj, table_exists, new_non_index_axes, self.values_axes, data_columns + ) # add my values - self.values_axes = [] + vaxes = [] for i, (b, b_items) in enumerate(zip(blocks, blk_items)): # shape of the data column are the indexable axes @@ -3826,60 +3781,144 @@ def get_blk_items(mgr, blocks): if data_columns and len(b_items) == 1 and b_items[0] in data_columns: klass = DataIndexableCol name 
= b_items[0] - self.data_columns.append(name) + if not (name is None or isinstance(name, str)): + # TODO: should the message here be more specifically non-str? + raise ValueError("cannot have non-object label DataIndexableCol") # make sure that we match up the existing columns # if we have an existing table - if existing_table is not None and validate: + existing_col: Optional[DataCol] + + if table_exists and validate: try: - existing_col = existing_table.values_axes[i] + existing_col = self.values_axes[i] except (IndexError, KeyError): raise ValueError( - "Incompatible appended table [{blocks}]" - "with existing table [{table}]".format( - blocks=blocks, table=existing_table.values_axes - ) + f"Incompatible appended table [{blocks}]" + f"with existing table [{self.values_axes}]" ) else: existing_col = None - try: - col = klass.create_for_block(i=i, name=name, version=self.version) - col.set_atom( - block=b, - block_items=b_items, - existing_col=existing_col, - min_itemsize=min_itemsize, - nan_rep=nan_rep, - encoding=self.encoding, - errors=self.errors, - info=self.info, - ) - col.set_pos(j) + new_name = name or f"values_block_{i}" + data_converted = _maybe_convert_for_string_atom( + new_name, + b, + existing_col=existing_col, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + encoding=self.encoding, + errors=self.errors, + ) + adj_name = _maybe_adjust_name(new_name, self.version) + + typ = klass._get_atom(data_converted) + kind = _dtype_to_kind(data_converted.dtype.name) + tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None + + meta = metadata = ordered = None + if is_categorical_dtype(data_converted): + ordered = data_converted.ordered + meta = "category" + metadata = np.array(data_converted.categories, copy=False).ravel() + + data, dtype_name = _get_data_and_dtype_name(data_converted) + + col = klass( + name=adj_name, + cname=new_name, + values=list(b_items), + typ=typ, + pos=j, + kind=kind, + tz=tz, + ordered=ordered, + meta=meta, + metadata=metadata, + dtype=dtype_name, + data=data, + ) + col.update_info(new_info) + + vaxes.append(col) - self.values_axes.append(col) - except (NotImplementedError, ValueError, TypeError) as e: - raise e - except Exception as detail: - raise Exception( - "cannot find the correct atom type -> " - "[dtype->{name},items->{items}] {detail!s}".format( - name=b.dtype.name, items=b_items, detail=detail - ) - ) j += 1 - # validate our min_itemsize - self.validate_min_itemsize(min_itemsize) + dcs = [col.name for col in vaxes if col.is_data_indexable] + + new_table = type(self)( + parent=self.parent, + group=self.group, + encoding=self.encoding, + errors=self.errors, + index_axes=new_index_axes, + non_index_axes=new_non_index_axes, + values_axes=vaxes, + data_columns=dcs, + info=new_info, + nan_rep=nan_rep, + ) + if hasattr(self, "levels"): + # TODO: get this into constructor, only for appropriate subclass + new_table.levels = self.levels + + new_table.validate_min_itemsize(min_itemsize) + + if validate and table_exists: + new_table.validate(self) + + return new_table + + @staticmethod + def _get_blocks_and_items( + block_obj, table_exists, new_non_index_axes, values_axes, data_columns + ): + # Helper to clarify non-state-altering parts of _create_axes + + def get_blk_items(mgr, blocks): + return [mgr.items.take(blk.mgr_locs) for blk in blocks] + + blocks = block_obj._data.blocks + blk_items = get_blk_items(block_obj._data, blocks) + + if len(data_columns): + axis, axis_labels = new_non_index_axes[0] + new_labels = 
Index(axis_labels).difference(Index(data_columns)) + mgr = block_obj.reindex(new_labels, axis=axis)._data - # validate our metadata - self.validate_metadata(existing_table) + blocks = list(mgr.blocks) + blk_items = get_blk_items(mgr, blocks) + for c in data_columns: + mgr = block_obj.reindex([c], axis=axis)._data + blocks.extend(mgr.blocks) + blk_items.extend(get_blk_items(mgr, mgr.blocks)) + + # reorder the blocks in the same order as the existing table if we can + if table_exists: + by_items = { + tuple(b_items.tolist()): (b, b_items) + for b, b_items in zip(blocks, blk_items) + } + new_blocks = [] + new_blk_items = [] + for ea in values_axes: + items = tuple(ea.values) + try: + b, b_items = by_items.pop(items) + new_blocks.append(b) + new_blk_items.append(b_items) + except (IndexError, KeyError): + jitems = ",".join(pprint_thing(item) for item in items) + raise ValueError( + f"cannot match existing table structure for [{jitems}] " + "on appending data" + ) + blocks = new_blocks + blk_items = new_blk_items - # validate the axes if we have an existing table - if validate: - self.validate(existing_table) + return blocks, blk_items - def process_axes(self, obj, columns=None): + def process_axes(self, obj, selection: "Selection", columns=None): """ process axes filters """ # make a copy to avoid side effects @@ -3888,6 +3927,7 @@ def process_axes(self, obj, columns=None): # make sure to include levels if we have them if columns is not None and self.is_multi_index: + assert isinstance(self.levels, list) # assured by is_multi_index for n in self.levels: if n not in columns: columns.insert(0, n) @@ -3897,8 +3937,8 @@ def process_axes(self, obj, columns=None): obj = _reindex_axis(obj, axis, labels, columns) # apply the selection filters (but keep in the same order) - if self.selection.filter is not None: - for field, op, filt in self.selection.filter.format(): + if selection.filter is not None: + for field, op, filt in selection.filter.format(): def process_filter(field, filt): @@ -3931,18 +3971,19 @@ def process_filter(field, filt): takers = op(values, filt) return obj.loc(axis=axis_number)[takers] - raise ValueError( - "cannot find the field [{field}] for " - "filtering!".format(field=field) - ) + raise ValueError(f"cannot find the field [{field}] for filtering!") obj = process_filter(field, filt) return obj def create_description( - self, complib=None, complevel=None, fletcher32=False, expectedrows=None - ): + self, + complib, + complevel: Optional[int], + fletcher32: bool, + expectedrows: Optional[int], + ) -> Dict[str, Any]: """ create the description of the table from the axes & values """ # provided expected rows if its passed @@ -3968,7 +4009,9 @@ def create_description( return d - def read_coordinates(self, where=None, start=None, stop=None, **kwargs): + def read_coordinates( + self, where=None, start: Optional[int] = None, stop: Optional[int] = None, + ): """select coordinates (row numbers) from a table; return the coordinates object """ @@ -3981,10 +4024,10 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return False # create the selection - self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) - coords = self.selection.select_coords() - if self.selection.filter is not None: - for field, op, filt in self.selection.filter.format(): + selection = Selection(self, where=where, start=start, stop=stop) + coords = selection.select_coords() + if selection.filter is not None: + for field, op, filt in selection.filter.format(): data = 
self.read_column( field, start=coords.min(), stop=coords.max() + 1 ) @@ -3992,7 +4035,13 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return Index(coords) - def read_column(self, column, where=None, start=None, stop=None): + def read_column( + self, + column: str, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): """return a single column from the table, generally only indexables are interesting """ @@ -4013,28 +4062,22 @@ def read_column(self, column, where=None, start=None, stop=None): if not a.is_data_indexable: raise ValueError( - "column [{column}] can not be extracted individually; " - "it is not data indexable".format(column=column) + f"column [{column}] can not be extracted individually; " + "it is not data indexable" ) # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - return Series( - _set_tz( - a.convert( - c[start:stop], - nan_rep=self.nan_rep, - encoding=self.encoding, - errors=self.errors, - ).take_data(), - a.tz, - True, - ), - name=column, + col_values = a.convert( + c[start:stop], + nan_rep=self.nan_rep, + encoding=self.encoding, + errors=self.errors, ) + return Series(_set_tz(col_values[1], a.tz), name=column) - raise KeyError("column [{column}] not found in the table".format(column=column)) + raise KeyError(f"column [{column}] not found in the table") class WORMTable(Table): @@ -4045,7 +4088,13 @@ class WORMTable(Table): table_type = "worm" - def read(self, **kwargs): + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): """ read the indices and the indexing array, calculate offset rows and return """ raise NotImplementedError("WORMTable needs to implement read") @@ -4055,41 +4104,12 @@ def write(self, **kwargs): to): write out the indices and the values using _write_array (e.g. 
a CArray) create an indexing table so that we can search """ - raise NotImplementedError("WORKTable needs to implement write") - - -class LegacyTable(Table): - """ an appendable table: allow append/query/delete operations to a - (possibly) already existing appendable table this table ALLOWS - append (but doesn't require them), and stores the data in a format - that can be easily searched - - """ - - _indexables = [ - IndexCol(name="index", axis=1, pos=0), - IndexCol(name="column", axis=2, pos=1, index_kind="columns_kind"), - DataCol(name="fields", cname="values", kind_attr="fields", pos=2), - ] # type: Optional[List[IndexCol]] - table_type = "legacy" - ndim = 3 - - def write(self, **kwargs): - raise TypeError("write operations are not allowed on legacy tables!") - - def read(self, where=None, columns=None, **kwargs): - """we have n indexable columns, with an arbitrary number of data - axes - """ - - if not self.read_axes(where=where, **kwargs): - return None + raise NotImplementedError("WORMTable needs to implement write") -class AppendableTable(LegacyTable): +class AppendableTable(Table): """ support the new appendable table formats """ - _indexables = None table_type = "appendable" def write( @@ -4104,24 +4124,30 @@ def write( chunksize=None, expectedrows=None, dropna=False, - **kwargs + nan_rep=None, + data_columns=None, ): if not append and self.is_exists: self._handle.remove_node(self.group, "table") # create the axes - self.create_axes( - axes=axes, obj=obj, validate=append, min_itemsize=min_itemsize, **kwargs + table = self._create_axes( + axes=axes, + obj=obj, + validate=append, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + data_columns=data_columns, ) - for a in self.axes: - a.validate(self, append) + for a in table.axes: + a.validate_names() - if not self.is_exists: + if not table.is_exists: # create the table - options = self.create_description( + options = table.create_description( complib=complib, complevel=complevel, fletcher32=fletcher32, @@ -4129,25 +4155,22 @@ def write( ) # set the table attributes - self.set_attrs() + table.set_attrs() # create the table - self._handle.create_table(self.group, **options) - else: - pass - # table = self.table + table._handle.create_table(table.group, **options) # update my info - self.set_info() + table.attrs.info = table.info # validate the axes and set the kinds - for a in self.axes: - a.validate_and_set(self, append) + for a in table.axes: + a.validate_and_set(table, append) # add the rows - self.write_data(chunksize, dropna=dropna) + table.write_data(chunksize, dropna=dropna) - def write_data(self, chunksize, dropna=False): + def write_data(self, chunksize: Optional[int], dropna: bool = False): """ we form the data into a 2-d including indexes,values,mask write chunk-by-chunk """ @@ -4178,21 +4201,7 @@ def write_data(self, chunksize, dropna=False): # broadcast the indexes if needed indexes = [a.cvalues for a in self.index_axes] nindexes = len(indexes) - bindexes = [] - for i, idx in enumerate(indexes): - - # broadcast to all other indexes except myself - if i > 0 and i < nindexes: - repeater = np.prod([indexes[bi].shape[0] for bi in range(0, i)]) - idx = np.tile(idx, repeater) - - if i < nindexes - 1: - repeater = np.prod( - [indexes[bi].shape[0] for bi in range(i + 1, nindexes)] - ) - idx = np.repeat(idx, repeater) - - bindexes.append(idx) + assert nindexes == 1, nindexes # ensures we dont need to broadcast # transpose the values so first dimension is last # reshape the values if needed @@ -4217,12 +4226,18 @@ def 
write_data(self, chunksize, dropna=False): self.write_data_chunk( rows, - indexes=[a[start_i:end_i] for a in bindexes], + indexes=[a[start_i:end_i] for a in indexes], mask=mask[start_i:end_i] if mask is not None else None, values=[v[start_i:end_i] for v in bvalues], ) - def write_data_chunk(self, rows, indexes, mask, values): + def write_data_chunk( + self, + rows: np.ndarray, + indexes: List[np.ndarray], + mask: Optional[np.ndarray], + values: List[np.ndarray], + ): """ Parameters ---------- @@ -4237,40 +4252,33 @@ def write_data_chunk(self, rows, indexes, mask, values): if not np.prod(v.shape): return - try: - nrows = indexes[0].shape[0] - if nrows != len(rows): - rows = np.empty(nrows, dtype=self.dtype) - names = self.dtype.names - nindexes = len(indexes) - - # indexes - for i, idx in enumerate(indexes): - rows[names[i]] = idx + nrows = indexes[0].shape[0] + if nrows != len(rows): + rows = np.empty(nrows, dtype=self.dtype) + names = self.dtype.names + nindexes = len(indexes) - # values - for i, v in enumerate(values): - rows[names[i + nindexes]] = v + # indexes + for i, idx in enumerate(indexes): + rows[names[i]] = idx - # mask - if mask is not None: - m = ~mask.ravel().astype(bool, copy=False) - if not m.all(): - rows = rows[m] + # values + for i, v in enumerate(values): + rows[names[i + nindexes]] = v - except Exception as detail: - raise Exception("cannot create row-data -> {detail}".format(detail=detail)) + # mask + if mask is not None: + m = ~mask.ravel().astype(bool, copy=False) + if not m.all(): + rows = rows[m] - try: - if len(rows): - self.table.append(rows) - self.table.flush() - except Exception as detail: - raise TypeError( - "tables cannot write this data -> {detail}".format(detail=detail) - ) + if len(rows): + self.table.append(rows) + self.table.flush() - def delete(self, where=None, start=None, stop=None, **kwargs): + def delete( + self, where=None, start: Optional[int] = None, stop: Optional[int] = None, + ): # delete all rows (and return the nrows) if where is None or not len(where): @@ -4291,8 +4299,8 @@ def delete(self, where=None, start=None, stop=None, **kwargs): # create the selection table = self.table - self.selection = Selection(self, where, start=start, stop=stop, **kwargs) - values = self.selection.select_coords() + selection = Selection(self, where, start=start, stop=stop) + values = selection.select_coords() # delete the rows in reverse order sorted_series = Series(values).sort_values() @@ -4337,48 +4345,71 @@ class AppendableFrameTable(AppendableTable): pandas_kind = "frame_table" table_type = "appendable_frame" ndim = 2 - obj_type = DataFrame # type: Type[Union[DataFrame, Series]] + obj_type: Type[Union[DataFrame, Series]] = DataFrame @property - def is_transposed(self): + def is_transposed(self) -> bool: return self.index_axes[0].axis == 1 - def get_object(self, obj): + @classmethod + def get_object(cls, obj, transposed: bool): """ these are written transposed """ - if self.is_transposed: + if transposed: obj = obj.T return obj - def read(self, where=None, columns=None, **kwargs): + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): - if not self.read_axes(where=where, **kwargs): + # validate the version + self.validate_version(where) + + # infer the data kind + if not self.infer_axes(): return None + result = self._read_axes(where=where, start=start, stop=stop) + info = ( self.info.get(self.non_index_axes[0][0], dict()) if len(self.non_index_axes) else dict() ) - index = 
self.index_axes[0].values + + inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]] + assert len(inds) == 1 + ind = inds[0] + + index = result[ind][0] + frames = [] - for a in self.values_axes: + for i, a in enumerate(self.axes): + if a not in self.values_axes: + continue + index_vals, cvalues = result[i] # we could have a multi-index constructor here # ensure_index doesn't recognized our list-of-tuples here if info.get("type") == "MultiIndex": - cols = MultiIndex.from_tuples(a.values) + cols = MultiIndex.from_tuples(index_vals) else: - cols = Index(a.values) + cols = Index(index_vals) + names = info.get("names") if names is not None: cols.set_names(names, inplace=True) if self.is_transposed: - values = a.cvalues + values = cvalues index_ = cols cols_ = Index(index, name=getattr(index, "name", None)) else: - values = a.cvalues.T + values = cvalues.T index_ = Index(index, name=getattr(index, "name", None)) cols_ = cols @@ -4386,17 +4417,24 @@ def read(self, where=None, columns=None, **kwargs): if values.ndim == 1 and isinstance(values, np.ndarray): values = values.reshape((1, values.shape[0])) - block = make_block(values, placement=np.arange(len(cols_)), ndim=2) - mgr = BlockManager([block], [cols_, index_]) - frames.append(DataFrame(mgr)) + if isinstance(values, np.ndarray): + df = DataFrame(values.T, columns=cols_, index=index_) + elif isinstance(values, Index): + df = DataFrame(values, columns=cols_, index=index_) + else: + # Categorical + df = DataFrame([values], columns=cols_, index=index_) + assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) + frames.append(df) if len(frames) == 1: df = frames[0] else: df = concat(frames, axis=1) + selection = Selection(self, where=where, start=start, stop=stop) # apply the selection filters & axis orderings - df = self.process_axes(df, columns=columns) + df = self.process_axes(df, selection=selection, columns=columns) return df @@ -4408,31 +4446,37 @@ class AppendableSeriesTable(AppendableFrameTable): table_type = "appendable_series" ndim = 2 obj_type = Series - storage_obj_type = DataFrame @property - def is_transposed(self): + def is_transposed(self) -> bool: return False - def get_object(self, obj): + @classmethod + def get_object(cls, obj, transposed: bool): return obj def write(self, obj, data_columns=None, **kwargs): """ we are going to write this as a frame table """ if not isinstance(obj, DataFrame): name = obj.name or "values" - obj = DataFrame({name: obj}, index=obj.index) - obj.columns = [name] + obj = obj.to_frame(name) return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) - def read(self, columns=None, **kwargs): + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ) -> Series: is_multi_index = self.is_multi_index if columns is not None and is_multi_index: + assert isinstance(self.levels, list) # needed for mypy for n in self.levels: if n not in columns: columns.insert(0, n) - s = super().read(columns=columns, **kwargs) + s = super().read(where=where, columns=columns, start=start, stop=stop) if is_multi_index: s.set_index(self.levels, inplace=True) @@ -4469,7 +4513,7 @@ class GenericTable(AppendableFrameTable): obj_type = DataFrame @property - def pandas_type(self): + def pandas_type(self) -> str: return self.pandas_kind @property @@ -4482,30 +4526,44 @@ def get_attrs(self): self.nan_rep = None self.levels = [] - self.index_axes = [a.infer(self) for a in self.indexables if a.is_an_indexable] - self.values_axes = [ - 
a.infer(self) for a in self.indexables if not a.is_an_indexable - ] + self.index_axes = [a for a in self.indexables if a.is_an_indexable] + self.values_axes = [a for a in self.indexables if not a.is_an_indexable] self.data_columns = [a.name for a in self.values_axes] - @property + @cache_readonly def indexables(self): """ create the indexables from the table description """ - if self._indexables is None: - - d = self.description - - # the index columns is just a simple index - self._indexables = [GenericIndexCol(name="index", axis=0)] - - for i, n in enumerate(d._v_names): + d = self.description + + # TODO: can we get a typ for this? AFAICT it is the only place + # where we aren't passing one + # the index columns is just a simple index + md = self.read_metadata("index") + meta = "category" if md is not None else None + index_col = GenericIndexCol( + name="index", axis=0, table=self.table, meta=meta, metadata=md + ) - dc = GenericDataIndexableCol( - name=n, pos=i, values=[n], version=self.version - ) - self._indexables.append(dc) + _indexables = [index_col] + + for i, n in enumerate(d._v_names): + assert isinstance(n, str) + + atom = getattr(d, n) + md = self.read_metadata(n) + meta = "category" if md is not None else None + dc = GenericDataIndexableCol( + name=n, + pos=i, + values=[n], + typ=atom, + table=self.table, + meta=meta, + metadata=md, + ) + _indexables.append(dc) - return self._indexables + return _indexables def write(self, **kwargs): raise NotImplementedError("cannot write on an generic table") @@ -4520,7 +4578,7 @@ class AppendableMultiFrameTable(AppendableFrameTable): _re_levels = re.compile(r"^level_\d+$") @property - def table_type_short(self): + def table_type_short(self) -> str: return "appendable_multi" def write(self, obj, data_columns=None, **kwargs): @@ -4534,9 +4592,15 @@ def write(self, obj, data_columns=None, **kwargs): data_columns.insert(0, n) return super().write(obj=obj, data_columns=data_columns, **kwargs) - def read(self, **kwargs): + def read( + self, + where=None, + columns=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): - df = super().read(**kwargs) + df = super().read(where=where, columns=columns, start=start, stop=stop) df = df.set_index(self.levels) # remove names for 'level_%d' @@ -4547,7 +4611,7 @@ def read(self, **kwargs): return df -def _reindex_axis(obj, axis, labels, other=None): +def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame: ax = obj._get_axis(axis) labels = ensure_index(labels) @@ -4562,182 +4626,122 @@ def _reindex_axis(obj, axis, labels, other=None): if other is not None: labels = ensure_index(other.unique()).intersection(labels, sort=False) if not labels.equals(ax): - slicer = [slice(None, None)] * obj.ndim + slicer: List[Union[slice, Index]] = [slice(None, None)] * obj.ndim slicer[axis] = labels obj = obj.loc[tuple(slicer)] return obj -def _get_info(info, name): - """ get/create the info for this name """ - try: - idx = info[name] - except KeyError: - idx = info[name] = dict() - return idx - - # tz to/from coercion -def _get_tz(tz): +def _get_tz(tz: tzinfo) -> Union[str, tzinfo]: """ for a tz-aware type, return an encoded zone """ zone = timezones.get_timezone(tz) - if zone is None: - zone = tz.utcoffset().total_seconds() return zone -def _set_tz(values, tz, preserve_UTC=False, coerce=False): +def _set_tz( + values: Union[np.ndarray, Index], + tz: Optional[Union[str, tzinfo]], + coerce: bool = False, +) -> Union[np.ndarray, DatetimeIndex]: """ coerce the values to a 
DatetimeIndex if tz is set preserve the input shape if possible Parameters ---------- - values : ndarray - tz : string/pickled tz object - preserve_UTC : boolean, - preserve the UTC of the result + values : ndarray or Index + tz : str or tzinfo coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray """ + if isinstance(values, DatetimeIndex): + # If values is tzaware, the tz gets dropped in the values.ravel() + # call below (which returns an ndarray). So we are only non-lossy + # if `tz` matches `values.tz`. + assert values.tz is None or values.tz == tz + if tz is not None: name = getattr(values, "name", None) values = values.ravel() tz = timezones.get_timezone(_ensure_decoded(tz)) values = DatetimeIndex(values, name=name) - if values.tz is None: - values = values.tz_localize("UTC").tz_convert(tz) - if preserve_UTC: - if tz == "UTC": - values = list(values) + values = values.tz_localize("UTC").tz_convert(tz) elif coerce: values = np.asarray(values, dtype="M8[ns]") return values -def _convert_index(index, encoding=None, errors="strict", format_type=None): - index_name = getattr(index, "name", None) +def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol: + assert isinstance(name, str) - if isinstance(index, DatetimeIndex): - converted = index.asi8 + index_name = index.name + converted, dtype_name = _get_data_and_dtype_name(index) + kind = _dtype_to_kind(dtype_name) + atom = DataIndexableCol._get_atom(converted) + + if isinstance(index, Int64Index): + # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, + # in which case "kind" is "integer", "integer", "datetime64", + # "timedelta64", and "integer", respectively. return IndexCol( - converted, - "datetime64", - _tables().Int64Col(), + name, + values=converted, + kind=kind, + typ=atom, freq=getattr(index, "freq", None), tz=getattr(index, "tz", None), index_name=index_name, ) - elif isinstance(index, TimedeltaIndex): - converted = index.asi8 - return IndexCol( - converted, - "timedelta64", - _tables().Int64Col(), - freq=getattr(index, "freq", None), - index_name=index_name, - ) - elif isinstance(index, (Int64Index, PeriodIndex)): - atom = _tables().Int64Col() - # avoid to store ndarray of Period objects - return IndexCol( - index._ndarray_values, - "integer", - atom, - freq=getattr(index, "freq", None), - index_name=index_name, - ) if isinstance(index, MultiIndex): raise TypeError("MultiIndex not supported here!") inferred_type = lib.infer_dtype(index, skipna=False) + # we wont get inferred_type of "datetime64" or "timedelta64" as these + # would go through the DatetimeIndex/TimedeltaIndex paths above values = np.asarray(index) - if inferred_type == "datetime64": - converted = values.view("i8") - return IndexCol( - converted, - "datetime64", - _tables().Int64Col(), - freq=getattr(index, "freq", None), - tz=getattr(index, "tz", None), - index_name=index_name, - ) - elif inferred_type == "timedelta64": - converted = values.view("i8") - return IndexCol( - converted, - "timedelta64", - _tables().Int64Col(), - freq=getattr(index, "freq", None), - index_name=index_name, - ) - elif inferred_type == "datetime": - converted = np.asarray( - [(time.mktime(v.timetuple()) + v.microsecond / 1e6) for v in values], - dtype=np.float64, - ) + if inferred_type == "date": + converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) return IndexCol( - converted, "datetime", _tables().Time64Col(), index_name=index_name + name, converted, "date", _tables().Time32Col(), 
index_name=index_name, ) - elif inferred_type == "date": - converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) - return IndexCol(converted, "date", _tables().Time32Col(), index_name=index_name) elif inferred_type == "string": - # atom = _tables().ObjectAtom() - # return np.asarray(values, dtype='O'), 'object', atom converted = _convert_string_array(values, encoding, errors) itemsize = converted.dtype.itemsize return IndexCol( + name, converted, "string", _tables().StringCol(itemsize), - itemsize=itemsize, index_name=index_name, ) - elif inferred_type == "unicode": - if format_type == "fixed": - atom = _tables().ObjectAtom() - return IndexCol( - np.asarray(values, dtype="O"), "object", atom, index_name=index_name - ) - raise TypeError( - "[unicode] is not supported as a in index type for [{0}] formats".format( - format_type - ) - ) - elif inferred_type == "integer": - # take a guess for now, hope the values fit - atom = _tables().Int64Col() - return IndexCol( - np.asarray(values, dtype=np.int64), "integer", atom, index_name=index_name - ) - elif inferred_type == "floating": - atom = _tables().Float64Col() + elif inferred_type in ["integer", "floating"]: return IndexCol( - np.asarray(values, dtype=np.float64), "float", atom, index_name=index_name + name, values=converted, kind=kind, typ=atom, index_name=index_name, ) - else: # pragma: no cover + else: + assert isinstance(converted, np.ndarray) and converted.dtype == object + assert kind == "object", kind atom = _tables().ObjectAtom() - return IndexCol( - np.asarray(values, dtype="O"), "object", atom, index_name=index_name - ) + return IndexCol(name, converted, kind, atom, index_name=index_name,) -def _unconvert_index(data, kind, encoding=None, errors="strict"): - kind = _ensure_decoded(kind) +def _unconvert_index( + data, kind: str, encoding: str, errors: str +) -> Union[np.ndarray, Index]: + index: Union[Index, np.ndarray] + if kind == "datetime64": index = DatetimeIndex(data) elif kind == "timedelta64": index = TimedeltaIndex(data) - elif kind == "datetime": - index = np.asarray([datetime.fromtimestamp(v) for v in data], dtype=object) elif kind == "date": try: index = np.asarray([date.fromordinal(v) for v in data], dtype=object) @@ -4752,83 +4756,132 @@ def _unconvert_index(data, kind, encoding=None, errors="strict"): elif kind == "object": index = np.asarray(data[0]) else: # pragma: no cover - raise ValueError("unrecognized index type {kind}".format(kind=kind)) + raise ValueError(f"unrecognized index type {kind}") return index -def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, errors="strict"): - kind = _ensure_decoded(kind) - if kind == "datetime": - index = to_datetime(data) - elif kind in ("integer"): - index = np.asarray(data, dtype=object) - elif kind in ("string"): - index = _unconvert_string_array( - data, nan_rep=None, encoding=encoding, errors=errors +def _maybe_convert_for_string_atom( + name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors +): + + if not block.is_object: + return block.values + + dtype_name = block.dtype.name + inferred_type = lib.infer_dtype(block.values, skipna=False) + + if inferred_type == "date": + raise TypeError("[date] is not implemented as a table column") + elif inferred_type == "datetime": + # after GH#8260 + # this only would be hit for a multi-timezone dtype which is an error + raise TypeError( + "too many timezones in this block, create separate data columns" ) - else: # pragma: no cover - raise ValueError("unrecognized index type 
{kind}".format(kind=kind)) - return index + elif not (inferred_type == "string" or dtype_name == "object"): + return block.values + + block = block.fillna(nan_rep, downcast=False) + if isinstance(block, list): + # Note: because block is always object dtype, fillna goes + # through a path such that the result is always a 1-element list + block = block[0] + data = block.values + + # see if we have a valid string type + inferred_type = lib.infer_dtype(data.ravel(), skipna=False) + if inferred_type != "string": + + # we cannot serialize this data, so report an exception on a column + # by column basis + for i in range(len(block.shape[0])): + + col = block.iget(i) + inferred_type = lib.infer_dtype(col.ravel(), skipna=False) + if inferred_type != "string": + iloc = block.mgr_locs.indexer[i] + raise TypeError( + f"Cannot serialize the column [{iloc}] because\n" + f"its data contents are [{inferred_type}] object dtype" + ) + + # itemsize is the maximum length of a string (along any dimension) + data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape) + assert data_converted.shape == block.shape, (data_converted.shape, block.shape) + itemsize = data_converted.itemsize -def _convert_string_array(data, encoding, errors, itemsize=None): + # specified min_itemsize? + if isinstance(min_itemsize, dict): + min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0) + itemsize = max(min_itemsize or 0, itemsize) + + # check for column in the values conflicts + if existing_col is not None: + eci = existing_col.validate_col(itemsize) + if eci > itemsize: + itemsize = eci + + data_converted = data_converted.astype(f"|S{itemsize}", copy=False) + return data_converted + + +def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray: """ - we take a string-like that is object dtype and coerce to a fixed size - string type + Take a string-like that is object dtype and coerce to a fixed size string type. Parameters ---------- - data : a numpy array of object dtype - encoding : None or string-encoding - errors : handler for encoding errors - itemsize : integer, optional, defaults to the max length of the strings + data : np.ndarray[object] + encoding : str + errors : str + Handler for encoding errors. Returns ------- - data in a fixed-length string dtype, encoded to bytes if needed + np.ndarray[fixed-length-string] """ # encode if needed - if encoding is not None and len(data): + if len(data): data = ( Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape) ) # create the sized dtype - if itemsize is None: - ensured = ensure_object(data.ravel()) - itemsize = max(1, libwriters.max_len_string_array(ensured)) + ensured = ensure_object(data.ravel()) + itemsize = max(1, libwriters.max_len_string_array(ensured)) - data = np.asarray(data, dtype="S{size}".format(size=itemsize)) + data = np.asarray(data, dtype=f"S{itemsize}") return data -def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): +def _unconvert_string_array( + data: np.ndarray, nan_rep, encoding: str, errors: str +) -> np.ndarray: """ - inverse of _convert_string_array + Inverse of _convert_string_array. 
Parameters ---------- - data : fixed length string dtyped array - nan_rep : the storage repr of NaN, optional - encoding : the encoding of the data, optional - errors : handler for encoding errors, default 'strict' + data : np.ndarray[fixed-length-string] + nan_rep : the storage repr of NaN + encoding : str + errors : str + Handler for encoding errors. Returns ------- - an object array of the decoded data - + np.ndarray[object] + Decoded data. """ shape = data.shape data = np.asarray(data.ravel(), dtype=object) - # guard against a None encoding (because of a legacy - # where the passed encoding is actually None) - encoding = _ensure_encoding(encoding) - if encoding is not None and len(data): + if len(data): itemsize = libwriters.max_len_string_array(ensure_object(data)) - dtype = "U{0}".format(itemsize) + dtype = f"U{itemsize}" if isinstance(data[0], bytes): data = Series(data).str.decode(encoding, errors=errors).values @@ -4842,33 +4895,111 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): return data.reshape(shape) -def _maybe_convert(values, val_kind, encoding, errors): +def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str): + assert isinstance(val_kind, str), type(val_kind) if _need_convert(val_kind): conv = _get_converter(val_kind, encoding, errors) - # conv = np.frompyfunc(conv, 1, 1) values = conv(values) return values -def _get_converter(kind, encoding, errors): - kind = _ensure_decoded(kind) +def _get_converter(kind: str, encoding: str, errors: str): if kind == "datetime64": return lambda x: np.asarray(x, dtype="M8[ns]") - elif kind == "datetime": - return lambda x: to_datetime(x, cache=True).to_pydatetime() elif kind == "string": - return lambda x: _unconvert_string_array(x, encoding=encoding, errors=errors) + return lambda x: _unconvert_string_array( + x, nan_rep=None, encoding=encoding, errors=errors + ) else: # pragma: no cover - raise ValueError("invalid kind {kind}".format(kind=kind)) + raise ValueError(f"invalid kind {kind}") -def _need_convert(kind): - kind = _ensure_decoded(kind) - if kind in ("datetime", "datetime64", "string"): +def _need_convert(kind: str) -> bool: + if kind in ("datetime64", "string"): return True return False +def _maybe_adjust_name(name: str, version) -> str: + """ + Prior to 0.10.1, we named values blocks like: values_block_0 an the + name values_0, adjust the given name if necessary. + + Parameters + ---------- + name : str + version : Tuple[int, int, int] + + Returns + ------- + str + """ + try: + if version[0] == 0 and version[1] <= 10 and version[2] == 0: + m = re.search(r"values_block_(\d+)", name) + if m: + grp = m.groups()[0] + name = f"values_{grp}" + except IndexError: + pass + return name + + +def _dtype_to_kind(dtype_str: str) -> str: + """ + Find the "kind" string describing the given dtype name. 
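+
+ Examples
+ --------
+ >>> _dtype_to_kind("datetime64[ns, UTC]")
+ 'datetime64'
+ >>> _dtype_to_kind("int64")
+ 'integer'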
+ """ + dtype_str = _ensure_decoded(dtype_str) + + if dtype_str.startswith("string") or dtype_str.startswith("bytes"): + kind = "string" + elif dtype_str.startswith("float"): + kind = "float" + elif dtype_str.startswith("complex"): + kind = "complex" + elif dtype_str.startswith("int") or dtype_str.startswith("uint"): + kind = "integer" + elif dtype_str.startswith("datetime64"): + kind = "datetime64" + elif dtype_str.startswith("timedelta"): + kind = "timedelta64" + elif dtype_str.startswith("bool"): + kind = "bool" + elif dtype_str.startswith("category"): + kind = "category" + elif dtype_str.startswith("period"): + # We store the `freq` attr so we can restore from integers + kind = "integer" + elif dtype_str == "object": + kind = "object" + else: + raise ValueError(f"cannot interpret dtype of [{dtype_str}]") + + return kind + + +def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): + """ + Convert the passed data into a storable form and a dtype string. + """ + if is_categorical_dtype(data.dtype): + data = data.codes + + # For datetime64tz we need to drop the TZ in tests TODO: why? + dtype_name = data.dtype.name.split("[")[0] + + if data.dtype.kind in ["m", "M"]: + data = np.asarray(data.view("i8")) + # TODO: we used to reshape for the dt64tz case, but no longer + # doing that doesn't seem to break anything. why? + + elif isinstance(data, PeriodIndex): + data = data.asi8 + + data = np.asarray(data) + return data, dtype_name + + class Selection: """ Carries out a selection operation on a tables.Table object. @@ -4881,7 +5012,13 @@ class Selection: """ - def __init__(self, table, where=None, start=None, stop=None): + def __init__( + self, + table: Table, + where=None, + start: Optional[int] = None, + stop: Optional[int] = None, + ): self.table = table self.where = where self.start = start @@ -4932,20 +5069,19 @@ def generate(self, where): q = self.table.queryables() try: - return Expr(where, queryables=q, encoding=self.table.encoding) + return PyTablesExpr(where, queryables=q, encoding=self.table.encoding) except NameError: # raise a nice message, suggesting that the user should use # data_columns + qkeys = ",".join(q.keys()) raise ValueError( - "The passed where expression: {0}\n" + f"The passed where expression: {where}\n" " contains an invalid variable reference\n" " all of the variable references must be a " "reference to\n" " an axis (e.g. 'index' or 'columns'), or a " "data_column\n" - " The currently defined references are: {1}\n".format( - where, ",".join(q.keys()) - ) + f" The currently defined references are: {qkeys}\n" ) def select(self): diff --git a/pandas/io/s3.py b/pandas/io/s3.py index 7e0a37e8cba20..976c319f89d47 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -2,9 +2,8 @@ from typing import IO, Any, Optional, Tuple from urllib.parse import urlparse as parse_url -from pandas.compat._optional import import_optional_dependency - from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency s3fs = import_optional_dependency( "s3fs", extra="The s3fs package is required to handle s3 files." 
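For context on the "invalid variable reference" ValueError raised in Selection.generate above, a minimal sketch of the rule it enforces: a where expression may only reference the index or columns passed through data_columns. The file name and column names below are made up for illustration.

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": np.arange(5), "B": list("abcde")})

with pd.HDFStore("demo.h5", mode="w") as store:
    # Only "A" (plus the index) is queryable, because only "A" is listed
    # in data_columns when the table is appended.
    store.append("df", df, data_columns=["A"])
    subset = store.select("df", where="A > 2")   # works
    # store.select("df", where="B == 'c'")       # raises the ValueError above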
diff --git a/pandas/io/sas/__init__.py b/pandas/io/sas/__init__.py index fa6b29a1a3fcc..8f81352e6aecb 100644 --- a/pandas/io/sas/__init__.py +++ b/pandas/io/sas/__init__.py @@ -1 +1 @@ -from .sasreader import read_sas # noqa +from pandas.io.sas.sasreader import read_sas # noqa diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 6378198225516..bb5bce96bc64b 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -105,13 +105,11 @@ cdef const uint8_t[:] rle_decompress(int result_length, result[rpos] = 0x00 rpos += 1 else: - raise ValueError("unknown control byte: {byte}" - .format(byte=control_byte)) + raise ValueError(f"unknown control byte: {control_byte}") # In py37 cython/clang sees `len(outbuff)` as size_t and not Py_ssize_t if len(result) != result_length: - raise ValueError("RLE: {got} != {expect}".format(got=len(result), - expect=result_length)) + raise ValueError(f"RLE: {len(result)} != {result_length}") return np.asarray(result) @@ -194,8 +192,7 @@ cdef const uint8_t[:] rdc_decompress(int result_length, # In py37 cython/clang sees `len(outbuff)` as size_t and not Py_ssize_t if len(outbuff) != result_length: - raise ValueError("RDC: {got} != {expect}\n" - .format(got=len(outbuff), expect=result_length)) + raise ValueError(f"RDC: {len(outbuff)} != {result_length}\n") return np.asarray(outbuff) @@ -271,8 +268,7 @@ cdef class Parser: self.column_types[j] = column_type_string else: raise ValueError("unknown column type: " - "{typ}" - .format(typ=self.parser.columns[j].ctype)) + f"{self.parser.columns[j].ctype}") # compression if parser.compression == const.rle_compression: @@ -392,8 +388,7 @@ cdef class Parser: return True return False else: - raise ValueError("unknown page type: {typ}" - .format(typ=self.current_page_type)) + raise ValueError(f"unknown page type: {self.current_page_type}") cdef void process_byte_array_with_data(self, int offset, int length): diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index eb57d703cd4d5..f917477b81489 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -13,6 +13,7 @@ Reference for binary data compression: http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm """ +from collections import abc from datetime import datetime import struct @@ -22,7 +23,7 @@ import pandas as pd -from pandas.io.common import BaseIterator, get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer from pandas.io.sas._sas import Parser import pandas.io.sas.sas_constants as const @@ -36,7 +37,7 @@ class _column: # SAS7BDAT represents a SAS data file in SAS7BDAT format. -class SAS7BDATReader(BaseIterator): +class SAS7BDATReader(abc.Iterator): """ Read SAS files in SAS7BDAT format. @@ -169,7 +170,7 @@ def _get_properties(self): if buf in const.encoding_names: self.file_encoding = const.encoding_names[buf] else: - self.file_encoding = "unknown (code={name!s})".format(name=buf) + self.file_encoding = f"unknown (code={buf})" # Get platform information buf = self._read_bytes(const.platform_offset, const.platform_length) @@ -293,8 +294,8 @@ def _read_bytes(self, offset, length): buf = self._path_or_buf.read(length) if len(buf) < length: self.close() - msg = "Unable to read {:d} bytes from file position {:d}." - raise ValueError(msg.format(length, offset)) + msg = f"Unable to read {length:d} bytes from file position {offset:d}." 
+ raise ValueError(msg) return buf else: if offset + length > len(self._cached_page): @@ -457,12 +458,9 @@ def _process_columnsize_subheader(self, offset, length): self.column_count = self._read_int(offset, int_len) if self.col_count_p1 + self.col_count_p2 != self.column_count: print( - "Warning: column count mismatch ({p1} + {p2} != " - "{column_count})\n".format( - p1=self.col_count_p1, - p2=self.col_count_p2, - column_count=self.column_count, - ) + f"Warning: column count mismatch ({self.col_count_p1} + " + f"{self.col_count_p2} != " + f"{self.column_count})\n" ) # Unknown purpose @@ -672,8 +670,12 @@ def _read_next_page(self): return True elif len(self._cached_page) != self._page_length: self.close() - msg = "failed to read complete page from file (read {:d} of {:d} bytes)" - raise ValueError(msg.format(len(self._cached_page), self._page_length)) + msg = ( + "failed to read complete page from file (read " + f"{len(self._cached_page):d} of " + f"{self._page_length:d} bytes)" + ) + raise ValueError(msg) self._read_page_header() page_type = self._current_page_type @@ -725,8 +727,6 @@ def _chunk_to_dataframe(self): js += 1 else: self.close() - raise ValueError( - "unknown column type {type}".format(type=self._column_types[j]) - ) + raise ValueError(f"unknown column type {self._column_types[j]}") return rslt diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index ea26a9b8efdbf..3cf7fd885e564 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -7,7 +7,7 @@ https://support.sas.com/techsup/technote/ts140.pdf """ - +from collections import abc from datetime import datetime from io import BytesIO import struct @@ -19,7 +19,7 @@ import pandas as pd -from pandas.io.common import BaseIterator, get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer _correct_line1 = ( "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" @@ -143,7 +143,7 @@ """ -def _parse_date(datestr): +def _parse_date(datestr: str) -> datetime: """ Given a date in xport format, return Python date. """ try: # e.g. "16FEB11:10:07:55" @@ -152,11 +152,11 @@ def _parse_date(datestr): return pd.NaT -def _split_line(s, parts): +def _split_line(s: str, parts): """ Parameters ---------- - s: string + s: str Fixed-length string to split parts: list of (name, length) pairs Used to break up string, name '_' will be filtered from output. @@ -251,7 +251,7 @@ def _parse_float_vec(vec): return ieee -class XportReader(BaseIterator): +class XportReader(abc.Iterator): __doc__ = _xport_reader_doc def __init__( @@ -367,8 +367,8 @@ def _read_header(self): fl = field["field_length"] if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)): self.close() - msg = "Floating field width {0} is not between 2 and 8." - raise TypeError(msg.format(fl)) + msg = f"Floating field width {fl} is not between 2 and 8." + raise TypeError(msg) for k, v in field.items(): try: @@ -402,7 +402,7 @@ def _read_header(self): def __next__(self): return self.read(nrows=self._chunksize or 1) - def _record_count(self): + def _record_count(self) -> int: """ Get number of records in file. 
@@ -482,7 +482,7 @@ def read(self, nrows=None): df = pd.DataFrame(index=range(read_lines)) for j, x in enumerate(self.columns): - vec = data["s%d" % j] + vec = data["s" + str(j)] ntype = self.fields[j]["ntype"] if ntype == "numeric": vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"]) diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 6bd3532d538c7..56ebb583bc2f9 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -1,7 +1,7 @@ """ Read SAS sas7bdat or xport files. """ -from pandas.io.common import _stringify_path +from pandas.io.common import stringify_path def read_sas( @@ -52,7 +52,7 @@ def read_sas( "than a string name, you must specify " "a format string" ) - filepath_or_buffer = _stringify_path(filepath_or_buffer) + filepath_or_buffer = stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): raise ValueError(buffer_error_msg) fname = filepath_or_buffer.lower() diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 4f13349a819c3..cdbe14e9fe927 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -3,7 +3,8 @@ from pandas.compat._optional import import_optional_dependency -from pandas.api.types import is_list_like +from pandas.core.dtypes.inference import is_list_like + from pandas.core.api import DataFrame @@ -20,7 +21,7 @@ def read_spss( Parameters ---------- path : string or Path - File path + File path. usecols : list-like, optional Return a subset of the columns. If None, return all columns. convert_categoricals : bool, default is True diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 820aeaeb11649..f4527994db0d2 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -241,7 +241,7 @@ def read_sql_table( try: meta.reflect(only=[table_name], views=True) except sqlalchemy.exc.InvalidRequestError: - raise ValueError("Table {name} not found".format(name=table_name)) + raise ValueError(f"Table {table_name} not found") pandas_sql = SQLDatabase(con, meta=meta) table = pandas_sql.read_table( @@ -256,7 +256,7 @@ def read_sql_table( if table is not None: return table else: - raise ValueError("Table {name} not found".format(name=table_name), con) + raise ValueError(f"Table {table_name} not found", con) def read_sql_query( @@ -277,14 +277,14 @@ def read_sql_query( Parameters ---------- - sql : string SQL query or SQLAlchemy Selectable (select or text object) + sql : str SQL query or SQLAlchemy Selectable (select or text object) SQL query to be executed. - con : SQLAlchemy connectable(engine/connection), database string URI, + con : SQLAlchemy connectable(engine/connection), database str URI, or sqlite3 DBAPI2 connection Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - index_col : string or list of strings, optional, default: None + index_col : str or list of strings, optional, default: None Column(s) to set as index(MultiIndex). coerce_float : bool, default True Attempts to convert values of non-string, non-numeric objects (like @@ -294,7 +294,7 @@ def read_sql_query( to pass parameters is database driver dependent. Check your database driver documentation for which of the five syntax styles, described in PEP 249's paramstyle, is supported. - Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'} + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}. parse_dates : list or dict, default: None - List of column names to parse as dates. 
- Dict of ``{column_name: format string}`` where format string is @@ -355,16 +355,18 @@ def read_sql( Parameters ---------- - sql : string or SQLAlchemy Selectable (select or text object) + sql : str or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. - con : SQLAlchemy connectable (engine/connection) or database string URI - or DBAPI2 connection (fallback mode) + con : SQLAlchemy connectable (engine/connection) or database str URI + or DBAPI2 connection (fallback mode)' Using SQLAlchemy makes it possible to use any DB supported by that - library. If a DBAPI2 object, only sqlite3 is supported. - index_col : string or list of strings, optional, default: None + library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible + for engine disposal and connection closure for the SQLAlchemy connectable. See + `here `_ + index_col : str or list of strings, optional, default: None Column(s) to set as index(MultiIndex). - coerce_float : boolean, default True + coerce_float : bool, default True Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets. params : list, tuple or dict, optional, default: None @@ -372,7 +374,7 @@ def read_sql( to pass parameters is database driver dependent. Check your database driver documentation for which of the five syntax styles, described in PEP 249's paramstyle, is supported. - Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'} + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}. parse_dates : list or dict, default: None - List of column names to parse as dates. - Dict of ``{column_name: format string}`` where format string is @@ -496,7 +498,7 @@ def to_sql( .. versionadded:: 0.24.0 """ if if_exists not in ("fail", "replace", "append"): - raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) + raise ValueError(f"'{if_exists}' is not valid for if_exists") pandas_sql = pandasSQL_builder(con, schema=schema) @@ -623,7 +625,7 @@ def __init__( self.table = self.pd_sql.get_table(self.name, self.schema) if self.table is None: - raise ValueError("Could not init table '{name}'".format(name=name)) + raise ValueError(f"Could not init table '{name}'") def exists(self): return self.pd_sql.has_table(self.name, self.schema) @@ -641,18 +643,14 @@ def _execute_create(self): def create(self): if self.exists(): if self.if_exists == "fail": - raise ValueError( - "Table '{name}' already exists.".format(name=self.name) - ) + raise ValueError(f"Table '{self.name}' already exists.") elif self.if_exists == "replace": self.pd_sql.drop_table(self.name, self.schema) self._execute_create() elif self.if_exists == "append": pass else: - raise ValueError( - "'{0}' is not valid for if_exists".format(self.if_exists) - ) + raise ValueError(f"'{self.if_exists}' is not valid for if_exists") else: self._execute_create() @@ -687,7 +685,7 @@ def insert_data(self): try: temp.reset_index(inplace=True) except ValueError as err: - raise ValueError("duplicate name in index/columns: {0}".format(err)) + raise ValueError(f"duplicate name in index/columns: {err}") else: temp = self.frame @@ -730,7 +728,7 @@ def insert(self, chunksize=None, method=None): elif callable(method): exec_insert = partial(method, self) else: - raise ValueError("Invalid parameter `method`: {}".format(method)) + raise ValueError(f"Invalid parameter `method`: {method}") keys, data_list = self.insert_data() @@ -784,7 +782,8 @@ def read(self, coerce_float=True, 
parse_dates=None, columns=None, chunksize=None cols = [self.table.c[n] for n in columns] if self.index is not None: - [cols.insert(0, self.table.c[idx]) for idx in self.index[::-1]] + for idx in self.index[::-1]: + cols.insert(0, self.table.c[idx]) sql_select = select(cols) else: sql_select = self.table.select() @@ -824,7 +823,7 @@ def _index_name(self, index, index_label): if len(index_label) != nlevels: raise ValueError( "Length of 'index_label' should match number of " - "levels, which is {0}".format(nlevels) + f"levels, which is {nlevels}" ) else: return index_label @@ -837,7 +836,7 @@ def _index_name(self, index, index_label): return ["index"] else: return [ - l if l is not None else "level_{0}".format(i) + l if l is not None else f"level_{i}" for i, l in enumerate(self.frame.index.names) ] @@ -1302,10 +1301,7 @@ def to_sql( for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): - raise ValueError( - "The type of {column} is not a " - "SQLAlchemy type ".format(column=col) - ) + raise ValueError(f"The type of {col} is not a SQLAlchemy type") table = SQLTable( name, @@ -1329,11 +1325,11 @@ def to_sql( ) if name not in table_names: msg = ( - "The provided table name '{0}' is not found exactly as " + f"The provided table name '{name}' is not found exactly as " "such in the database after writing the table, possibly " "due to case sensitivity issues. Consider using lower " "case table names." - ).format(name) + ) warnings.warn(msg, UserWarning) @property @@ -1393,14 +1389,12 @@ def _get_unicode_name(name): try: uname = str(name).encode("utf-8", "strict").decode("utf-8") except UnicodeError: - raise ValueError( - "Cannot convert identifier to UTF-8: '{name}'".format(name=name) - ) + raise ValueError(f"Cannot convert identifier to UTF-8: '{name}'") return uname def _get_valid_sqlite_name(name): - # See http://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ + # See https://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ # -for-sqlite-table-column-names-in-python # Ensure the string can be encoded as UTF-8. # Ensure the string does not include any NUL characters. 
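Both the hunk above (building `cols`) and the one below (building `names`) replace a list comprehension that was used only for its side effects with an explicit loop; the comprehension allocated and discarded a list of None results. A minimal standalone sketch of the same refactor, with illustrative variable names only:

    cols = ["value"]
    index_cols = ["idx0", "idx1"]
    # Explicit loop instead of `[cols.insert(0, c) for c in index_cols[::-1]]`,
    # which would build a throwaway list of None values.
    for name in index_cols[::-1]:
        cols.insert(0, name)
    assert cols == ["idx0", "idx1", "value"]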
@@ -1454,13 +1448,14 @@ def insert_statement(self): escape = _get_valid_sqlite_name if self.index is not None: - [names.insert(0, idx) for idx in self.index[::-1]] + for idx in self.index[::-1]: + names.insert(0, idx) bracketed_names = [escape(column) for column in names] col_names = ",".join(bracketed_names) wildcards = ",".join([wld] * len(names)) - insert_statement = "INSERT INTO {table} ({columns}) VALUES ({wld})".format( - table=escape(self.name), columns=col_names, wld=wildcards + insert_statement = ( + f"INSERT INTO {escape(self.name)} ({col_names}) VALUES ({wildcards})" ) return insert_statement @@ -1494,9 +1489,7 @@ def _create_table_setup(self): keys = self.keys cnames_br = ", ".join(escape(c) for c in keys) create_tbl_stmts.append( - "CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format( - tbl=self.name, cnames_br=cnames_br - ) + f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})" ) create_stmts = [ @@ -1597,14 +1590,11 @@ def execute(self, *args, **kwargs): self.con.rollback() except Exception as inner_exc: # pragma: no cover ex = DatabaseError( - "Execution failed on sql: {sql}\n{exc}\nunable " - "to rollback".format(sql=args[0], exc=exc) + f"Execution failed on sql: {args[0]}\n{exc}\nunable to rollback" ) raise ex from inner_exc - ex = DatabaseError( - "Execution failed on sql '{sql}': {exc}".format(sql=args[0], exc=exc) - ) + ex = DatabaseError(f"Execution failed on sql '{args[0]}': {exc}") raise ex from exc @staticmethod @@ -1729,11 +1719,7 @@ def to_sql( if dtype is not None: for col, my_type in dtype.items(): if not isinstance(my_type, str): - raise ValueError( - "{column} ({type!s}) not a string".format( - column=col, type=my_type - ) - ) + raise ValueError(f"{col} ({my_type}) not a string") table = SQLiteTable( name, @@ -1753,9 +1739,7 @@ def has_table(self, name, schema=None): # esc_name = escape(name) wld = "?" 
- query = ( - "SELECT name FROM sqlite_master WHERE type='table' AND name={wld};" - ).format(wld=wld) + query = f"SELECT name FROM sqlite_master WHERE type='table' AND name={wld};" return len(self.execute(query, [name]).fetchall()) > 0 @@ -1763,7 +1747,7 @@ def get_table(self, table_name, schema=None): return None # not supported in fallback mode def drop_table(self, name, schema=None): - drop_sql = "DROP TABLE {name}".format(name=_get_valid_sqlite_name(name)) + drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}" self.execute(drop_sql) def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 07475f224bd5f..b216ee80c3940 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -9,13 +9,13 @@ You can find more information on http://presbrey.mit.edu/PyDTA and http://www.statsmodels.org/devel/ """ - -from collections import OrderedDict +from collections import abc import datetime from io import BytesIO import os import struct import sys +from typing import Any import warnings from dateutil.relativedelta import relativedelta @@ -23,7 +23,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas.util._decorators import Appender, deprecate_kwarg +from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( ensure_object, @@ -44,7 +44,7 @@ from pandas.core.frame import DataFrame from pandas.core.series import Series -from pandas.io.common import BaseIterator, _stringify_path, get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer, stringify_path _version_error = ( "Version of given Stata file is not 104, 105, 108, " @@ -58,10 +58,6 @@ convert_categoricals : bool, default True Read value labels and convert columns to Categorical/Factor variables.""" -_encoding_params = """\ -encoding : str, None or encoding - Encoding used to parse the files. None defaults to latin-1.""" - _statafile_processing_params2 = """\ index_col : str, optional Column to set as index. @@ -89,7 +85,7 @@ iterator : bool, default False Return StataReader object.""" -_read_stata_doc = """ +_read_stata_doc = f""" Read Stata file into DataFrame. Parameters @@ -104,11 +100,10 @@ By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. -%s -%s -%s -%s -%s +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +{_iterator_params} Returns ------- @@ -130,53 +125,24 @@ >>> itr = pd.read_stata('filename.dta', chunksize=10000) >>> for chunk in itr: ... do_something(chunk) -""" % ( - _statafile_processing_params1, - _encoding_params, - _statafile_processing_params2, - _chunksize_params, - _iterator_params, -) - -_data_method_doc = """ -Read observations from Stata file, converting them into a dataframe - -.. deprecated:: - This is a legacy method. Use `read` in new code. - -Parameters ----------- -%s -%s - -Returns -------- -DataFrame -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, -) +""" -_read_method_doc = """\ +_read_method_doc = f"""\ Reads observations from Stata file, converting them into a dataframe Parameters ---------- nrows : int Number of lines to read from data file, if None read whole file. 
-%s -%s +{_statafile_processing_params1} +{_statafile_processing_params2} Returns ------- DataFrame -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, -) - +""" -_stata_reader_doc = """\ +_stata_reader_doc = f"""\ Class for reading Stata dta files. Parameters @@ -186,26 +152,17 @@ implementing a binary read() functions. .. versionadded:: 0.23.0 support for pathlib, py.path. -%s -%s -%s -%s -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, - _encoding_params, - _chunksize_params, -) +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +""" @Appender(_read_stata_doc) -@deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) -@deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") def read_stata( filepath_or_buffer, convert_dates=True, convert_categoricals=True, - encoding=None, index_col=None, convert_missing=False, preserve_dtypes=True, @@ -400,7 +357,7 @@ def convert_delta_safe(base, deltas, unit): month = np.ones_like(dates) conv_dates = convert_year_month_safe(year, month) else: - raise ValueError("Date fmt {fmt} not understood".format(fmt=fmt)) + raise ValueError(f"Date fmt {fmt} not understood") if has_bad_values: # Restore NaT for bad values conv_dates[bad_locs] = NaT @@ -495,9 +452,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): d = parse_dates_safe(dates, year=True) conv_dates = d.year else: - raise ValueError( - "Format {fmt} is not a known Stata date format".format(fmt=fmt) - ) + raise ValueError(f"Format {fmt} is not a known Stata date format") conv_dates = Series(conv_dates, dtype=np.float64) missing_value = struct.unpack("= 2 ** 53: - ws = precision_loss_doc % ("uint64", "float64") + ws = precision_loss_doc.format("uint64", "float64") data[col] = data[col].astype(dtype) @@ -614,26 +569,22 @@ def _cast_to_stata_types(data): data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) - if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53: - ws = precision_loss_doc % ("int64", "float64") + if data[col].max() >= 2 ** 53 or data[col].min() <= -(2 ** 53): + ws = precision_loss_doc.format("int64", "float64") elif dtype in (np.float32, np.float64): value = data[col].max() if np.isinf(value): raise ValueError( - "Column {col} has a maximum value of " - "infinity which is outside the range " - "supported by Stata.".format(col=col) + f"Column {col} has a maximum value of infinity which is outside " + "the range supported by Stata." ) if dtype == np.float32 and value > float32_max: data[col] = data[col].astype(np.float64) elif dtype == np.float64: if value > float64_max: raise ValueError( - "Column {col} has a maximum value " - "({val}) outside the range supported by " - "Stata ({float64_max})".format( - col=col, val=value, float64_max=float64_max - ) + f"Column {col} has a maximum value ({value}) outside the range " + f"supported by Stata ({float64_max})" ) if ws: @@ -648,26 +599,18 @@ class StataValueLabel: Parameters ---------- - value : int8, int16, int32, float32 or float64 - The Stata missing value code - - Attributes - ---------- - string : string - String representation of the Stata missing value - value : int8, int16, int32, float32 or float64 - The original encoded missing value - - Methods - ------- - generate_value_label - + catarray : Categorical + Categorical Series to encode + encoding : {"latin-1", "utf-8"} + Encoding to use for value labels. 
""" - def __init__(self, catarray): + def __init__(self, catarray, encoding="latin-1"): + if encoding not in ("latin-1", "utf-8"): + raise ValueError("Only latin-1 and utf-8 are supported.") self.labname = catarray.name - + self._encoding = encoding categories = catarray.cat.categories self.value_labels = list(zip(np.arange(len(categories)), categories)) self.value_labels.sort(key=lambda x: x[0]) @@ -686,7 +629,7 @@ def __init__(self, catarray): value_label_mismatch_doc.format(catarray.name), ValueLabelTypeMismatch, ) - + category = category.encode(encoding) self.off.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding self.val.append(vl[0]) @@ -713,31 +656,31 @@ def _encode(self, s): """ return s.encode(self._encoding) - def generate_value_label(self, byteorder, encoding): + def generate_value_label(self, byteorder): """ + Generate the binary representation of the value labals. + Parameters ---------- byteorder : str Byte order of the output - encoding : str - File encoding Returns ------- value_label : bytes Bytes containing the formatted value label """ - - self._encoding = encoding + encoding = self._encoding bio = BytesIO() - null_string = "\x00" null_byte = b"\x00" # len bio.write(struct.pack(byteorder + "i", self.len)) # labname - labname = self._encode(_pad_bytes(self.labname[:32], 33)) + labname = self.labname[:32].encode(encoding) + lab_len = 32 if encoding not in ("utf-8", "utf8") else 128 + labname = _pad_bytes(labname, lab_len + 1) bio.write(labname) # padding - 3 bytes @@ -761,7 +704,7 @@ def generate_value_label(self, byteorder, encoding): # txt - Text labels, null terminated for text in self.txt: - bio.write(self._encode(text + null_string)) + bio.write(text + null_byte) bio.seek(0) return bio.read() @@ -862,16 +805,15 @@ def __init__(self, value): lambda self: self._value, doc="The binary representation of the missing value." 
) - def __str__(self): + def __str__(self) -> str: return self.string - def __repr__(self): - # not perfect :-/ - return "{cls}({obj})".format(cls=self.__class__, obj=self) + def __repr__(self) -> str: + return f"{type(self)}({self})" - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: return ( - isinstance(other, self.__class__) + isinstance(other, type(self)) and self.string == other.string and self.value == other.value ) @@ -1038,14 +980,28 @@ def __init__(self): "typedef", "typename", "virtual", + "_all", + "_N", + "_skip", + "_b", + "_pi", + "str#", + "in", + "_pred", + "strL", + "_coef", + "_rc", + "using", + "_cons", + "_se", + "with", + "_n", ) -class StataReader(StataParser, BaseIterator): +class StataReader(StataParser, abc.Iterator): __doc__ = _stata_reader_doc - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) - @deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") def __init__( self, path_or_buf, @@ -1056,7 +1012,6 @@ def __init__( preserve_dtypes=True, columns=None, order_categoricals=True, - encoding=None, chunksize=None, ): super().__init__() @@ -1085,7 +1040,7 @@ def __init__( self._lines_read = 0 self._native_byteorder = _set_endianness(sys.byteorder) - path_or_buf = _stringify_path(path_or_buf) + path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf) @@ -1226,7 +1181,7 @@ def f(typ): try: return self.TYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata types [{0}]".format(typ)) + raise ValueError(f"cannot convert stata types [{typ}]") typlist = [f(x) for x in raw_typlist] @@ -1236,7 +1191,7 @@ def f(typ): try: return self.DTYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata dtype [{0}]".format(typ)) + raise ValueError(f"cannot convert stata dtype [{typ}]") dtyplist = [f(x) for x in raw_typlist] @@ -1364,19 +1319,13 @@ def _read_old_header(self, first_char): try: self.typlist = [self.TYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError( - "cannot convert stata types [{0}]".format( - ",".join(str(x) for x in typlist) - ) - ) + invalid_types = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata types [{invalid_types}]") try: self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError( - "cannot convert stata dtypes [{0}]".format( - ",".join(str(x) for x in typlist) - ) - ) + invalid_dtypes = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") if self.format_version > 108: self.varlist = [ @@ -1449,12 +1398,13 @@ def _decode(self, s): except UnicodeDecodeError: # GH 25960, fallback to handle incorrect format produced when 117 # files are converted to 118 files in Stata - msg = """ + encoding = self._encoding + msg = f""" One or more strings in the dta file could not be decoded using {encoding}, and so the fallback encoding of latin-1 is being used. This can happen when a file has been incorrectly encoded by Stata or some other software. 
You should verify the string values returned are correct.""" - warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning) + warnings.warn(msg, UnicodeWarning) return s.decode("latin-1") def _read_value_labels(self): @@ -1538,18 +1488,6 @@ def _read_strls(self): # Wrap v_o in a string to allow uint64 values as keys on 32bit OS self.GSO[str(v_o)] = va - # legacy - @Appender(_data_method_doc) - def data(self, **kwargs): - - warnings.warn("'data' is deprecated, use 'read' instead") - - if self._data_read: - raise Exception("Data has already been read.") - self._data_read = True - - return self.read(None, **kwargs) - def __next__(self): return self.read(nrows=self._chunksize or 1) @@ -1571,7 +1509,6 @@ def get_chunk(self, size=None): return self.read(nrows=size) @Appender(_read_method_doc) - @deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") def read( self, nrows=None, @@ -1689,7 +1626,7 @@ def read( else: data_formatted.append((col, data[col])) if requires_type_conversion: - data = DataFrame.from_dict(OrderedDict(data_formatted)) + data = DataFrame.from_dict(dict(data_formatted)) del data_formatted data = self._do_convert_missing(data, convert_missing) @@ -1728,7 +1665,7 @@ def any_startswith(x: str) -> bool: convert = True retyped_data.append((col, data[col].astype(dtype))) if convert: - data = DataFrame.from_dict(OrderedDict(retyped_data)) + data = DataFrame.from_dict(dict(retyped_data)) if index_col is not None: data = data.set_index(data.pop(index_col)) @@ -1841,7 +1778,7 @@ def _do_convert_categoricals( repeats = list(vc.index[vc > 1]) repeats = "-" * 80 + "\n" + "\n".join(repeats) # GH 25772 - msg = """ + msg = f""" Value labels for column {col} are not unique. These cannot be converted to pandas categoricals. @@ -1852,13 +1789,13 @@ def _do_convert_categoricals( The repeated labels are: {repeats} """ - raise ValueError(msg.format(col=col, repeats=repeats)) + raise ValueError(msg) # TODO: is the next line needed above in the data(...) method? cat_data = Series(cat_data, index=data.index) cat_converted_data.append((col, cat_data)) else: cat_converted_data.append((col, data[col])) - data = DataFrame.from_dict(OrderedDict(cat_converted_data)) + data = DataFrame.from_dict(dict(cat_converted_data)) return data @property @@ -1921,13 +1858,15 @@ def _set_endianness(endianness): elif endianness.lower() in [">", "big"]: return ">" else: # pragma : no cover - raise ValueError("Endianness {endian} not understood".format(endian=endianness)) + raise ValueError(f"Endianness {endianness} not understood") def _pad_bytes(name, length): """ Take a char string and pads it with null bytes until it's length chars. 
""" + if isinstance(name, bytes): + return name + b"\x00" * (length - len(name)) return name + "\x00" * (length - len(name)) @@ -1953,7 +1892,7 @@ def _convert_datetime_to_stata_type(fmt): ]: return np.float64 # Stata expects doubles for SIFs else: - raise NotImplementedError("Format {fmt} not implemented".format(fmt=fmt)) + raise NotImplementedError(f"Format {fmt} not implemented") def _maybe_convert_to_int_keys(convert_dates, varlist): @@ -2003,9 +1942,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype == np.int8: return 251 else: # pragma : no cover - raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype) - ) + raise NotImplementedError(f"Data type {dtype} not supported.") def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False): @@ -2032,24 +1969,12 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False if force_strl: return "%9s" if dtype.type == np.object_: - inferred_dtype = infer_dtype(column, skipna=True) - if not (inferred_dtype in ("string", "unicode") or len(column) == 0): - raise ValueError( - "Column `{col}` cannot be exported.\n\nOnly " - "string-like object arrays containing all " - "strings or a mix of strings and None can be " - "exported. Object arrays containing only null " - "values are prohibited. Other object types" - "cannot be exported and must first be converted " - "to one of the supported " - "types.".format(col=column.name) - ) itemsize = max_len_string_array(ensure_object(column.values)) if itemsize > max_str_len: if dta_version >= 117: return "%9s" else: - raise ValueError(excessive_string_length_error % column.name) + raise ValueError(excessive_string_length_error.format(column.name)) return "%" + str(max(itemsize, 1)) + "s" elif dtype == np.float64: return "%10.0g" @@ -2060,9 +1985,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False elif dtype == np.int8 or dtype == np.int16: return "%8.0g" else: # pragma : no cover - raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype) - ) + raise NotImplementedError(f"Data type {dtype} not supported.") class StataWriter(StataParser): @@ -2090,8 +2013,6 @@ class StataWriter(StataParser): timezone information write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Only latin-1 and ascii are supported. byteorder : str Can be ">", "<", "little", or "big". 
default is `sys.byteorder` time_stamp : datetime @@ -2133,15 +2054,14 @@ class StataWriter(StataParser): """ _max_string_length = 244 + _encoding = "latin-1" - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) def __init__( self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, @@ -2150,7 +2070,6 @@ def __init__( super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index - self._encoding = "latin-1" self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels @@ -2161,7 +2080,7 @@ def __init__( if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) - self._fname = _stringify_path(fname) + self._fname = stringify_path(fname) self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} self._converted_names = {} @@ -2185,7 +2104,8 @@ def _prepare_categoricals(self, data): data_formatted = [] for col, col_is_cat in zip(data, is_cat): if col_is_cat: - self._value_labels.append(StataValueLabel(data[col])) + svl = StataValueLabel(data[col], encoding=self._encoding) + self._value_labels.append(svl) dtype = data[col].cat.codes.dtype if dtype == np.int64: raise ValueError( @@ -2209,7 +2129,7 @@ def _prepare_categoricals(self, data): data_formatted.append((col, values)) else: data_formatted.append((col, data[col])) - return DataFrame.from_dict(OrderedDict(data_formatted)) + return DataFrame.from_dict(dict(data_formatted)) def _replace_nans(self, data): # return data @@ -2230,6 +2150,36 @@ def _update_strl_names(self): """No-op, forward compatibility""" pass + def _validate_variable_name(self, name): + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9 + and _. + """ + for c in name: + if ( + (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ): + name = name.replace(c, "_") + return name + def _check_column_names(self, data): """ Checks column names to ensure that they are valid Stata column names. 
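The new _validate_variable_name method above reduces to a simple character filter for dta 114/117 output: any character outside a-z, A-Z, 0-9 and "_" is replaced with an underscore. A standalone sketch of that rule (the function name here is illustrative, not pandas API):

    def sanitize_stata_name(name: str) -> str:
        # Mirrors the logic of StataWriter._validate_variable_name for dta 114/117.
        for c in name:
            if not ("a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9" or c == "_"):
                name = name.replace(c, "_")
        return name

    assert sanitize_stata_name("gdp per capita!") == "gdp_per_capita_"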
@@ -2253,14 +2203,7 @@ def _check_column_names(self, data): if not isinstance(name, str): name = str(name) - for c in name: - if ( - (c < "A" or c > "Z") - and (c < "a" or c > "z") - and (c < "0" or c > "9") - and c != "_" - ): - name = name.replace(c, "_") + name = self._validate_variable_name(name) # Variable name must not be a reserved word if name in self.RESERVED_WORDS: @@ -2300,7 +2243,7 @@ def _check_column_names(self, data): orig_name = orig_name.encode("utf-8") except (UnicodeDecodeError, AttributeError): pass - msg = "{0} -> {1}".format(orig_name, name) + msg = f"{orig_name} -> {name}" conversion_warning.append(msg) ws = invalid_name_doc.format("\n ".join(conversion_warning)) @@ -2311,12 +2254,12 @@ def _check_column_names(self, data): return data - def _set_formats_and_types(self, data, dtypes): + def _set_formats_and_types(self, dtypes): self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.items(): - self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col])) - self.typlist.append(_dtype_to_stata_type(dtype, data[col])) + self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col])) + self.typlist.append(_dtype_to_stata_type(dtype, self.data[col])) def _prepare_pandas(self, data): # NOTE: we might need a different API / class for pandas objects so @@ -2360,17 +2303,57 @@ def _prepare_pandas(self, data): new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) dtypes[key] = np.dtype(new_type) - self._set_formats_and_types(data, dtypes) + # Verify object arrays are strings and encode to bytes + self._encode_strings() + + self._set_formats_and_types(dtypes) # set the given format for the datetime cols if self._convert_dates is not None: for key in self._convert_dates: self.fmtlist[key] = self._convert_dates[key] + def _encode_strings(self): + """ + Encode strings in dta-specific encoding + + Do not encode columns marked for date conversion or for strL + conversion. The strL converter independently handles conversion and + also accepts empty string arrays. + """ + convert_dates = self._convert_dates + # _convert_strl is not available in dta 114 + convert_strl = getattr(self, "_convert_strl", []) + for i, col in enumerate(self.data): + # Skip columns marked for date conversion or strl conversion + if i in convert_dates or col in convert_strl: + continue + column = self.data[col] + dtype = column.dtype + if dtype.type == np.object_: + inferred_dtype = infer_dtype(column, skipna=True) + if not ((inferred_dtype in ("string", "unicode")) or len(column) == 0): + col = column.name + raise ValueError( + f"""\ +Column `{col}` cannot be exported.\n\nOnly string-like object arrays +containing all strings or a mix of strings and None can be exported. +Object arrays containing only null values are prohibited. 
Other object +types cannot be exported and must first be converted to one of the +supported types.""" + ) + encoded = self.data[col].str.encode(self._encoding) + # If larger than _max_string_length do nothing + if ( + max_len_string_array(ensure_object(encoded.values)) + <= self._max_string_length + ): + self.data[col] = encoded + def write_file(self): self._file, self._own_file = _open_file_binary_write(self._fname) try: - self._write_header(time_stamp=self._time_stamp, data_label=self._data_label) + self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) self._write_map() self._write_variable_types() self._write_varnames() @@ -2393,9 +2376,8 @@ def write_file(self): os.unlink(self._fname) except OSError: warnings.warn( - "This save was not successful but {0} could not " - "be deleted. This file is not " - "valid.".format(self._fname), + f"This save was not successful but {self._fname} could not " + "be deleted. This file is not valid.", ResourceWarning, ) raise exc @@ -2441,7 +2423,7 @@ def _write_expansion_fields(self): def _write_value_labels(self): for vl in self._value_labels: - self._file.write(vl.generate_value_label(self._byteorder, self._encoding)) + self._file.write(vl.generate_value_label(self._byteorder)) def _write_header(self, data_label=None, time_stamp=None): byteorder = self._byteorder @@ -2543,9 +2525,8 @@ def _write_variable_labels(self): is_latin1 = all(ord(c) < 256 for c in label) if not is_latin1: raise ValueError( - "Variable labels must contain only " - "characters that can be encoded in " - "Latin-1" + "Variable labels must contain only characters that " + "can be encoded in Latin-1" ) self._write(_pad_bytes(label, 81)) else: @@ -2576,9 +2557,9 @@ def _prepare_data(self): typ = typlist[i] if typ <= self._max_string_length: data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) - stype = "S{type}".format(type=typ) + stype = f"S{typ}" dtypes[col] = stype - data[col] = data[col].str.encode(self._encoding).astype(stype) + data[col] = data[col].astype(stype) else: dtype = data[col].dtype if not native_byteorder: @@ -2640,7 +2621,7 @@ def _dtype_to_stata_type_117(dtype, column, force_strl): elif dtype == np.int8: return 65530 else: # pragma : no cover - raise NotImplementedError("Data type %s not supported." % dtype) + raise NotImplementedError(f"Data type {dtype} not supported.") def _pad_bytes_new(name, length): @@ -2688,7 +2669,7 @@ def __init__(self, df, columns, version=117, byteorder=None): self.df = df self.columns = columns - self._gso_table = OrderedDict((("", (0, 0)),)) + self._gso_table = {"": (0, 0)} if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) @@ -2718,7 +2699,7 @@ def generate_table(self): Returns ------- - gso_table : OrderedDict + gso_table : dict Ordered dictionary using the string found as keys and their lookup position (v,o) as values gso_df : DataFrame @@ -2764,19 +2745,13 @@ def generate_table(self): return gso_table, gso_df - def _encode(self, s): - """ - Python 3 compatibility shim - """ - return s.encode(self._encoding) - def generate_blob(self, gso_table): """ Generates the binary blob of GSOs that is written to the dta file. Parameters ---------- - gso_table : OrderedDict + gso_table : dict Ordered dictionary (str, vo) Returns @@ -2859,8 +2834,6 @@ class StataWriter117(StataWriter): timezone information write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Only latin-1 and ascii are supported. 
byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime @@ -2911,15 +2884,14 @@ class StataWriter117(StataWriter): """ _max_string_length = 2045 + _dta_version = 117 - @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) def __init__( self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, @@ -2959,18 +2931,21 @@ def _write_header(self, data_label=None, time_stamp=None): self._file.write(bytes("", "utf-8")) bio = BytesIO() # ds_format - 117 - bio.write(self._tag(bytes("117", "utf-8"), "release")) + bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) # number of vars, 2 bytes assert self.nvar < 2 ** 16 bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K")) - # number of obs, 4 bytes - bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), "N")) + # 117 uses 4 bytes, 118 uses 8 + nobs_size = "I" if self._dta_version == 117 else "Q" + bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) # data label 81 bytes, char, null terminated label = data_label[:80] if data_label is not None else "" - label_len = struct.pack(byteorder + "B", len(label)) - label = label_len + bytes(label, "utf-8") + label = label.encode(self._encoding) + label_size = "B" if self._dta_version == 117 else "H" + label_len = struct.pack(byteorder + label_size, len(label)) + label = label_len + label bio.write(self._tag(label, "label")) # time stamp, 18 bytes, char, null terminated # format dd Mon yyyy hh:mm @@ -3000,7 +2975,7 @@ def _write_header(self, data_label=None, time_stamp=None): + time_stamp.strftime(" %Y %H:%M") ) # '\x11' added due to inspection of Stata file - ts = b"\x11" + bytes(ts, "utf8") + ts = b"\x11" + bytes(ts, "utf-8") bio.write(self._tag(ts, "timestamp")) bio.seek(0) self._file.write(self._tag(bio.read(), "header")) @@ -3010,7 +2985,7 @@ def _write_map(self): the map with 0s. 
The second call writes the final map locations when all blocks have been written.""" if self._map is None: - self._map = OrderedDict( + self._map = dict( ( ("stata_data", 0), ("map", self._file.tell()), @@ -3047,9 +3022,11 @@ def _write_variable_types(self): def _write_varnames(self): self._update_map("varnames") bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vn_len = 32 if self._dta_version == 117 else 128 for name in self.varlist: name = self._null_terminate(name, True) - name = _pad_bytes_new(name[:32], 33) + name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1) bio.write(name) bio.seek(0) self._file.write(self._tag(bio.read(), "varnames")) @@ -3061,21 +3038,24 @@ def _write_sortlist(self): def _write_formats(self): self._update_map("formats") bio = BytesIO() + fmt_len = 49 if self._dta_version == 117 else 57 for fmt in self.fmtlist: - bio.write(_pad_bytes_new(fmt, 49)) + bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len)) bio.seek(0) self._file.write(self._tag(bio.read(), "formats")) def _write_value_label_names(self): self._update_map("value_label_names") bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 32 if self._dta_version == 117 else 128 for i in range(self.nvar): # Use variable name when categorical name = "" # default name if self._is_col_cat[i]: name = self.varlist[i] name = self._null_terminate(name, True) - name = _pad_bytes_new(name[:32], 33) + name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1) bio.write(name) bio.seek(0) self._file.write(self._tag(bio.read(), "value_label_names")) @@ -3084,7 +3064,9 @@ def _write_variable_labels(self): # Missing labels are 80 blank characters plus null termination self._update_map("variable_labels") bio = BytesIO() - blank = _pad_bytes_new("", 81) + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 80 if self._dta_version == 117 else 320 + blank = _pad_bytes_new("", vl_len + 1) if self._variable_labels is None: for _ in range(self.nvar): @@ -3098,14 +3080,15 @@ def _write_variable_labels(self): label = self._variable_labels[col] if len(label) > 80: raise ValueError("Variable labels must be 80 characters or fewer") - is_latin1 = all(ord(c) < 256 for c in label) - if not is_latin1: + try: + encoded = label.encode(self._encoding) + except UnicodeEncodeError: raise ValueError( - "Variable labels must contain only " - "characters that can be encoded in " - "Latin-1" + "Variable labels must contain only characters that " + f"can be encoded in {self._encoding}" ) - bio.write(_pad_bytes_new(label, 81)) + + bio.write(_pad_bytes_new(encoded, vl_len + 1)) else: bio.write(blank) bio.seek(0) @@ -3137,7 +3120,7 @@ def _write_value_labels(self): self._update_map("value_labels") bio = BytesIO() for vl in self._value_labels: - lab = vl.generate_value_label(self._byteorder, self._encoding) + lab = vl.generate_value_label(self._byteorder) lab = self._tag(lab, "lbl") bio.write(lab) bio.seek(0) @@ -3167,19 +3150,140 @@ def _convert_strls(self, data): ] if convert_cols: - ssw = StataStrLWriter(data, convert_cols) + ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) tab, new_data = ssw.generate_table() data = new_data self._strl_blob = ssw.generate_blob(tab) return data - def _set_formats_and_types(self, data, dtypes): + def _set_formats_and_types(self, dtypes): self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.items(): force_strl = col in self._convert_strl fmt = 
_dtype_to_default_stata_fmt( - dtype, data[col], dta_version=117, force_strl=force_strl + dtype, + self.data[col], + dta_version=self._dta_version, + force_strl=force_strl, ) self.fmtlist.append(fmt) - self.typlist.append(_dtype_to_stata_type_117(dtype, data[col], force_strl)) + self.typlist.append( + _dtype_to_stata_type_117(dtype, self.data[col], force_strl) + ) + + +class StataWriter118(StataWriter117): + """ + A class for writing Stata binary dta files in Stata 15 format (118) + + DTA 118 format files support unicode string data (both fixed and strL) + format. Unicode is also supported in value labels, variable labels and + the dataset label. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + is written. + data : DataFrame + Input to save + convert_dates : dict + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime + A datetime to use as file creation date. Default is the current time + data_label : str + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + convert_strl : list + List of column names to convert to Stata StrL format. Columns with + more than 2045 characters are automatically written as StrL. + Smaller columns can be converted by including the column name. Using + StrLs can reduce output file size when strings are longer than 8 + characters, and either frequently repeated or sparse. + + Returns + ------- + StataWriter118 + The instance has a write_file method, which will write the file to the + given `fname`. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + nor datetime.datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + Using Unicode data and column names + + >>> from pandas.io.stata import StataWriter118 + >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) + >>> writer = StataWriter118('./data_file.dta', data) + >>> writer.write_file() + + Or with long strings stored in strl format + + >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], + ... columns=['strls']) + >>> writer = StataWriter118('./data_file_with_long_strings.dta', data, + ... convert_strl=['strls']) + >>> writer.write_file() + """ + + _encoding = "utf-8" + _dta_version = 118 + + def _validate_variable_name(self, name): + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with underscores.
+ + Notes + ----- + Stata 118 supports most unicode characters. The only limitation is in + the ascii range where the characters supported are a-z, A-Z, 0-9 and _. + """ + # High code points appear to be acceptable + for c in name: + if ( + ord(c) < 128 + and (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ) or 128 <= ord(c) < 256: + name = name.replace(c, "_") + + return name diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index ebe047c58b889..55c861e384d67 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -38,7 +38,6 @@ - hist_series and hist_frame (for `Series.hist` and `DataFrame.hist`) - boxplot (`pandas.plotting.boxplot(df)` equivalent to `DataFrame.boxplot`) - boxplot_frame and boxplot_frame_groupby -- tsplot (deprecated) - register and deregister (register converters for the tick formats) - Plots not called as `Series` and `DataFrame` methods: - table diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index c11d94c381d6d..dd907457f7c32 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1,9 +1,8 @@ import importlib -import warnings from pandas._config import get_option -from pandas.util._decorators import Appender +from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -22,7 +21,8 @@ def hist_series( yrot=None, figsize=None, bins=10, - **kwargs + backend=None, + **kwargs, ): """ Draw histogram of the input series using matplotlib. @@ -50,6 +50,14 @@ def hist_series( bin edges are calculated and returned. If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last bin. In this case, bins is returned unmodified. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + .. versionadded:: 1.0.0 + **kwargs To be passed to the actual plotting function. @@ -62,7 +70,7 @@ def hist_series( -------- matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. """ - plot_backend = _get_plot_backend() + plot_backend = _get_plot_backend(backend) return plot_backend.hist_series( self, by=by, @@ -74,7 +82,7 @@ def hist_series( yrot=yrot, figsize=figsize, bins=bins, - **kwargs + **kwargs, ) @@ -93,7 +101,8 @@ def hist_frame( figsize=None, layout=None, bins=10, - **kwargs + backend=None, + **kwargs, ): """ Make a histogram of the DataFrame's. @@ -145,6 +154,14 @@ def hist_frame( bin edges are calculated and returned. If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last bin. In this case, bins is returned unmodified. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + .. versionadded:: 1.0.0 + **kwargs All other plotting keyword arguments to be passed to :meth:`matplotlib.pyplot.hist`. @@ -172,7 +189,7 @@ def hist_frame( ...
}, index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) >>> hist = df.hist(bins=3) """ - plot_backend = _get_plot_backend() + plot_backend = _get_plot_backend(backend) return plot_backend.hist_frame( data, column=column, @@ -188,185 +205,202 @@ def hist_frame( figsize=figsize, layout=layout, bins=bins, - **kwargs + **kwargs, ) -def boxplot( - data, - column=None, - by=None, - ax=None, - fontsize=None, - rot=0, - grid=True, - figsize=None, - layout=None, - return_type=None, - **kwargs -): - """ - Make a box plot from DataFrame columns. - - Make a box-and-whisker plot from DataFrame columns, optionally grouped - by some other columns. A box plot is a method for graphically depicting - groups of numerical data through their quartiles. - The box extends from the Q1 to Q3 quartile values of the data, - with a line at the median (Q2). The whiskers extend from the edges - of box to show the range of the data. The position of the whiskers - is set by default to `1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box. - Outlier points are those past the end of the whiskers. - - For further details see - Wikipedia's entry for `boxplot `_. +_boxplot_doc = """ +Make a box plot from DataFrame columns. + +Make a box-and-whisker plot from DataFrame columns, optionally grouped +by some other columns. A box plot is a method for graphically depicting +groups of numerical data through their quartiles. +The box extends from the Q1 to Q3 quartile values of the data, +with a line at the median (Q2). The whiskers extend from the edges +of box to show the range of the data. The position of the whiskers +is set by default to `1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box. +Outlier points are those past the end of the whiskers. + +For further details see +Wikipedia's entry for `boxplot `_. + +Parameters +---------- +column : str or list of str, optional + Column name or list of names, or vector. + Can be any valid input to :meth:`pandas.DataFrame.groupby`. +by : str or array-like, optional + Column in the DataFrame to :meth:`pandas.DataFrame.groupby`. + One box-plot will be done per value of columns in `by`. +ax : object of class matplotlib.axes.Axes, optional + The matplotlib axes to be used by boxplot. +fontsize : float or str + Tick label font size in points or as a string (e.g., `large`). +rot : int or float, default 0 + The rotation angle of labels (in degrees) + with respect to the screen coordinate system. +grid : bool, default True + Setting this to True will show the grid. +figsize : A tuple (width, height) in inches + The size of the figure to create in matplotlib. +layout : tuple (rows, columns), optional + For example, (3, 5) will display the subplots + using 3 columns and 5 rows, starting from the top-left. +return_type : {'axes', 'dict', 'both'} or None, default 'axes' + The kind of object to return. The default is ``axes``. + + * 'axes' returns the matplotlib axes the boxplot is drawn on. + * 'dict' returns a dictionary whose values are the matplotlib + Lines of the boxplot. + * 'both' returns a namedtuple with the axes and dict. + * when grouping with ``by``, a Series mapping columns to + ``return_type`` is returned. + + If ``return_type`` is `None`, a NumPy array + of axes with the same shape as ``layout`` is returned. +%(backend)s\ + +**kwargs + All other plotting keyword arguments to be passed to + :func:`matplotlib.pyplot.boxplot`. + +Returns +------- +result + See Notes. + +See Also +-------- +Series.plot.hist: Make a histogram. +matplotlib.pyplot.boxplot : Matplotlib equivalent plot. 
+ +Notes +----- +The return type depends on the `return_type` parameter: + +* 'axes' : object of class matplotlib.axes.Axes +* 'dict' : dict of matplotlib.lines.Line2D objects +* 'both' : a namedtuple with structure (ax, lines) + +For data grouped with ``by``, return a Series of the above or a numpy +array: + +* :class:`~pandas.Series` +* :class:`~numpy.array` (for ``return_type = None``) + +Use ``return_type='dict'`` when you want to tweak the appearance +of the lines after plotting. In this case a dict containing the Lines +making up the boxes, caps, fliers, medians, and whiskers is returned. + +Examples +-------- + +Boxplots can be created for every column in the dataframe +by ``df.boxplot()`` or indicating the columns to be used: + +.. plot:: + :context: close-figs + + >>> np.random.seed(1234) + >>> df = pd.DataFrame(np.random.randn(10, 4), + ... columns=['Col1', 'Col2', 'Col3', 'Col4']) + >>> boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3']) + +Boxplots of variables distributions grouped by the values of a third +variable can be created using the option ``by``. For instance: + +.. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(10, 2), + ... columns=['Col1', 'Col2']) + >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', + ... 'B', 'B', 'B', 'B', 'B']) + >>> boxplot = df.boxplot(by='X') + +A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot +in order to group the data by combination of the variables in the x-axis: + +.. plot:: + :context: close-figs + + >>> df = pd.DataFrame(np.random.randn(10, 3), + ... columns=['Col1', 'Col2', 'Col3']) + >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', + ... 'B', 'B', 'B', 'B', 'B']) + >>> df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', + ... 'B', 'A', 'B', 'A', 'B']) + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) + +The layout of boxplot can be adjusted giving a tuple to ``layout``: + +.. plot:: + :context: close-figs + + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... layout=(2, 1)) - Parameters - ---------- - column : str or list of str, optional - Column name or list of names, or vector. - Can be any valid input to :meth:`pandas.DataFrame.groupby`. - by : str or array-like, optional - Column in the DataFrame to :meth:`pandas.DataFrame.groupby`. - One box-plot will be done per value of columns in `by`. - ax : object of class matplotlib.axes.Axes, optional - The matplotlib axes to be used by boxplot. - fontsize : float or str - Tick label font size in points or as a string (e.g., `large`). - rot : int or float, default 0 - The rotation angle of labels (in degrees) - with respect to the screen coordinate system. - grid : bool, default True - Setting this to True will show the grid. - figsize : A tuple (width, height) in inches - The size of the figure to create in matplotlib. - layout : tuple (rows, columns), optional - For example, (3, 5) will display the subplots - using 3 columns and 5 rows, starting from the top-left. - return_type : {'axes', 'dict', 'both'} or None, default 'axes' - The kind of object to return. The default is ``axes``. - - * 'axes' returns the matplotlib axes the boxplot is drawn on. - * 'dict' returns a dictionary whose values are the matplotlib - Lines of the boxplot. - * 'both' returns a namedtuple with the axes and dict. - * when grouping with ``by``, a Series mapping columns to - ``return_type`` is returned. - - If ``return_type`` is `None`, a NumPy array - of axes with the same shape as ``layout`` is returned. 
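A small sketch of the ``return_type='dict'`` workflow mentioned in the Notes above; the keys ('boxes', 'whiskers', 'medians', 'caps', 'fliers') are the ones matplotlib's boxplot returns:

import numpy as np
import pandas as pd

np.random.seed(1234)
df = pd.DataFrame(np.random.randn(10, 4), columns=["Col1", "Col2", "Col3", "Col4"])

# Each value is a list of matplotlib Line2D objects that can be restyled
# after the boxplot has been drawn.
lines = df.boxplot(column=["Col1", "Col2"], return_type="dict")
for median in lines["medians"]:
    median.set_color("black")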
- **kwargs - All other plotting keyword arguments to be passed to - :func:`matplotlib.pyplot.boxplot`. - - Returns - ------- - result - See Notes. - - See Also - -------- - Series.plot.hist: Make a histogram. - matplotlib.pyplot.boxplot : Matplotlib equivalent plot. +Additional formatting can be done to the boxplot, like suppressing the grid +(``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``) +or changing the fontsize (i.e. ``fontsize=15``): - Notes - ----- - The return type depends on the `return_type` parameter: - - * 'axes' : object of class matplotlib.axes.Axes - * 'dict' : dict of matplotlib.lines.Line2D objects - * 'both' : a namedtuple with structure (ax, lines) +.. plot:: + :context: close-figs - For data grouped with ``by``, return a Series of the above or a numpy - array: + >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15) - * :class:`~pandas.Series` - * :class:`~numpy.array` (for ``return_type = None``) +The parameter ``return_type`` can be used to select the type of element +returned by `boxplot`. When ``return_type='axes'`` is selected, +the matplotlib axes on which the boxplot is drawn are returned: - Use ``return_type='dict'`` when you want to tweak the appearance - of the lines after plotting. In this case a dict containing the Lines - making up the boxes, caps, fliers, medians, and whiskers is returned. - - Examples - -------- + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], return_type='axes') + >>> type(boxplot) + - Boxplots can be created for every column in the dataframe - by ``df.boxplot()`` or indicating the columns to be used: +When grouping with ``by``, a Series mapping columns to ``return_type`` +is returned: - .. plot:: - :context: close-figs + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... return_type='axes') + >>> type(boxplot) + - >>> np.random.seed(1234) - >>> df = pd.DataFrame(np.random.randn(10,4), - ... columns=['Col1', 'Col2', 'Col3', 'Col4']) - >>> boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3']) +If ``return_type`` is `None`, a NumPy array of axes with the same shape +as ``layout`` is returned: - Boxplots of variables distributions grouped by the values of a third - variable can be created using the option ``by``. For instance: + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... return_type=None) + >>> type(boxplot) + +""" - .. plot:: - :context: close-figs - >>> df = pd.DataFrame(np.random.randn(10, 2), - ... columns=['Col1', 'Col2']) - >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', - ... 'B', 'B', 'B', 'B', 'B']) - >>> boxplot = df.boxplot(by='X') +_backend_doc = """\ +backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. - A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot - in order to group the data by combination of the variables in the x-axis: + .. versionadded:: 1.0.0 +""" - .. plot:: - :context: close-figs - >>> df = pd.DataFrame(np.random.randn(10,3), - ... columns=['Col1', 'Col2', 'Col3']) - >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', - ... 'B', 'B', 'B', 'B', 'B']) - >>> df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', - ... 'B', 'A', 'B', 'A', 'B']) - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) - - The layout of boxplot can be adjusted giving a tuple to ``layout``: - - .. 
plot:: - :context: close-figs - - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', - ... layout=(2, 1)) - - Additional formatting can be done to the boxplot, like suppressing the grid - (``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``) - or changing the fontsize (i.e. ``fontsize=15``): - - .. plot:: - :context: close-figs - - >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15) - - The parameter ``return_type`` can be used to select the type of element - returned by `boxplot`. When ``return_type='axes'`` is selected, - the matplotlib axes on which the boxplot is drawn are returned: - - >>> boxplot = df.boxplot(column=['Col1','Col2'], return_type='axes') - >>> type(boxplot) - - - When grouping with ``by``, a Series mapping columns to ``return_type`` - is returned: - - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', - ... return_type='axes') - >>> type(boxplot) - - - If ``return_type`` is `None`, a NumPy array of axes with the same shape - as ``layout`` is returned: - - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', - ... return_type=None) - >>> type(boxplot) - - """ +@Substitution(backend="") +@Appender(_boxplot_doc) +def boxplot( + data, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwargs, +): plot_backend = _get_plot_backend("matplotlib") return plot_backend.boxplot( data, @@ -379,11 +413,12 @@ def boxplot( figsize=figsize, layout=layout, return_type=return_type, - **kwargs + **kwargs, ) -@Appender(boxplot.__doc__) +@Substitution(backend=_backend_doc) +@Appender(_boxplot_doc) def boxplot_frame( self, column=None, @@ -395,9 +430,10 @@ def boxplot_frame( figsize=None, layout=None, return_type=None, - **kwargs + backend=None, + **kwargs, ): - plot_backend = _get_plot_backend() + plot_backend = _get_plot_backend(backend) return plot_backend.boxplot_frame( self, column=column, @@ -409,7 +445,7 @@ def boxplot_frame( figsize=figsize, layout=layout, return_type=return_type, - **kwargs + **kwargs, ) @@ -425,7 +461,8 @@ def boxplot_frame_groupby( layout=None, sharex=False, sharey=True, - **kwargs + backend=None, + **kwargs, ): """ Make box plots from DataFrameGroupBy data. @@ -454,6 +491,14 @@ def boxplot_frame_groupby( Whether y-axes will be shared among subplots. .. versionadded:: 0.23.1 + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + .. versionadded:: 1.0.0 + **kwargs All other plotting keyword arguments to be passed to matplotlib's boxplot function. @@ -477,7 +522,7 @@ def boxplot_frame_groupby( >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1) >>> boxplot_frame_groupby(grouped, subplots=False) """ - plot_backend = _get_plot_backend() + plot_backend = _get_plot_backend(backend) return plot_backend.boxplot_frame_groupby( grouped, subplots=subplots, @@ -490,7 +535,7 @@ def boxplot_frame_groupby( layout=layout, sharex=sharex, sharey=sharey, - **kwargs + **kwargs, ) @@ -586,6 +631,14 @@ class PlotAccessor(PandasObject): labels with "(right)" in the legend. include_bool : bool, default is False If True, boolean values can be plotted. + backend : str, default None + Backend to use instead of the backend specified in the option + ``plotting.backend``. For instance, 'matplotlib'. 
Alternatively, to + specify the ``plotting.backend`` for the whole session, set + ``pd.options.plotting.backend``. + + .. versionadded:: 1.0.0 + **kwargs Options to pass to matplotlib plotting method. @@ -682,26 +735,23 @@ def _get_call_args(backend_name, data, args, kwargs): ] else: raise TypeError( - ( - "Called plot accessor for type {}, expected Series or DataFrame" - ).format(type(data).__name__) + f"Called plot accessor for type {type(data).__name__}, " + "expected Series or DataFrame" ) if args and isinstance(data, ABCSeries): + positional_args = str(args)[1:-1] + keyword_args = ", ".join( + f"{name}={repr(value)}" for (name, default), value in zip(arg_def, args) + ) msg = ( "`Series.plot()` should not be called with positional " "arguments, only keyword arguments. The order of " "positional arguments will change in the future. " - "Use `Series.plot({})` instead of `Series.plot({})`." - ) - positional_args = str(args)[1:-1] - keyword_args = ", ".join( - "{}={!r}".format(name, value) - for (name, default), value in zip(arg_def, args) - ) - warnings.warn( - msg.format(keyword_args, positional_args), FutureWarning, stacklevel=3 + f"Use `Series.plot({keyword_args})` instead of " + f"`Series.plot({positional_args})`." ) + raise TypeError(msg) pos_args = {name: value for value, (name, _) in zip(args, arg_def)} if backend_name == "pandas.plotting._matplotlib": @@ -715,15 +765,20 @@ def _get_call_args(backend_name, data, args, kwargs): return x, y, kind, kwargs def __call__(self, *args, **kwargs): - plot_backend = _get_plot_backend() + plot_backend = _get_plot_backend(kwargs.pop("backend", None)) x, y, kind, kwargs = self._get_call_args( plot_backend.__name__, self._parent, args, kwargs ) kind = self._kind_aliases.get(kind, kind) + + # when using another backend, get out of the way + if plot_backend.__name__ != "pandas.plotting._matplotlib": + return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs) + if kind not in self._all_kinds: - raise ValueError("{} is not a valid plot kind".format(kind)) + raise ValueError(f"{kind} is not a valid plot kind") # The original data structured can be transformed before passed to the # backend. For example, for DataFrame is common to set the index as the @@ -737,14 +792,13 @@ def __call__(self, *args, **kwargs): if isinstance(data, ABCDataFrame): return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs) else: - raise ValueError( - ("plot kind {} can only be used for data frames").format(kind) - ) + raise ValueError(f"plot kind {kind} can only be used for data frames") elif kind in self._series_kinds: if isinstance(data, ABCDataFrame): if y is None and kwargs.get("subplots") is False: - msg = "{} requires either y column or 'subplots=True'" - raise ValueError(msg.format(kind)) + raise ValueError( + f"{kind} requires either y column or 'subplots=True'" + ) elif y is not None: if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] @@ -1580,12 +1634,11 @@ def _find_backend(backend: str): _backends[backend] = module return module - msg = ( - "Could not find plotting backend '{name}'. Ensure that you've installed the " - "package providing the '{name}' entrypoint, or that the package has a" + raise ValueError( + f"Could not find plotting backend '{backend}'. Ensure that you've installed " + f"the package providing the '{backend}' entrypoint, or that the package has a " "top-level `.plot` method." 
) - raise ValueError(msg.format(name=backend)) def _get_plot_backend(backend=None): diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 206600ad37acc..27b1d55fe1bd6 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING, Dict, Type + from pandas.plotting._matplotlib.boxplot import ( BoxPlot, boxplot, @@ -24,10 +26,12 @@ radviz, scatter_matrix, ) -from pandas.plotting._matplotlib.timeseries import tsplot from pandas.plotting._matplotlib.tools import table -PLOT_CLASSES = { +if TYPE_CHECKING: + from pandas.plotting._matplotlib.core import MPLPlot # noqa: F401 + +PLOT_CLASSES: Dict[str, Type["MPLPlot"]] = { "line": LinePlot, "bar": BarPlot, "barh": BarhPlot, @@ -66,7 +70,6 @@ def plot(data, kind, **kwargs): "boxplot", "boxplot_frame", "boxplot_frame_groupby", - "tsplot", "table", "andrews_curves", "autocorrelation_plot", diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index cfd6c3519d82c..deeeb0016142c 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -74,9 +74,8 @@ def _validate_color_args(self): for key, values in self.color.items(): if key not in valid_keys: raise ValueError( - "color dict contains invalid " - "key '{0}' " - "The key must be either {1}".format(key, valid_keys) + f"color dict contains invalid key '{key}'. " + f"The key must be either {valid_keys}" ) else: self.color = None @@ -115,7 +114,7 @@ def maybe_color_bp(self, bp): def _make_plot(self): if self.subplots: - self._return_obj = pd.Series() + self._return_obj = pd.Series(dtype=object) for i, (label, y) in enumerate(self._iter_data()): ax = self._get_ax(i) @@ -184,7 +183,7 @@ def _grouped_plot_by_column( ax=None, layout=None, return_type=None, - **kwargs + **kwargs, ): grouped = data.groupby(by) if columns is None: @@ -217,7 +216,7 @@ def _grouped_plot_by_column( result = axes byline = by[0] if len(by) == 1 else by - fig.suptitle("Boxplot grouped by {byline}".format(byline=byline)) + fig.suptitle(f"Boxplot grouped by {byline}") fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) return result @@ -234,7 +233,7 @@ def boxplot( figsize=None, layout=None, return_type=None, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -268,9 +267,8 @@ def _get_colors(): result[key_to_index[key]] = value else: raise ValueError( - "color dict contains invalid " - "key '{0}' " - "The key must be either {1}".format(key, valid_keys) + f"color dict contains invalid key '{key}'. 
" + f"The key must be either {valid_keys}" ) else: result.fill(colors) @@ -359,7 +357,7 @@ def boxplot_frame( figsize=None, layout=None, return_type=None, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -374,7 +372,7 @@ def boxplot_frame( figsize=figsize, layout=layout, return_type=return_type, - **kwds + **kwds, ) plt.draw_if_interactive() return ax @@ -392,7 +390,7 @@ def boxplot_frame_groupby( layout=None, sharex=False, sharey=True, - **kwds + **kwds, ): if subplots is True: naxes = len(grouped) @@ -407,7 +405,8 @@ def boxplot_frame_groupby( ) axes = _flatten(axes) - ret = pd.Series() + ret = pd.Series(dtype=object) + for (key, group), ax in zip(grouped, axes): d = group.boxplot( ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds @@ -432,6 +431,6 @@ def boxplot_frame_groupby( ax=ax, figsize=figsize, layout=layout, - **kwds + **kwds, ) return ret diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 946ce8bcec97f..5b37ebb42aecc 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -24,9 +24,8 @@ ) from pandas.core.dtypes.generic import ABCSeries -from pandas import get_option +from pandas import Index, get_option import pandas.core.common as com -from pandas.core.index import Index from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range import pandas.core.tools.datetimes as tools @@ -125,7 +124,7 @@ def time2num(d): if isinstance(d, str): parsed = tools.to_datetime(d) if not isinstance(parsed, datetime): - raise ValueError("Could not parse time {d}".format(d=d)) + raise ValueError(f"Could not parse time {d}") return _to_ordinalf(parsed.time()) if isinstance(d, pydt.time): return _to_ordinalf(d) @@ -244,7 +243,7 @@ def get_datevalue(date, freq): return date elif date is None: return None - raise ValueError("Unrecognizable date '{date}'".format(date=date)) + raise ValueError(f"Unrecognizable date '{date}'") def _dt_to_float_ordinal(dt): @@ -387,12 +386,12 @@ def __call__(self): except ValueError: return [] - if dmin > dmax: - dmax, dmin = dmin, dmax # We need to cap at the endpoints of valid datetime # FIXME: dont leave commented-out # TODO(wesm) unused? 
+ # if dmin > dmax: + # dmax, dmin = dmin, dmax # delta = relativedelta(dmax, dmin) # try: # start = dmin - delta @@ -421,15 +420,14 @@ def __call__(self): if estimate > self.MAXTICKS * 2: raise RuntimeError( - ( - "MillisecondLocator estimated to generate " - "{estimate:d} ticks from {dmin} to {dmax}: " - "exceeds Locator.MAXTICKS" - "* 2 ({arg:d}) " - ).format(estimate=estimate, dmin=dmin, dmax=dmax, arg=self.MAXTICKS * 2) + "MillisecondLocator estimated to generate " + f"{estimate:d} ticks from {dmin} to {dmax}: " + "exceeds Locator.MAXTICKS" + f"* 2 ({self.MAXTICKS * 2:d}) " ) - freq = "%dL" % self._get_interval() + interval = self._get_interval() + freq = f"{interval}L" tz = self.tz.tzname(None) st = _from_ordinal(dates.date2num(dmin)) # strip tz ed = _from_ordinal(dates.date2num(dmax)) @@ -581,7 +579,7 @@ def _daily_finder(vmin, vmax, freq): elif freq == FreqGroup.FR_HR: periodsperday = 24 else: # pragma: no cover - raise ValueError("unexpected frequency: {freq}".format(freq=freq)) + raise ValueError(f"unexpected frequency: {freq}") periodsperyear = 365 * periodsperday periodspermonth = 28 * periodsperday @@ -940,8 +938,7 @@ def get_finder(freq): elif (freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK: return _daily_finder else: # pragma: no cover - errmsg = "Unsupported frequency: {freq}".format(freq=freq) - raise NotImplementedError(errmsg) + raise NotImplementedError(f"Unsupported frequency: {freq}") class TimeSeries_DateLocator(Locator): @@ -1100,6 +1097,8 @@ def __call__(self, x, pos=0): return "" else: fmt = self.formatdict.pop(x, "") + if isinstance(fmt, np.bytes_): + fmt = fmt.decode("utf-8") return Period(ordinal=int(x), freq=self.freq).strftime(fmt) @@ -1118,11 +1117,11 @@ def format_timedelta_ticks(x, pos, n_decimals): h, m = divmod(m, 60) d, h = divmod(h, 24) decimals = int(ns * 10 ** (n_decimals - 9)) - s = r"{:02d}:{:02d}:{:02d}".format(int(h), int(m), int(s)) + s = f"{int(h):02d}:{int(m):02d}:{int(s):02d}" if n_decimals > 0: - s += ".{{:0{:0d}d}}".format(n_decimals).format(decimals) + s += f".{decimals:0{n_decimals}d}" if d != 0: - s = "{:d} days ".format(int(d)) + s + s = f"{int(d):d} days {s}" return s def __call__(self, x, pos=0): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 541dca715e814..2d68bb46a8ada 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -57,7 +57,7 @@ def _kind(self): _layout_type = "vertical" _default_rot = 0 - orientation = None # type: Optional[str] + orientation: Optional[str] = None _pop_attributes = [ "label", "style", @@ -102,7 +102,7 @@ def __init__( table=False, layout=None, include_bool=False, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -193,15 +193,7 @@ def __init__( self._validate_color_args() def _validate_color_args(self): - if "color" not in self.kwds and "colors" in self.kwds: - warnings.warn( - ( - "'colors' is being deprecated. Please use 'color'" - "instead of 'colors'" - ) - ) - colors = self.kwds.pop("colors") - self.kwds["color"] = colors + import matplotlib.colors if ( "color" in self.kwds @@ -234,13 +226,13 @@ def _validate_color_args(self): styles = [self.style] # need only a single match for s in styles: - if re.match("^[a-z]+?", s) is not None: - raise ValueError( - "Cannot pass 'style' string with a color " - "symbol and 'color' keyword argument. 
Please" - " use one or the other or pass 'style' " - "without a color symbol" - ) + for char in s: + if char in matplotlib.colors.BASE_COLORS: + raise ValueError( + "Cannot pass 'style' string with a color symbol and " + "'color' keyword argument. Please use one or the other or " + "pass 'style' without a color symbol" + ) def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: @@ -248,12 +240,6 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if fillna is not None: data = data.fillna(fillna) - # TODO: unused? - # if self.sort_columns: - # columns = com.try_sort(data.columns) - # else: - # columns = data.columns - for col, values in data.items(): if keep_index is True: yield col, values @@ -346,8 +332,7 @@ def _setup_subplots(self): if input_log - valid_log: invalid_log = next(iter((input_log - valid_log))) raise ValueError( - "Boolean, None and 'sym' are valid options," - " '{}' is given.".format(invalid_log) + f"Boolean, None and 'sym' are valid options, '{invalid_log}' is given." ) if self.logx is True or self.loglog is True: @@ -409,6 +394,10 @@ def _compute_plot_data(self): include_type = [np.number] exclude_type = ["timedelta"] + # GH 18755, include object and category type for scatter plot + if self._kind == "scatter": + include_type.extend(["object", "category"]) + numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type) try: @@ -498,14 +487,13 @@ def _adorn_subplots(self): if self.subplots: if is_list_like(self.title): if len(self.title) != self.nseries: - msg = ( + raise ValueError( "The length of `title` must equal the number " "of columns if using `title` of type `list` " "and `subplots=True`.\n" - "length of title = {}\n" - "number of columns = {}" - ).format(len(self.title), self.nseries) - raise ValueError(msg) + f"length of title = {len(self.title)}\n" + f"number of columns = {self.nseries}" + ) for (ax, title) in zip(self.axes, self.title): ax.set_title(title) @@ -810,11 +798,10 @@ def match_labels(data, e): or (err_shape[1] != 2) or (err_shape[2] != len(self.data)) ): - msg = ( + raise ValueError( "Asymmetrical error bars should be provided " - + "with the shape (%u, 2, %u)" % (self.nseries, len(self.data)) + f"with the shape ({self.nseries}, 2, {len(self.data)})" ) - raise ValueError(msg) # broadcast errors to each data series if len(err) == 1: @@ -824,7 +811,7 @@ def match_labels(data, e): err = np.tile([err], (self.nseries, len(self.data))) else: - msg = "No valid {label} detected".format(label=label) + msg = f"No valid {label} detected" raise ValueError(msg) return err @@ -882,10 +869,13 @@ def __init__(self, data, x, y, **kwargs): x = self.data.columns[x] if is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] - if len(self.data[x]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires x column to be numeric") - if len(self.data[y]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires y column to be numeric") + + # Scatter plot allows to plot objects data + if self._kind == "hexbin": + if len(self.data[x]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires x column to be numeric") + if len(self.data[y]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires y column to be numeric") self.x = x self.y = y @@ -982,7 +972,7 @@ def _make_plot(self): c=c_values, label=label, cmap=cmap, - **self.kwds + **self.kwds, ) if cb: cbar_label = c if c_is_column else "" @@ -1092,7 +1082,7 @@ def _make_plot(self): column_num=i, 
stacking_id=stacking_id, is_errorbar=is_errorbar, - **kwds + **kwds, ) self._add_legend_handle(newlines[0], label, index=i) @@ -1175,7 +1165,7 @@ def _get_stacked_values(cls, ax, stacking_id, values, label): raise ValueError( "When stacked is True, each column must be either " "all positive or negative." - "{0} contains both positive and negative values".format(label) + f"{label} contains both positive and negative values" ) @classmethod @@ -1247,7 +1237,7 @@ def _plot( column_num=None, stacking_id=None, is_errorbar=False, - **kwds + **kwds, ): if column_num == 0: @@ -1383,7 +1373,7 @@ def _make_plot(self): start=start, label=label, log=self.log, - **kwds + **kwds, ) ax.set_title(label) elif self.stacked: @@ -1398,7 +1388,7 @@ def _make_plot(self): start=start, label=label, log=self.log, - **kwds + **kwds, ) pos_prior = pos_prior + np.where(mask, y, 0) neg_prior = neg_prior + np.where(mask, 0, y) @@ -1412,7 +1402,7 @@ def _make_plot(self): start=start, label=label, log=self.log, - **kwds + **kwds, ) self._add_legend_handle(rect, label, index=i) @@ -1470,7 +1460,7 @@ class PiePlot(MPLPlot): def __init__(self, data, kind=None, **kwargs): data = data.fillna(value=0) if (data < 0).any().any(): - raise ValueError("{0} doesn't allow negative values".format(kind)) + raise ValueError(f"{kind} doesn't allow negative values") MPLPlot.__init__(self, data, kind=kind, **kwargs) def _args_adjust(self): diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index c4ac9ead3f3d3..f8b2c7ab123d0 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np from pandas.core.dtypes.common import is_integer, is_list_like @@ -29,7 +27,7 @@ def _args_adjust(self): values = np.ravel(values) values = values[~isna(values)] - hist, self.bins = np.histogram( + _, self.bins = np.histogram( values, bins=self.bins, range=self.kwds.get("range", None), @@ -49,7 +47,7 @@ def _plot( bottom=0, column_num=0, stacking_id=None, - **kwds + **kwds, ): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) @@ -145,7 +143,7 @@ def _plot( ind=None, column_num=None, stacking_id=None, - **kwds + **kwds, ): from scipy.stats import gaussian_kde @@ -177,17 +175,15 @@ def _grouped_plot( layout=None, rot=0, ax=None, - **kwargs + **kwargs, ): if figsize == "default": # allowed to specify mpl default with 'default' - warnings.warn( - "figsize='default' is deprecated. Specify figure size by tuple instead", - FutureWarning, - stacklevel=5, + raise ValueError( + "figsize='default' is no longer supported. 
" + "Specify figure size by tuple instead" ) - figsize = None grouped = data.groupby(by) if column is not None: @@ -226,7 +222,7 @@ def _grouped_hist( xrot=None, ylabelsize=None, yrot=None, - **kwargs + **kwargs, ): """ Grouped histogram @@ -254,7 +250,8 @@ def _grouped_hist( def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) - xrot = xrot or rot + if xrot is None: + xrot = rot fig, axes = _grouped_plot( plot_group, @@ -290,7 +287,7 @@ def hist_series( yrot=None, figsize=None, bins=10, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -335,7 +332,7 @@ def hist_series( xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, - **kwds + **kwds, ) if hasattr(axes, "ndim"): @@ -359,7 +356,7 @@ def hist_frame( figsize=None, layout=None, bins=10, - **kwds + **kwds, ): if by is not None: axes = _grouped_hist( @@ -377,7 +374,7 @@ def hist_frame( xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, - **kwds + **kwds, ) return axes diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 6d5a94c4d5ff8..0720f544203f7 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -22,7 +22,7 @@ def scatter_matrix( density_kwds=None, hist_kwds=None, range_padding=0.05, - **kwds + **kwds, ): df = frame._get_numeric_data() n = df.columns.size @@ -160,7 +160,7 @@ def normalize(series): to_plot[kls][1], color=colors[i], label=pprint_thing(kls), - **kwds + **kwds, ) ax.legend() @@ -315,7 +315,7 @@ def parallel_coordinates( axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwds + **kwds, ): import matplotlib.pyplot as plt @@ -395,7 +395,7 @@ def lag_plot(series, lag=1, ax=None, **kwds): if ax is None: ax = plt.gca() ax.set_xlabel("y(t)") - ax.set_ylabel("y(t + {lag})".format(lag=lag)) + ax.set_ylabel(f"y(t + {lag})") ax.scatter(y1, y2, **kwds) return ax diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 927b9cf4e392a..fd69265b18a5b 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -20,7 +20,7 @@ def _get_standard_colors( cmap = colormap colormap = cm.get_cmap(colormap) if colormap is None: - raise ValueError("Colormap {0} is not recognized".format(cmap)) + raise ValueError(f"Colormap {cmap} is not recognized") colors = [colormap(num) for num in np.linspace(0, 1, num=num_colors)] elif color is not None: if colormap is not None: diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 931c699d9b9fd..dd048114142f3 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -1,7 +1,6 @@ # TODO: Use the fact that axis can have units to simplify the process import functools -import warnings import numpy as np @@ -25,7 +24,6 @@ TimeSeries_DateFormatter, TimeSeries_DateLocator, TimeSeries_TimedeltaFormatter, - register_pandas_matplotlib_converters, ) import pandas.tseries.frequencies as frequencies from pandas.tseries.offsets import DateOffset @@ -34,49 +32,6 @@ # Plotting functions and monkey patches -@register_pandas_matplotlib_converters -def tsplot(series, plotf, ax=None, **kwargs): - """ - Plots a Series on the given Matplotlib axes or the current axes - - Parameters - ---------- - axes : Axes - series : Series - - Notes - _____ - Supports same kwargs as Axes.plot - - - .. 
deprecated:: 0.23.0 - Use Series.plot() instead - """ - import matplotlib.pyplot as plt - - warnings.warn( - "'tsplot' is deprecated and will be removed in a " - "future version. Please use Series.plot() instead.", - FutureWarning, - stacklevel=3, - ) - - # Used inferred freq is possible, need a test case for inferred - if ax is None: - ax = plt.gca() - - freq, series = _maybe_resample(series, ax, kwargs) - - # Set ax with freq info - _decorate_axes(ax, freq, kwargs) - ax._plot_data.append((series, plotf, kwargs)) - lines = plotf(ax, series.index._mpl_repr(), series.values, **kwargs) - - # set date formatter, locators and rescale limits - format_dateaxis(ax, ax.freq, series.index) - return lines - - def _maybe_resample(series, ax, kwargs): # resample against axes freq if necessary freq, ax_freq = _get_freq(ax, series) @@ -307,7 +262,8 @@ def _maybe_convert_index(ax, data): def _format_coord(freq, t, y): - return "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y) + time_period = Period(ordinal=int(t), freq=freq) + return f"t = {time_period} y = {y:8f}" def format_dateaxis(subplot, freq, index): diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index caa0167c06389..dd4034a97f58e 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -60,10 +60,7 @@ def _get_layout(nplots, layout=None, layout_type="box"): if nrows * ncols < nplots: raise ValueError( - "Layout of {nrows}x{ncols} must be larger " - "than required size {nplots}".format( - nrows=nrows, ncols=ncols, nplots=nplots - ) + f"Layout of {nrows}x{ncols} must be larger than required size {nplots}" ) return layout @@ -101,7 +98,7 @@ def _subplots( ax=None, layout=None, layout_type="box", - **fig_kw + **fig_kw, ): """Create a figure with a set of subplots already made. @@ -203,8 +200,8 @@ def _subplots( return fig, ax else: raise ValueError( - "The number of passed axes must be {0}, the " - "same as the output plot".format(naxes) + f"The number of passed axes must be {naxes}, the " + "same as the output plot" ) fig = ax.get_figure() diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 8435569d8bc61..ccd42d3940431 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -1,7 +1,4 @@ from contextlib import contextmanager -import warnings - -from pandas.util._decorators import deprecate_kwarg from pandas.plotting._core import _get_plot_backend @@ -82,7 +79,7 @@ def scatter_matrix( density_kwds=None, hist_kwds=None, range_padding=0.05, - **kwargs + **kwargs, ): """ Draw a matrix of scatter plots. @@ -134,7 +131,7 @@ def scatter_matrix( density_kwds=density_kwds, hist_kwds=hist_kwds, range_padding=range_padding, - **kwargs + **kwargs, ) @@ -168,7 +165,7 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): colormap : str or :class:`matplotlib.colors.Colormap`, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. - kwds : optional + **kwds Options to pass to matplotlib scatter plotting method. 
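With ``tsplot`` removed above, the replacement its old deprecation message pointed to is the ordinary plot accessor; a minimal sketch:

import numpy as np
import pandas as pd

ts = pd.Series(np.random.randn(100), index=pd.date_range("2000-01-01", periods=100))
# Equivalent of the removed tsplot(series, Axes.plot): the accessor applies
# the date-aware locators and formatters automatically.
ax = ts.plot()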
Returns @@ -207,11 +204,10 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): ax=ax, color=color, colormap=colormap, - **kwds + **kwds, ) -@deprecate_kwarg(old_arg_name="data", new_arg_name="frame") def andrews_curves( frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwargs ): @@ -255,7 +251,7 @@ def andrews_curves( samples=samples, color=color, colormap=colormap, - **kwargs + **kwargs, ) @@ -283,7 +279,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): greater or equal than the length of the `series`. samples : int, default 500 Number of times the bootstrap procedure is performed. - **kwds : + **kwds Options to pass to matplotlib plotting method. Returns @@ -311,8 +307,6 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): ) -@deprecate_kwarg(old_arg_name="colors", new_arg_name="color") -@deprecate_kwarg(old_arg_name="data", new_arg_name="frame", stacklevel=3) def parallel_coordinates( frame, class_column, @@ -325,7 +319,7 @@ def parallel_coordinates( axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwargs + **kwargs, ): """ Parallel coordinates plotting. @@ -364,7 +358,7 @@ def parallel_coordinates( -------- >>> from matplotlib import pyplot as plt >>> df = pd.read_csv('https://raw.github.com/pandas-dev/pandas/master' - '/pandas/tests/data/iris.csv') + '/pandas/tests/data/csv/iris.csv') >>> pd.plotting.parallel_coordinates( df, 'Name', color=('#556270', '#4ECDC4', '#C7F464')) @@ -383,7 +377,7 @@ def parallel_coordinates( axvlines=axvlines, axvlines_kwds=axvlines_kwds, sort_labels=sort_labels, - **kwargs + **kwargs, ) @@ -396,7 +390,8 @@ def lag_plot(series, lag=1, ax=None, **kwds): series : Time series lag : lag of the scatter plot, default 1 ax : Matplotlib axis object, optional - kwds : Matplotlib scatter method keyword arguments, optional + **kwds + Matplotlib scatter method keyword arguments. Returns ------- @@ -425,39 +420,13 @@ def autocorrelation_plot(series, ax=None, **kwargs): return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwargs) -def tsplot(series, plotf, ax=None, **kwargs): - """ - Plots a Series on the given Matplotlib axes or the current axes - - Parameters - ---------- - axes : Axes - series : Series - - Notes - _____ - Supports same kwargs as Axes.plot - - - .. deprecated:: 0.23.0 - Use Series.plot() instead - """ - warnings.warn( - "'tsplot' is deprecated and will be removed in a " - "future version. Please use Series.plot() instead.", - FutureWarning, - stacklevel=2, - ) - plot_backend = _get_plot_backend("matplotlib") - return plot_backend.tsplot(series=series, plotf=plotf, ax=ax, **kwargs) - - class _Options(dict): """ Stores pandas plotting options. + Allows for parameter aliasing so you can just use parameter names that are the same as the plot function parameters, but is stored in a canonical - format that makes it easy to breakdown into groups later + format that makes it easy to breakdown into groups later. 
""" # alias so the names are same as plotting method parameter names @@ -466,15 +435,12 @@ class _Options(dict): def __init__(self, deprecated=False): self._deprecated = deprecated - # self['xaxis.compat'] = False super().__setitem__("xaxis.compat", False) def __getitem__(self, key): key = self._get_canonical_key(key) if key not in self: - raise ValueError( - "{key} is not a valid pandas plotting option".format(key=key) - ) + raise ValueError(f"{key} is not a valid pandas plotting option") return super().__getitem__(key) def __setitem__(self, key, value): @@ -484,10 +450,10 @@ def __setitem__(self, key, value): def __delitem__(self, key): key = self._get_canonical_key(key) if key in self._DEFAULT_KEYS: - raise ValueError("Cannot remove default parameter {key}".format(key=key)) + raise ValueError(f"Cannot remove default parameter {key}") return super().__delitem__(key) - def __contains__(self, key): + def __contains__(self, key) -> bool: key = self._get_canonical_key(key) return super().__contains__(key) diff --git a/pandas/testing.py b/pandas/testing.py index acae47367d997..0445fa5b5efc0 100644 --- a/pandas/testing.py +++ b/pandas/testing.py @@ -1,11 +1,17 @@ -# flake8: noqa - """ Public testing utility functions. """ -from pandas.util.testing import ( +from pandas._testing import ( + assert_extension_array_equal, assert_frame_equal, assert_index_equal, assert_series_equal, ) + +__all__ = [ + "assert_extension_array_equal", + "assert_frame_equal", + "assert_series_equal", + "assert_index_equal", +] diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 3a8e263ac2a6d..8b897524cb053 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -1,8 +1,9 @@ +import sys from typing import List import pandas as pd from pandas import api, compat -import pandas.util.testing as tm +import pandas._testing as tm class Base: @@ -13,14 +14,13 @@ def check(self, namespace, expected, ignored=None): result = sorted(f for f in dir(namespace) if not f.startswith("__")) if ignored is not None: - result = sorted(list(set(result) - set(ignored))) + result = sorted(set(result) - set(ignored)) expected = sorted(expected) tm.assert_almost_equal(result, expected) class TestPDApi(Base): - # these are optionally imported based on testing # & need to be ignored ignored = ["tests", "locale", "conftest"] @@ -43,10 +43,10 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_modules = [] # type: List[str] + deprecated_modules: List[str] = ["np", "datetime"] # misc - misc = ["IndexSlice", "NaT"] + misc = ["IndexSlice", "NaT", "NA"] # top-level classes classes = [ @@ -68,7 +68,6 @@ class TestPDApi(Base): "RangeIndex", "UInt64Index", "Series", - "SparseArray", "SparseDtype", "StringDtype", "Timedelta", @@ -80,6 +79,7 @@ class TestPDApi(Base): "PeriodDtype", "IntervalDtype", "DatetimeTZDtype", + "BooleanDtype", "Int8Dtype", "Int16Dtype", "Int32Dtype", @@ -90,17 +90,20 @@ class TestPDApi(Base): "UInt64Dtype", "NamedAgg", ] - if not compat.PY37: - classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) # these are already deprecated; awaiting removal - deprecated_classes = [] # type: List[str] + deprecated_classes: List[str] = [] # these should be deprecated in the future - deprecated_classes_in_future = [] # type: List[str] + deprecated_classes_in_future: List[str] = ["SparseArray"] + + if not compat.PY37: + classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) + # deprecated_modules.extend(["np", "datetime"]) + # 
deprecated_classes_in_future.extend(["SparseArray"]) # external modules exposed in pandas namespace - modules = ["np", "datetime"] + modules: List[str] = [] # top-level functions funcs = [ @@ -156,7 +159,6 @@ class TestPDApi(Base): "read_hdf", "read_html", "read_json", - "read_msgpack", "read_pickle", "read_sas", "read_sql", @@ -166,17 +168,21 @@ class TestPDApi(Base): "read_table", "read_feather", "read_parquet", + "read_orc", "read_spss", ] + # top-level json funcs + funcs_json = ["json_normalize"] + # top-level to_* funcs - funcs_to = ["to_datetime", "to_msgpack", "to_numeric", "to_pickle", "to_timedelta"] + funcs_to = ["to_datetime", "to_numeric", "to_pickle", "to_timedelta"] # top-level to deprecate in the future - deprecated_funcs_in_future = [] # type: List[str] + deprecated_funcs_in_future: List[str] = [] # these are already deprecated; awaiting removal - deprecated_funcs = [] # type: List[str] + deprecated_funcs: List[str] = [] # private modules in pandas namespace private_modules = [ @@ -188,6 +194,8 @@ class TestPDApi(Base): "_np_version_under1p15", "_np_version_under1p16", "_np_version_under1p17", + "_np_version_under1p18", + "_testing", "_tslib", "_typing", "_version", @@ -195,41 +203,111 @@ class TestPDApi(Base): def test_api(self): - self.check( - pd, + checkthese = ( self.lib + self.misc + self.modules - + self.deprecated_modules + self.classes - + self.deprecated_classes - + self.deprecated_classes_in_future + self.funcs + self.funcs_option + self.funcs_read + + self.funcs_json + self.funcs_to - + self.deprecated_funcs_in_future + + self.private_modules + ) + if not compat.PY37: + checkthese.extend( + self.deprecated_modules + + self.deprecated_classes + + self.deprecated_classes_in_future + + self.deprecated_funcs_in_future + + self.deprecated_funcs + ) + self.check(pd, checkthese, self.ignored) + + def test_depr(self): + deprecated_list = ( + self.deprecated_modules + + self.deprecated_classes + + self.deprecated_classes_in_future + self.deprecated_funcs - + self.private_modules, - self.ignored, + + self.deprecated_funcs_in_future ) + for depr in deprecated_list: + with tm.assert_produces_warning(FutureWarning): + deprecated = getattr(pd, depr) + if not compat.PY37: + if depr == "datetime": + deprecated.__getattr__(dir(pd.datetime.datetime)[-1]) + elif depr == "SparseArray": + deprecated([]) + else: + deprecated.__getattr__(dir(deprecated)[-1]) -class TestApi(Base): +def test_datetime(): + from datetime import datetime + import warnings - allowed = ["types", "extensions"] + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert datetime(2015, 1, 2, 0, 0) == pd.datetime(2015, 1, 2, 0, 0) - def test_api(self): + assert isinstance(pd.datetime(2015, 1, 2, 0, 0), pd.datetime) + + +def test_sparsearray(): + import warnings + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert isinstance(pd.array([1, 2, 3], dtype="Sparse"), pd.SparseArray) + + +def test_np(): + import numpy as np + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert (pd.np.arange(0, 10) == np.arange(0, 10)).all() + + +class TestApi(Base): + allowed = ["types", "extensions", "indexers"] + + def test_api(self): self.check(api, self.allowed) class TestTesting(Base): - - funcs = ["assert_frame_equal", "assert_series_equal", "assert_index_equal"] + funcs = [ + "assert_frame_equal", + "assert_series_equal", + "assert_index_equal", + "assert_extension_array_equal", + ] def 
test_testing(self): - from pandas import testing self.check(testing, self.funcs) + + def test_util_testing_deprecated(self): + # avoid cache state affecting the test + sys.modules.pop("pandas.util.testing", None) + + with tm.assert_produces_warning(FutureWarning) as m: + import pandas.util.testing # noqa: F401 + + assert "pandas.util.testing is deprecated" in str(m[0].message) + assert "pandas.testing instead" in str(m[0].message) + + def test_util_testing_deprecated_direct(self): + # avoid cache state affecting the test + sys.modules.pop("pandas.util.testing", None) + with tm.assert_produces_warning(FutureWarning) as m: + from pandas.util.testing import assert_series_equal # noqa: F401 + + assert "pandas.util.testing is deprecated" in str(m[0].message) + assert "pandas.testing instead" in str(m[0].message) diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index 24f325643479c..31423c03dee34 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -1,5 +1,5 @@ +import pandas._testing as tm from pandas.api import types -import pandas.util.testing as tm from .test_api import Base @@ -18,7 +18,6 @@ class TestTypes(Base): "is_datetime64_ns_dtype", "is_datetime64tz_dtype", "is_dtype_equal", - "is_extension_type", "is_float", "is_float_dtype", "is_int64_dtype", @@ -51,7 +50,7 @@ class TestTypes(Base): "infer_dtype", "is_extension_array_dtype", ] - deprecated = ["is_period", "is_datetimetz"] + deprecated = ["is_extension_type"] dtypes = ["CategoricalDtype", "DatetimeTZDtype", "PeriodDtype", "IntervalDtype"] def test_types(self): diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py new file mode 100644 index 0000000000000..83d19b8a20ac3 --- /dev/null +++ b/pandas/tests/arithmetic/common.py @@ -0,0 +1,89 @@ +""" +Assertion helpers for arithmetic tests. +""" +import numpy as np +import pytest + +from pandas import DataFrame, Index, Series +import pandas._testing as tm + + +def assert_invalid_addsub_type(left, right, msg=None): + """ + Helper to assert that left and right can be neither added nor subtracted. + + Parameters + --------- + left : object + right : object + msg : str or None, default None + """ + with pytest.raises(TypeError, match=msg): + left + right + with pytest.raises(TypeError, match=msg): + right + left + with pytest.raises(TypeError, match=msg): + left - right + with pytest.raises(TypeError, match=msg): + right - left + + +def get_upcast_box(box, vector): + """ + Given two box-types, find the one that takes priority + """ + if box is DataFrame or isinstance(vector, DataFrame): + return DataFrame + if box is Series or isinstance(vector, Series): + return Series + if box is Index or isinstance(vector, Index): + return Index + return box + + +def assert_invalid_comparison(left, right, box): + """ + Assert that comparison operations with mismatched types behave correctly. 
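For readers skimming the new helper module, a tiny usage sketch of ``get_upcast_box`` as defined above (assuming the bundled test package is importable):

import pandas as pd
from pandas.tests.arithmetic.common import get_upcast_box

ser = pd.Series([1, 2, 3])
df = pd.DataFrame({"a": [1, 2, 3]})

# DataFrame takes priority over Series, and Series over Index.
assert get_upcast_box(pd.Series, df) is pd.DataFrame
assert get_upcast_box(pd.Index, ser) is pd.Series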
+ + Parameters + ---------- + left : np.ndarray, ExtensionArray, Index, or Series + right : object + box : {pd.DataFrame, pd.Series, pd.Index, tm.to_array} + """ + # Not for tznaive-tzaware comparison + + # Note: not quite the same as how we do this for tm.box_expected + xbox = box if box is not Index else np.array + + result = left == right + expected = xbox(np.zeros(result.shape, dtype=np.bool_)) + + tm.assert_equal(result, expected) + + result = right == left + tm.assert_equal(result, expected) + + result = left != right + tm.assert_equal(result, ~expected) + + result = right != left + tm.assert_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + left < right + with pytest.raises(TypeError, match=msg): + left <= right + with pytest.raises(TypeError, match=msg): + left > right + with pytest.raises(TypeError, match=msg): + left >= right + with pytest.raises(TypeError, match=msg): + right < left + with pytest.raises(TypeError, match=msg): + right <= left + with pytest.raises(TypeError, match=msg): + right > left + with pytest.raises(TypeError, match=msg): + right >= left diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 774ff14398bdb..577093c0f2967 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm # ------------------------------------------------------------------ # Helper Functions @@ -21,7 +21,24 @@ def id_func(x): @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): - # zero-dim integer array behaves like an integer + """ + Several variants of integer value 1. The zero-dim integer array + behaves like an integer. + + This fixture can be used to check that datetimelike indexes handle + addition and subtraction of integers and zero-dimensional arrays + of integers. + + Examples + -------- + >>> dti = pd.date_range('2016-01-01', periods=2, freq='H') + >>> dti + DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00'], + dtype='datetime64[ns]', freq='H') + >>> dti + one + DatetimeIndex(['2016-01-01 01:00:00', '2016-01-01 02:00:00'], + dtype='datetime64[ns]', freq='H') + """ return request.param @@ -40,8 +57,21 @@ def one(request): @pytest.fixture(params=zeros) def zero(request): - # For testing division by (or of) zero for Index with length 5, this - # gives several scalar-zeros and length-5 vector-zeros + """ + Several types of scalar zeros and length 5 vectors of zeros. + + This fixture can be used to check that numeric-dtype indexes handle + division by any zero numeric-dtype. + + Uses vector of length 5 for broadcasting with `numeric_idx` fixture, + which creates numeric-dtype vectors also of length 5. + + Examples + -------- + >>> arr = pd.RangeIndex(5) + >>> arr / zeros + Float64Index([nan, inf, inf, inf, inf], dtype='float64') + """ return request.param @@ -205,25 +235,6 @@ def box_df_fail(request): return request.param -@pytest.fixture( - params=[ - (pd.Index, False), - (pd.Series, False), - (pd.DataFrame, False), - pytest.param((pd.DataFrame, True), marks=pytest.mark.xfail), - (tm.to_array, False), - ], - ids=id_func, -) -def box_transpose_fail(request): - """ - Fixture similar to `box` but testing both transpose cases for DataFrame, - with the tranpose=True case xfailed. 
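A short sketch of how ``assert_invalid_comparison`` is exercised by the datetime64 tests further down, mirroring ``test_dt64arr_cmp_scalar_invalid`` with ``box=pd.Index``:

import pandas as pd
import pandas._testing as tm
from pandas.tests.arithmetic.common import assert_invalid_comparison

rng = pd.date_range("1/1/2000", periods=10)
dtarr = tm.box_expected(rng, pd.Index)
# Equality against a mismatched scalar is all-False, inequality all-True,
# and ordering comparisons raise TypeError.
assert_invalid_comparison(dtarr, "foo", pd.Index)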
- """ - # GH#23620 - return request.param - - @pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array], ids=id_func) def box_with_array(request): """ diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index d239687a37757..d3f9ac4f3f8b2 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -13,7 +13,7 @@ from pandas._libs.tslibs.conversion import localize_pydatetime from pandas._libs.tslibs.offsets import shift_months from pandas.compat.numpy import np_datetime64_compat -from pandas.errors import NullFrequencyError, PerformanceWarning +from pandas.errors import PerformanceWarning import pandas as pd from pandas import ( @@ -26,57 +26,13 @@ Timestamp, date_range, ) -from pandas.core.indexes.datetimes import _to_M8 -import pandas.util.testing as tm - - -def assert_invalid_comparison(left, right, box): - """ - Assert that comparison operations with mismatched types behave correctly. - - Parameters - ---------- - left : np.ndarray, ExtensionArray, Index, or Series - right : object - box : {pd.DataFrame, pd.Series, pd.Index, tm.to_array} - """ - # Not for tznaive-tzaware comparison - - # Note: not quite the same as how we do this for tm.box_expected - xbox = box if box is not pd.Index else np.array - - result = left == right - expected = xbox(np.zeros(result.shape, dtype=np.bool_)) - - tm.assert_equal(result, expected) - - result = right == left - tm.assert_equal(result, expected) - - result = left != right - tm.assert_equal(result, ~expected) - - result = right != left - tm.assert_equal(result, ~expected) - - msg = "Invalid comparison between" - with pytest.raises(TypeError, match=msg): - left < right - with pytest.raises(TypeError, match=msg): - left <= right - with pytest.raises(TypeError, match=msg): - left > right - with pytest.raises(TypeError, match=msg): - left >= right - with pytest.raises(TypeError, match=msg): - right < left - with pytest.raises(TypeError, match=msg): - right <= left - with pytest.raises(TypeError, match=msg): - right > left - with pytest.raises(TypeError, match=msg): - right >= left - +import pandas._testing as tm +from pandas.core.ops import roperator +from pandas.tests.arithmetic.common import ( + assert_invalid_addsub_type, + assert_invalid_comparison, + get_upcast_box, +) # ------------------------------------------------------------------ # Comparisons @@ -102,19 +58,24 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) - def test_dt64arr_cmp_date_invalid(self, tz_naive_fixture, box_with_array): - # GH#19800, GH#19301 datetime.date comparison raises to - # match DatetimeIndex/Timestamp. This also matches the behavior - # of stdlib datetime.datetime - tz = tz_naive_fixture - - dti = pd.date_range("20010101", periods=10, tz=tz) - date = dti[0].to_pydatetime().date() - - dtarr = tm.box_expected(dti, box_with_array) - assert_invalid_comparison(dtarr, date, box_with_array) - - @pytest.mark.parametrize("other", ["foo", -1, 99, 4.0, object(), timedelta(days=2)]) + @pytest.mark.parametrize( + "other", + [ + "foo", + -1, + 99, + 4.0, + object(), + timedelta(days=2), + # GH#19800, GH#19301 datetime.date comparison raises to + # match DatetimeIndex/Timestamp. 
This also matches the behavior + # of stdlib datetime.datetime + datetime(2001, 1, 1).date(), + # GH#19301 None and NaN are *not* cast to NaT for comparisons + None, + np.nan, + ], + ) def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): # GH#22074, GH#15966 tz = tz_naive_fixture @@ -123,15 +84,51 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_arra dtarr = tm.box_expected(rng, box_with_array) assert_invalid_comparison(dtarr, other, box_with_array) - @pytest.mark.parametrize("other", [None, np.nan]) - def test_dt64arr_cmp_na_scalar_invalid( - self, other, tz_naive_fixture, box_with_array - ): - # GH#19301 + @pytest.mark.parametrize( + "other", + [ + list(range(10)), + np.arange(10), + np.arange(10).astype(np.float32), + np.arange(10).astype(object), + pd.timedelta_range("1ns", periods=10).array, + np.array(pd.timedelta_range("1ns", periods=10)), + list(pd.timedelta_range("1ns", periods=10)), + pd.timedelta_range("1 Day", periods=10).astype(object), + pd.period_range("1971-01-01", freq="D", periods=10).array, + pd.period_range("1971-01-01", freq="D", periods=10).astype(object), + ], + ) + def test_dt64arr_cmp_arraylike_invalid(self, other, tz_naive_fixture): + # We don't parametrize this over box_with_array because listlike + # other plays poorly with assert_invalid_comparison reversed checks tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - dtarr = tm.box_expected(dti, box_with_array) - assert_invalid_comparison(dtarr, other, box_with_array) + + dta = date_range("1970-01-01", freq="ns", periods=10, tz=tz)._data + assert_invalid_comparison(dta, other, tm.to_array) + + def test_dt64arr_cmp_mixed_invalid(self, tz_naive_fixture): + tz = tz_naive_fixture + + dta = date_range("1970-01-01", freq="h", periods=5, tz=tz)._data + + other = np.array([0, 1, 2, dta[3], pd.Timedelta(days=1)]) + result = dta == other + expected = np.array([False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = dta != other + tm.assert_numpy_array_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + dta < other + with pytest.raises(TypeError, match=msg): + dta > other + with pytest.raises(TypeError, match=msg): + dta <= other + with pytest.raises(TypeError, match=msg): + dta >= other def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly @@ -174,9 +171,9 @@ class TestDatetime64SeriesComparison: ], ) @pytest.mark.parametrize("reverse", [True, False]) - @pytest.mark.parametrize("box", [Series, pd.Index]) @pytest.mark.parametrize("dtype", [None, object]) - def test_nat_comparisons(self, dtype, box, reverse, pair): + def test_nat_comparisons(self, dtype, index_or_series, reverse, pair): + box = index_or_series l, r = pair if reverse: # add lhs / rhs switched data @@ -258,15 +255,10 @@ def test_nat_comparisons_scalar(self, dtype, data, box_with_array): tm.assert_equal(left >= NaT, expected) tm.assert_equal(NaT <= left, expected) - def test_series_comparison_scalars(self): + @pytest.mark.parametrize("val", [datetime(2000, 1, 4), datetime(2000, 1, 5)]) + def test_series_comparison_scalars(self, val): series = Series(date_range("1/1/2000", periods=10)) - val = datetime(2000, 1, 4) - result = series > val - expected = Series([x > val for x in series]) - tm.assert_series_equal(result, expected) - - val = series[5] result = 
series > val expected = Series([x > val for x in series]) tm.assert_series_equal(result, expected) @@ -348,7 +340,7 @@ class TestDatetimeIndexComparisons: def test_comparators(self, op): index = tm.makeDateIndex(100) element = index[len(index) // 2] - element = _to_M8(element) + element = Timestamp(element).to_datetime64() arr = np.array(index) arr_result = op(arr, element) @@ -1020,25 +1012,28 @@ def test_dt64arr_add_timestamp_raises(self, box_with_array): # ------------------------------------------------------------- # Other Invalid Addition/Subtraction - @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) - def test_dt64arr_add_sub_float(self, other, box_with_array): - dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") + @pytest.mark.parametrize( + "other", + [ + 3.14, + np.array([2.0, 3.0]), + # GH#13078 datetime +/- Period is invalid + pd.Period("2011-01-01", freq="D"), + ], + ) + @pytest.mark.parametrize("dti_freq", [None, "D"]) + def test_dt64arr_add_sub_invalid(self, dti_freq, other, box_with_array): + dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) dtarr = tm.box_expected(dti, box_with_array) msg = "|".join( [ "unsupported operand type", "cannot (add|subtract)", + "cannot use operands with types", "ufunc '?(add|subtract)'? cannot use operands with types", ] ) - with pytest.raises(TypeError, match=msg): - dtarr + other - with pytest.raises(TypeError, match=msg): - other + dtarr - with pytest.raises(TypeError, match=msg): - dtarr - other - with pytest.raises(TypeError, match=msg): - other - dtarr + assert_invalid_addsub_type(dtarr, other, msg) @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) @pytest.mark.parametrize("dti_freq", [None, "D"]) @@ -1059,32 +1054,7 @@ def test_dt64arr_add_sub_parr( "ufunc.*cannot use operands", ] ) - with pytest.raises(TypeError, match=msg): - dtarr + parr - with pytest.raises(TypeError, match=msg): - parr + dtarr - with pytest.raises(TypeError, match=msg): - dtarr - parr - with pytest.raises(TypeError, match=msg): - parr - dtarr - - @pytest.mark.parametrize("dti_freq", [None, "D"]) - def test_dt64arr_add_sub_period_scalar(self, dti_freq, box_with_array): - # GH#13078 - # not supported, check TypeError - per = pd.Period("2011-01-01", freq="D") - - idx = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) - dtarr = tm.box_expected(idx, box_with_array) - msg = "|".join(["unsupported operand type", "cannot (add|subtract)"]) - with pytest.raises(TypeError, match=msg): - dtarr + per - with pytest.raises(TypeError, match=msg): - per + dtarr - with pytest.raises(TypeError, match=msg): - dtarr - per - with pytest.raises(TypeError, match=msg): - per - dtarr + assert_invalid_addsub_type(dtarr, parr, msg) class TestDatetime64DateOffsetArithmetic: @@ -1406,7 +1376,7 @@ def test_dt64arr_add_mixed_offset_array(self, box_with_array): s = tm.box_expected(s, box_with_array) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn): other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = tm.box_expected(other, box_with_array) result = s + other @@ -1435,7 +1405,7 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with 
tm.assert_produces_warning(warn): res = dtarr + other expected = DatetimeIndex( [dti[n] + other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -1443,11 +1413,11 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn): res2 = other + dtarr tm.assert_equal(res2, expected) - with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn): res = dtarr - other expected = DatetimeIndex( [dti[n] - other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -1860,6 +1830,7 @@ def test_dt64_mul_div_numeric_invalid(self, one, dt64_series): with pytest.raises(TypeError, match=msg): one / dt64_series + # TODO: parametrize over box @pytest.mark.parametrize("op", ["__add__", "__radd__", "__sub__", "__rsub__"]) @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) def test_dt64_series_add_intlike(self, tz, op): @@ -1872,10 +1843,8 @@ def test_dt64_series_add_intlike(self, tz, op): method = getattr(ser, op) msg = "|".join( [ - "incompatible type for a .* operation", - "cannot evaluate a numeric op", - "ufunc .* cannot use operands", - "cannot (add|subtract)", + "Addition/subtraction of integers and integer-arrays", + "cannot subtract .* from ndarray", ] ) with pytest.raises(TypeError, match=msg): @@ -1957,38 +1926,20 @@ class TestDatetimeIndexArithmetic: # ------------------------------------------------------------- # Binary operations DatetimeIndex and int - def test_dti_add_int(self, tz_naive_fixture, one): + def test_dti_addsub_int(self, tz_naive_fixture, one): # Variants of `one` for #19012 tz = tz_naive_fixture rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = rng + one - expected = pd.date_range("2000-01-01 10:00", freq="H", periods=10, tz=tz) - tm.assert_index_equal(result, expected) + msg = "Addition/subtraction of integers" - def test_dti_iadd_int(self, tz_naive_fixture, one): - tz = tz_naive_fixture - rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) - expected = pd.date_range("2000-01-01 10:00", freq="H", periods=10, tz=tz) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(TypeError, match=msg): + rng + one + with pytest.raises(TypeError, match=msg): rng += one - tm.assert_index_equal(rng, expected) - - def test_dti_sub_int(self, tz_naive_fixture, one): - tz = tz_naive_fixture - rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = rng - one - expected = pd.date_range("2000-01-01 08:00", freq="H", periods=10, tz=tz) - tm.assert_index_equal(result, expected) - - def test_dti_isub_int(self, tz_naive_fixture, one): - tz = tz_naive_fixture - rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) - expected = pd.date_range("2000-01-01 08:00", freq="H", periods=10, tz=tz) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(TypeError, match=msg): + rng - one + with pytest.raises(TypeError, match=msg): rng -= one - tm.assert_index_equal(rng, expected) # ------------------------------------------------------------- # __add__/__sub__ with integer arrays @@ -2000,14 +1951,8 @@ def 
test_dti_add_intarray_tick(self, int_holder, freq): dti = pd.date_range("2016-01-01", periods=2, freq=freq) other = int_holder([4, -1]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))]) - result = dti + other - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = other + dti - tm.assert_index_equal(result, expected) + msg = "Addition/subtraction of integers|cannot subtract DatetimeArray from" + assert_invalid_addsub_type(dti, other, msg) @pytest.mark.parametrize("freq", ["W", "M", "MS", "Q"]) @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) @@ -2016,37 +1961,18 @@ def test_dti_add_intarray_non_tick(self, int_holder, freq): dti = pd.date_range("2016-01-01", periods=2, freq=freq) other = int_holder([4, -1]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))]) - - # tm.assert_produces_warning does not handle cases where we expect - # two warnings, in this case PerformanceWarning and FutureWarning. - # Until that is fixed, we don't catch either - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - result = dti + other - tm.assert_index_equal(result, expected) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - result = other + dti - tm.assert_index_equal(result, expected) + msg = "Addition/subtraction of integers|cannot subtract DatetimeArray from" + assert_invalid_addsub_type(dti, other, msg) @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) def test_dti_add_intarray_no_freq(self, int_holder): # GH#19959 dti = pd.DatetimeIndex(["2016-01-01", "NaT", "2017-04-05 06:07:08"]) other = int_holder([9, 4, -1]) - nfmsg = "Cannot shift with no freq" - tmsg = "cannot subtract DatetimeArray from" - with pytest.raises(NullFrequencyError, match=nfmsg): - dti + other - with pytest.raises(NullFrequencyError, match=nfmsg): - other + dti - with pytest.raises(NullFrequencyError, match=nfmsg): - dti - other - with pytest.raises(TypeError, match=tmsg): - other - dti + msg = "|".join( + ["cannot subtract DatetimeArray from", "Addition/subtraction of integers"] + ) + assert_invalid_addsub_type(dti, other, msg) # ------------------------------------------------------------- # Binary operations DatetimeIndex and TimedeltaIndex/array @@ -2168,16 +2094,16 @@ def test_dti_isub_tdi(self, tz_naive_fixture): ids=lambda x: type(x).__name__, ) @pytest.mark.parametrize("tz", [None, "US/Eastern"]) - def test_add_datetimelike_and_dti(self, addend, tz): + def test_add_datetimelike_and_dtarr(self, box_with_array, addend, tz): # GH#9631 dti = DatetimeIndex(["2011-01-01", "2011-01-02"]).tz_localize(tz) - msg = ( - "cannot add DatetimeArray and {0}".format(type(addend).__name__) - ).replace("DatetimeIndex", "DatetimeArray") + dtarr = tm.box_expected(dti, box_with_array) + msg = "cannot add DatetimeArray and" + with pytest.raises(TypeError, match=msg): - dti + addend + dtarr + addend with pytest.raises(TypeError, match=msg): - addend + dti + addend + dtarr # ------------------------------------------------------------- @@ -2257,13 +2183,6 @@ def test_timedelta64_equal_timedelta_supported_ops(self, op): intervals = ["D", "h", "m", "s", "us"] - # TODO: unused - # npy16_mappings = {'D': 24 * 60 * 60 * 1000000, - # 'h': 60 * 60 * 1000000, - # 'm': 60 * 1000000, - # 's': 1000000, - # 'us': 1} - def timedelta64(*args): # 
see casting notes in NumPy gh-12927 return np.sum(list(starmap(np.timedelta64, zip(args, intervals)))) @@ -2406,82 +2325,57 @@ def test_dti_add_series(self, tz, names): result4 = index + ser.values tm.assert_index_equal(result4, expected) + @pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub]) @pytest.mark.parametrize( "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] ) - def test_dti_add_offset_index(self, tz_naive_fixture, names): + def test_dti_addsub_offset_arraylike( + self, tz_naive_fixture, names, op, index_or_series + ): # GH#18849, GH#19744 + box = pd.Index + other_box = index_or_series + tz = tz_naive_fixture dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) - other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) + other = other_box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res = dti + other - expected = DatetimeIndex( - [dti[n] + other[n] for n in range(len(dti))], name=names[2], freq="infer" - ) - tm.assert_index_equal(res, expected) + xbox = get_upcast_box(box, other) - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res2 = other + dti - tm.assert_index_equal(res2, expected) - - @pytest.mark.parametrize( - "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] - ) - def test_dti_sub_offset_index(self, tz_naive_fixture, names): - # GH#18824, GH#19744 - tz = tz_naive_fixture - dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) - other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) + with tm.assert_produces_warning(PerformanceWarning): + res = op(dti, other) - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res = dti - other expected = DatetimeIndex( - [dti[n] - other[n] for n in range(len(dti))], name=names[2], freq="infer" + [op(dti[n], other[n]) for n in range(len(dti))], name=names[2], freq="infer" ) - tm.assert_index_equal(res, expected) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(res, expected) - @pytest.mark.parametrize( - "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] - ) - def test_dti_with_offset_series(self, tz_naive_fixture, names): - # GH#18849 + @pytest.mark.parametrize("other_box", [pd.Index, np.array]) + def test_dti_addsub_object_arraylike( + self, tz_naive_fixture, box_with_array, other_box + ): tz = tz_naive_fixture - dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) - other = Series([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - expected_add = Series( - [dti[n] + other[n] for n in range(len(dti))], name=names[2] - ) + dti = pd.date_range("2017-01-01", periods=2, tz=tz) + dtarr = tm.box_expected(dti, box_with_array) + other = other_box([pd.offsets.MonthEnd(), pd.Timedelta(days=4)]) + xbox = get_upcast_box(box_with_array, other) - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res = dti + other - tm.assert_series_equal(res, expected_add) + expected = pd.DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res2 = other + dti - tm.assert_series_equal(res2, expected_add) + warn = None if box_with_array is pd.DataFrame 
else PerformanceWarning + with tm.assert_produces_warning(warn): + result = dtarr + other + tm.assert_equal(result, expected) - expected_sub = Series( - [dti[n] - other[n] for n in range(len(dti))], name=names[2] - ) + expected = pd.DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) - with tm.assert_produces_warning( - PerformanceWarning, clear=[pd.core.arrays.datetimelike] - ): - res3 = dti - other - tm.assert_series_equal(res3, expected_sub) + with tm.assert_produces_warning(warn): + result = dtarr - other + tm.assert_equal(result, expected) @pytest.mark.parametrize("years", [-1, 0, 1]) diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py new file mode 100644 index 0000000000000..f9e1a515277d5 --- /dev/null +++ b/pandas/tests/arithmetic/test_interval.py @@ -0,0 +1,273 @@ +import operator + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_list_like + +import pandas as pd +from pandas import ( + Categorical, + Index, + Interval, + IntervalIndex, + Period, + Series, + Timedelta, + Timestamp, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +@pytest.fixture( + params=[ + (Index([0, 2, 4, 4]), Index([1, 3, 5, 8])), + (Index([0.0, 1.0, 2.0, np.nan]), Index([1.0, 2.0, 3.0, np.nan])), + ( + timedelta_range("0 days", periods=3).insert(4, pd.NaT), + timedelta_range("1 day", periods=3).insert(4, pd.NaT), + ), + ( + date_range("20170101", periods=3).insert(4, pd.NaT), + date_range("20170102", periods=3).insert(4, pd.NaT), + ), + ( + date_range("20170101", periods=3, tz="US/Eastern").insert(4, pd.NaT), + date_range("20170102", periods=3, tz="US/Eastern").insert(4, pd.NaT), + ), + ], + ids=lambda x: str(x[0].dtype), +) +def left_right_dtypes(request): + """ + Fixture for building an IntervalArray from various dtypes + """ + return request.param + + +@pytest.fixture +def array(left_right_dtypes): + """ + Fixture to generate an IntervalArray of various dtypes containing NA if possible + """ + left, right = left_right_dtypes + return IntervalArray.from_arrays(left, right) + + +def create_categorical_intervals(left, right, closed="right"): + return Categorical(IntervalIndex.from_arrays(left, right, closed)) + + +def create_series_intervals(left, right, closed="right"): + return Series(IntervalArray.from_arrays(left, right, closed)) + + +def create_series_categorical_intervals(left, right, closed="right"): + return Series(Categorical(IntervalIndex.from_arrays(left, right, closed))) + + +class TestComparison: + @pytest.fixture(params=[operator.eq, operator.ne]) + def op(self, request): + return request.param + + @pytest.fixture( + params=[ + IntervalArray.from_arrays, + IntervalIndex.from_arrays, + create_categorical_intervals, + create_series_intervals, + create_series_categorical_intervals, + ], + ids=[ + "IntervalArray", + "IntervalIndex", + "Categorical[Interval]", + "Series[Interval]", + "Series[Categorical[Interval]]", + ], + ) + def interval_constructor(self, request): + """ + Fixture for all pandas native interval constructors. + To be used as the LHS of IntervalArray comparisons. 
+ """ + return request.param + + def elementwise_comparison(self, op, array, other): + """ + Helper that performs elementwise comparisions between `array` and `other` + """ + other = other if is_list_like(other) else [other] * len(array) + return np.array([op(x, y) for x, y in zip(array, other)]) + + def test_compare_scalar_interval(self, op, array): + # matches first interval + other = array[0] + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # matches on a single endpoint but not both + other = Interval(array.left[0], array.right[1]) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_scalar_interval_mixed_closed(self, op, closed, other_closed): + array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed) + other = Interval(0, 1, closed=other_closed) + + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_scalar_na(self, op, array, nulls_fixture): + result = op(array, nulls_fixture) + expected = self.elementwise_comparison(op, array, nulls_fixture) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + 0, + 1.0, + True, + "foo", + Timestamp("2017-01-01"), + Timestamp("2017-01-01", tz="US/Eastern"), + Timedelta("0 days"), + Period("2017-01-01", "D"), + ], + ) + def test_compare_scalar_other(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_interval( + self, op, array, interval_constructor, + ): + # same endpoints + other = interval_constructor(array.left, array.right) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # different endpoints + other = interval_constructor(array.left[::-1], array.right[::-1]) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # all nan endpoints + other = interval_constructor([np.nan] * 4, [np.nan] * 4) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_interval_mixed_closed( + self, op, interval_constructor, closed, other_closed + ): + array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed) + other = interval_constructor(range(2), range(1, 3), closed=other_closed) + + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + ( + Interval(0, 1), + Interval(Timedelta("1 day"), Timedelta("2 days")), + Interval(4, 5, "both"), + Interval(10, 20, "neither"), + ), + (0, 1.5, Timestamp("20170103"), np.nan), + ( + Timestamp("20170102", tz="US/Eastern"), + Timedelta("2 days"), + "baz", + pd.NaT, + ), + ], + ) + def test_compare_list_like_object(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_nan(self, op, array, nulls_fixture): + other = [nulls_fixture] * 4 + result = op(array, other) + expected = 
self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + np.arange(4, dtype="int64"), + np.arange(4, dtype="float64"), + date_range("2017-01-01", periods=4), + date_range("2017-01-01", periods=4, tz="US/Eastern"), + timedelta_range("0 days", periods=4), + period_range("2017-01-01", periods=4, freq="D"), + Categorical(list("abab")), + Categorical(date_range("2017-01-01", periods=4)), + pd.array(list("abcd")), + pd.array(["foo", 3.14, None, object()]), + ], + ids=lambda x: str(x.dtype), + ) + def test_compare_list_like_other(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("length", [1, 3, 5]) + @pytest.mark.parametrize("other_constructor", [IntervalArray, list]) + def test_compare_length_mismatch_errors(self, op, other_constructor, length): + array = IntervalArray.from_arrays(range(4), range(1, 5)) + other = other_constructor([Interval(0, 1)] * length) + with pytest.raises(ValueError, match="Lengths must match to compare"): + op(array, other) + + @pytest.mark.parametrize( + "constructor, expected_type, assert_func", + [ + (IntervalIndex, np.array, tm.assert_numpy_array_equal), + (Series, Series, tm.assert_series_equal), + ], + ) + def test_index_series_compat(self, op, constructor, expected_type, assert_func): + # IntervalIndex/Series that rely on IntervalArray for comparisons + breaks = range(4) + index = constructor(IntervalIndex.from_breaks(breaks)) + + # scalar comparisons + other = index[0] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + other = breaks[0] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + # list-like comparisons + other = IntervalArray.from_breaks(breaks) + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + other = [index[0], breaks[0], "foo"] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 584e22f8488f5..f55e2b98ee912 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -5,14 +5,15 @@ from decimal import Decimal from itertools import combinations import operator +from typing import Any, List import numpy as np import pytest import pandas as pd from pandas import Index, Series, Timedelta, TimedeltaIndex +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm def adjust_negative_zero(zero, expected): @@ -30,6 +31,19 @@ def adjust_negative_zero(zero, expected): return expected +# TODO: remove this kludge once mypy stops giving false positives here +# List comprehension has incompatible type List[PandasObject]; expected List[RangeIndex] +# See GH#29725 +ser_or_index: List[Any] = [pd.Series, pd.Index] +lefts: List[Any] = [pd.RangeIndex(10, 40, 10)] +lefts.extend( + [ + cls([10, 20, 30], dtype=dtype) + for dtype in ["i1", "i2", "i4", "i8", "u1", "u2", "u4", "u8", "f2", "f4", "f8"] + for cls in ser_or_index + ] +) + # ------------------------------------------------------------------ # Comparisons @@ -51,13 +65,16 @@ def 
test_df_numeric_cmp_dt64_raises(self): # GH#8932, GH#22163 ts = pd.Timestamp.now() df = pd.DataFrame({"x": range(5)}) - with pytest.raises(TypeError): + + msg = "Invalid comparison between dtype=int64 and Timestamp" + + with pytest.raises(TypeError, match=msg): df > ts - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): df < ts - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): ts < df - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): ts > df assert not (df == ts).any().any() @@ -81,26 +98,7 @@ class TestNumericArraylikeArithmeticWithDatetimeLike: # TODO: also check name retentention @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) @pytest.mark.parametrize( - "left", - [pd.RangeIndex(10, 40, 10)] - + [ - cls([10, 20, 30], dtype=dtype) - for dtype in [ - "i1", - "i2", - "i4", - "i8", - "u1", - "u2", - "u4", - "u8", - "f2", - "f4", - "f8", - ] - for cls in [pd.Series, pd.Index] - ], - ids=lambda x: type(x).__name__ + str(x.dtype), + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype), ) def test_mul_td64arr(self, left, box_cls): # GH#22390 @@ -120,26 +118,7 @@ def test_mul_td64arr(self, left, box_cls): # TODO: also check name retentention @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) @pytest.mark.parametrize( - "left", - [pd.RangeIndex(10, 40, 10)] - + [ - cls([10, 20, 30], dtype=dtype) - for dtype in [ - "i1", - "i2", - "i4", - "i8", - "u1", - "u2", - "u4", - "u8", - "f2", - "f4", - "f8", - ] - for cls in [pd.Series, pd.Index] - ], - ids=lambda x: type(x).__name__ + str(x.dtype), + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype), ) def test_div_td64arr(self, left, box_cls): # GH#22390 diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index f9c1de115b3a4..799ef3492e53f 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -9,8 +9,8 @@ import pandas as pd from pandas import Series, Timestamp +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm # ------------------------------------------------------------------ # Comparisons diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index ed693d873efb8..abb667260f094 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -11,12 +11,14 @@ import pandas as pd from pandas import Period, PeriodIndex, Series, period_range +import pandas._testing as tm from pandas.core import ops from pandas.core.arrays import TimedeltaArray -import pandas.util.testing as tm from pandas.tseries.frequencies import to_offset +from .common import assert_invalid_comparison + # ------------------------------------------------------------------ # Comparisons @@ -39,11 +41,93 @@ def test_compare_zerodim(self, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) + @pytest.mark.parametrize( + "scalar", ["foo", pd.Timestamp.now(), pd.Timedelta(days=4)] + ) + def test_compare_invalid_scalar(self, box_with_array, scalar): + # comparison with scalar that cannot be interpreted as a Period + pi = pd.period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, scalar, box_with_array) + + @pytest.mark.parametrize( + "other", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("1D", periods=4).array, + np.arange(4), + 
np.arange(4).astype(np.float64), + list(range(4)), + ], + ) + def test_compare_invalid_listlike(self, box_with_array, other): + pi = pd.period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, other, box_with_array) + + @pytest.mark.parametrize("other_box", [list, np.array, lambda x: x.astype(object)]) + def test_compare_object_dtype(self, box_with_array, other_box): + pi = pd.period_range("2000", periods=5) + parr = tm.box_expected(pi, box_with_array) + + xbox = np.ndarray if box_with_array is pd.Index else box_with_array + + other = other_box(pi) + + expected = np.array([True, True, True, True, True]) + expected = tm.box_expected(expected, xbox) + + result = parr == other + tm.assert_equal(result, expected) + result = parr <= other + tm.assert_equal(result, expected) + result = parr >= other + tm.assert_equal(result, expected) + + result = parr != other + tm.assert_equal(result, ~expected) + result = parr < other + tm.assert_equal(result, ~expected) + result = parr > other + tm.assert_equal(result, ~expected) + + other = other_box(pi[::-1]) + + expected = np.array([False, False, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr == other + tm.assert_equal(result, expected) + + expected = np.array([True, True, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr <= other + tm.assert_equal(result, expected) + + expected = np.array([False, False, True, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr >= other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr != other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr < other + tm.assert_equal(result, expected) + + expected = np.array([False, False, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr > other + tm.assert_equal(result, expected) + class TestPeriodIndexComparisons: # TODO: parameterize over boxes - @pytest.mark.parametrize("other", ["2017", 2017]) + @pytest.mark.parametrize("other", ["2017", pd.Period("2017", freq="D")]) def test_eq(self, other): idx = PeriodIndex(["2017", "2017", "2018"], freq="D") expected = np.array([True, True, False]) @@ -51,6 +135,34 @@ def test_eq(self, other): tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( + "other", + [ + 2017, + [2017, 2017, 2017], + np.array([2017, 2017, 2017]), + np.array([2017, 2017, 2017], dtype=object), + pd.Index([2017, 2017, 2017]), + ], + ) + def test_eq_integer_disallowed(self, other): + # match Period semantics by not treating integers as Periods + + idx = PeriodIndex(["2017", "2017", "2018"], freq="D") + expected = np.array([False, False, False]) + result = idx == other + + tm.assert_numpy_array_equal(result, expected) + + with pytest.raises(TypeError): + idx < other + with pytest.raises(TypeError): + idx > other + with pytest.raises(TypeError): + idx <= other + with pytest.raises(TypeError): + idx >= other + def test_pi_cmp_period(self): idx = period_range("2007-01", periods=20, freq="M") @@ -168,9 +280,7 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): # TODO: Could parametrize over boxes for idx? 
idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") - rev_msg = ( - r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=A-DEC\)" - ) + rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=A-DEC\)" idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx @@ -184,7 +294,7 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): Period("2011", freq="4M") >= base idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="4M") - rev_msg = r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=4M\)" + rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=4M\)" idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx @@ -755,18 +865,18 @@ def test_pi_sub_isub_offset(self): rng -= pd.offsets.MonthEnd(5) tm.assert_index_equal(rng, expected) - def test_pi_add_offset_n_gt1(self, box_transpose_fail): + @pytest.mark.parametrize("transpose", [True, False]) + def test_pi_add_offset_n_gt1(self, box_with_array, transpose): # GH#23215 # add offset to PeriodIndex with freq.n > 1 - box, transpose = box_transpose_fail per = pd.Period("2016-01", freq="2M") pi = pd.PeriodIndex([per]) expected = pd.PeriodIndex(["2016-03"], freq="2M") - pi = tm.box_expected(pi, box, transpose=transpose) - expected = tm.box_expected(expected, box, transpose=transpose) + pi = tm.box_expected(pi, box_with_array, transpose=transpose) + expected = tm.box_expected(expected, box_with_array, transpose=transpose) result = pi + per.freq tm.assert_equal(result, expected) @@ -984,16 +1094,15 @@ def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, mismatched_freq): with pytest.raises(IncompatibleFrequency, match=msg): rng -= other - def test_parr_add_sub_td64_nat(self, box_transpose_fail): + @pytest.mark.parametrize("transpose", [True, False]) + def test_parr_add_sub_td64_nat(self, box_with_array, transpose): # GH#23320 special handling for timedelta64("NaT") - box, transpose = box_transpose_fail - pi = pd.period_range("1994-04-01", periods=9, freq="19D") other = np.timedelta64("NaT") expected = pd.PeriodIndex(["NaT"] * 9, freq="19D") - obj = tm.box_expected(pi, box, transpose=transpose) - expected = tm.box_expected(expected, box, transpose=transpose) + obj = tm.box_expected(pi, box_with_array, transpose=transpose) + expected = tm.box_expected(expected, box_with_array, transpose=transpose) result = obj + other tm.assert_equal(result, expected) @@ -1011,16 +1120,12 @@ def test_parr_add_sub_td64_nat(self, box_transpose_fail): TimedeltaArray._from_sequence(["NaT"] * 9), ], ) - def test_parr_add_sub_tdt64_nat_array(self, box_df_fail, other): - # FIXME: DataFrame fails because when when operating column-wise - # timedelta64 entries become NaT and are treated like datetimes - box = box_df_fail - + def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other): pi = pd.period_range("1994-04-01", periods=9, freq="19D") expected = pd.PeriodIndex(["NaT"] * 9, freq="19D") - obj = tm.box_expected(pi, box) - expected = tm.box_expected(expected, box) + obj = tm.box_expected(pi, box_with_array) + expected = tm.box_expected(expected, box_with_array) result = obj + other tm.assert_equal(result, expected) @@ -1043,6 +1148,26 @@ def test_parr_add_sub_index(self): expected = pi - pi tm.assert_index_equal(result, expected) + def test_parr_add_sub_object_array(self): + pi = pd.period_range("2000-12-31", periods=3, freq="D") + 
parr = pi.array + + other = np.array([pd.Timedelta(days=1), pd.offsets.Day(2), 3]) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr + other + + expected = pd.PeriodIndex( + ["2001-01-01", "2001-01-03", "2001-01-05"], freq="D" + ).array + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr - other + + expected = pd.PeriodIndex(["2000-12-30"] * 3, freq="D").array + tm.assert_equal(result, expected) + class TestPeriodSeriesArithmetic: def test_ops_series_timedelta(self): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index ecb07fa49036a..158da37aa7239 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas.errors import NullFrequencyError, OutOfBoundsDatetime, PerformanceWarning +from pandas.errors import OutOfBoundsDatetime, PerformanceWarning import pandas as pd from pandas import ( @@ -18,22 +18,12 @@ Timestamp, timedelta_range, ) -from pandas.tests.arithmetic.test_datetime64 import assert_invalid_comparison -import pandas.util.testing as tm - - -def get_upcast_box(box, vector): - """ - Given two box-types, find the one that takes priority - """ - if box is DataFrame or isinstance(vector, DataFrame): - return DataFrame - if box is Series or isinstance(vector, Series): - return Series - if box is pd.Index or isinstance(vector, pd.Index): - return pd.Index - return box - +import pandas._testing as tm +from pandas.tests.arithmetic.common import ( + assert_invalid_addsub_type, + assert_invalid_comparison, + get_upcast_box, +) # ------------------------------------------------------------------ # Timedelta64[ns] dtype Comparisons @@ -86,6 +76,49 @@ def test_td64_comparisons_invalid(self, box_with_array, invalid): assert_invalid_comparison(obj, invalid, box) + @pytest.mark.parametrize( + "other", + [ + list(range(10)), + np.arange(10), + np.arange(10).astype(np.float32), + np.arange(10).astype(object), + pd.date_range("1970-01-01", periods=10, tz="UTC").array, + np.array(pd.date_range("1970-01-01", periods=10)), + list(pd.date_range("1970-01-01", periods=10)), + pd.date_range("1970-01-01", periods=10).astype(object), + pd.period_range("1971-01-01", freq="D", periods=10).array, + pd.period_range("1971-01-01", freq="D", periods=10).astype(object), + ], + ) + def test_td64arr_cmp_arraylike_invalid(self, other): + # We don't parametrize this over box_with_array because listlike + # other plays poorly with assert_invalid_comparison reversed checks + + rng = timedelta_range("1 days", periods=10)._data + assert_invalid_comparison(rng, other, tm.to_array) + + def test_td64arr_cmp_mixed_invalid(self): + rng = timedelta_range("1 days", periods=5)._data + + other = np.array([0, 1, 2, rng[3], pd.Timestamp.now()]) + result = rng == other + expected = np.array([False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = rng != other + tm.assert_numpy_array_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + rng < other + with pytest.raises(TypeError, match=msg): + rng > other + with pytest.raises(TypeError, match=msg): + rng <= other + with pytest.raises(TypeError, match=msg): + rng >= other + class TestTimedelta64ArrayComparisons: # TODO: All of these need to be parametrized over box @@ -409,7 +442,7 @@ def test_addition_ops(self): 
tdi[0:1] + dti # random indexes - with pytest.raises(NullFrequencyError): + with pytest.raises(TypeError): tdi + pd.Int64Index([1, 2, 3]) # this is a union! @@ -484,6 +517,62 @@ def test_tda_add_sub_index(self): expected = tdi - tdi tm.assert_index_equal(result, expected) + # ------------------------------------------------------------- + # Binary operations TimedeltaIndex and timedelta-like + + def test_tdi_iadd_timedeltalike(self, two_hours): + # only test adding/sub offsets as + is now numeric + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") + rng += two_hours + tm.assert_index_equal(rng, expected) + + def test_tdi_isub_timedeltalike(self, two_hours): + # only test adding/sub offsets as - is now numeric + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") + rng -= two_hours + tm.assert_index_equal(rng, expected) + + # ------------------------------------------------------------- + + def test_tdi_ops_attributes(self): + rng = timedelta_range("2 days", periods=5, freq="2D", name="x") + + result = rng + 1 * rng.freq + exp = timedelta_range("4 days", periods=5, freq="2D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "2D" + + result = rng - 2 * rng.freq + exp = timedelta_range("-2 days", periods=5, freq="2D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "2D" + + result = rng * 2 + exp = timedelta_range("4 days", periods=5, freq="4D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "4D" + + result = rng / 2 + exp = timedelta_range("1 days", periods=5, freq="D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "D" + + result = -rng + exp = timedelta_range("-2 days", periods=5, freq="-2D", name="x") + tm.assert_index_equal(result, exp) + assert result.freq == "-2D" + + rng = pd.timedelta_range("-2 days", periods=5, freq="D", name="x") + + result = abs(rng) + exp = TimedeltaIndex( + ["2 days", "1 days", "0 days", "1 days", "2 days"], name="x" + ) + tm.assert_index_equal(result, exp) + assert result.freq is None + class TestAddSubNaTMasking: # TODO: parametrize over boxes @@ -555,37 +644,29 @@ def test_tdi_add_overflow(self): class TestTimedeltaArraylikeAddSubOps: # Tests for timedelta64[ns] __add__, __sub__, __radd__, __rsub__ - # TODO: moved from frame tests; needs parametrization/de-duplication - def test_td64_df_add_int_frame(self): - # GH#22696 Check that we don't dispatch to numpy implementation, - # which treats int64 as m8[ns] - tdi = pd.timedelta_range("1", periods=3) - df = tdi.to_frame() - other = pd.DataFrame([1, 2, 3], index=tdi) # indexed like `df` - with pytest.raises(TypeError): - df + other - with pytest.raises(TypeError): - other + df - with pytest.raises(TypeError): - df - other - with pytest.raises(TypeError): - other - df - # TODO: moved from tests.indexes.timedeltas.test_arithmetic; needs # parametrization+de-duplication def test_timedelta_ops_with_missing_values(self): # setup s1 = pd.to_timedelta(Series(["00:00:01"])) s2 = pd.to_timedelta(Series(["00:00:02"])) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # Passing datetime64-dtype data to TimedeltaIndex is deprecated - sn = pd.to_timedelta(Series([pd.NaT])) + + msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" + with pytest.raises(TypeError, match=msg): + # Passing datetime64-dtype data to TimedeltaIndex is no longer + # supported GH#29794 + 
pd.to_timedelta(Series([pd.NaT])) + + sn = pd.to_timedelta(Series([pd.NaT], dtype="m8[ns]")) df1 = pd.DataFrame(["00:00:01"]).apply(pd.to_timedelta) df2 = pd.DataFrame(["00:00:02"]).apply(pd.to_timedelta) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # Passing datetime64-dtype data to TimedeltaIndex is deprecated - dfn = pd.DataFrame([pd.NaT]).apply(pd.to_timedelta) + with pytest.raises(TypeError, match=msg): + # Passing datetime64-dtype data to TimedeltaIndex is no longer + # supported GH#29794 + pd.DataFrame([pd.NaT]).apply(pd.to_timedelta) + + dfn = pd.DataFrame([pd.NaT.value]).apply(pd.to_timedelta) scalar1 = pd.to_timedelta("00:00:01") scalar2 = pd.to_timedelta("00:00:02") @@ -817,51 +898,6 @@ def test_timedelta64_ops_nat(self): tm.assert_series_equal(timedelta_series / 2.0, Series([NaT, Timedelta("0.5s")])) tm.assert_series_equal(timedelta_series / np.nan, nat_series_dtype_timedelta) - # ------------------------------------------------------------- - # Invalid Operations - - @pytest.mark.parametrize("other", ["a", 3.14, np.array([2.0, 3.0])]) - def test_td64arr_add_sub_invalid(self, box_with_array, other): - # GH#13624 for str - tdi = TimedeltaIndex(["1 day", "2 days"]) - tdarr = tm.box_expected(tdi, box_with_array) - - with pytest.raises(TypeError): - tdarr + other - with pytest.raises(TypeError): - other + tdarr - with pytest.raises(TypeError): - tdarr - other - with pytest.raises(TypeError): - other - tdarr - - @pytest.mark.parametrize("freq", [None, "H"]) - def test_td64arr_sub_period(self, box_with_array, freq): - # GH#13078 - # not supported, check TypeError - p = pd.Period("2011-01-01", freq="D") - idx = TimedeltaIndex(["1 hours", "2 hours"], freq=freq) - idx = tm.box_expected(idx, box_with_array) - - with pytest.raises(TypeError): - idx - p - - with pytest.raises(TypeError): - p - idx - - @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) - @pytest.mark.parametrize("tdi_freq", [None, "H"]) - def test_td64arr_sub_pi(self, box_with_array, tdi_freq, pi_freq): - # GH#20049 subtracting PeriodIndex should raise TypeError - tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq) - dti = Timestamp("2018-03-07 17:16:40") + tdi - pi = dti.to_period(pi_freq) - - # TODO: parametrize over box for pi? - tdi = tm.box_expected(tdi, box_with_array) - with pytest.raises(TypeError): - tdi - pi - # ------------------------------------------------------------- # Binary operations td64 arraylike and datetime-like @@ -896,11 +932,16 @@ def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture): result = other + idx tm.assert_equal(result, expected) - def test_td64arr_add_sub_timestamp(self, box_with_array): - # GH#11925 - ts = Timestamp("2012-01-01") - # TODO: parametrize over types of datetime scalar? 
- + @pytest.mark.parametrize( + "ts", + [ + Timestamp("2012-01-01"), + Timestamp("2012-01-01").to_pydatetime(), + Timestamp("2012-01-01").to_datetime64(), + ], + ) + def test_td64arr_add_sub_datetimelike_scalar(self, ts, box_with_array): + # GH#11925, GH#29558 tdi = timedelta_range("1 day", periods=3) expected = pd.date_range("2012-01-02", periods=3) @@ -963,82 +1004,109 @@ def test_td64arr_add_datetime64_nat(self, box_with_array): tm.assert_equal(other + tdser, expected) # ------------------------------------------------------------------ - # Operations with int-like others + # Invalid __add__/__sub__ operations + + # TODO: moved from frame tests; needs parametrization/de-duplication + def test_td64_df_add_int_frame(self): + # GH#22696 Check that we don't dispatch to numpy implementation, + # which treats int64 as m8[ns] + tdi = pd.timedelta_range("1", periods=3) + df = tdi.to_frame() + other = pd.DataFrame([1, 2, 3], index=tdi) # indexed like `df` + assert_invalid_addsub_type(df, other) + + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("tdi_freq", [None, "H"]) + def test_td64arr_sub_periodlike(self, box_with_array, tdi_freq, pi_freq): + # GH#20049 subtracting PeriodIndex should raise TypeError + tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq) + dti = Timestamp("2018-03-07 17:16:40") + tdi + pi = dti.to_period(pi_freq) + + # TODO: parametrize over box for pi? + tdi = tm.box_expected(tdi, box_with_array) + with pytest.raises(TypeError): + tdi - pi + + # FIXME: don't leave commented-out + # FIXME: this raises with period scalar but not with PeriodIndex? + # with pytest.raises(TypeError): + # pi - tdi + + # GH#13078 subtraction of Period scalar not supported + with pytest.raises(TypeError): + tdi - pi[0] + with pytest.raises(TypeError): + pi[0] - tdi @pytest.mark.parametrize( "other", [ + # GH#12624 for str case + "a", # GH#19123 1, - Series([20, 30, 40], dtype="uint8"), - np.array([20, 30, 40], dtype="uint8"), - pd.UInt64Index([20, 30, 40]), - pd.Int64Index([20, 30, 40]), - Series([2, 3, 4]), 1.5, np.array(2), ], ) - def test_td64arr_addsub_numeric_invalid(self, box_with_array, other): - box = box_with_array + def test_td64arr_addsub_numeric_scalar_invalid(self, box_with_array, other): + # vector-like others are tested in test_td64arr_add_sub_numeric_arr_invalid tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") - tdser = tm.box_expected(tdser, box) + tdarr = tm.box_expected(tdser, box_with_array) - err = TypeError - if box in [pd.Index, tm.to_array] and not isinstance(other, float): - err = NullFrequencyError + assert_invalid_addsub_type(tdarr, other) - with pytest.raises(err): - tdser + other - with pytest.raises(err): - other + tdser - with pytest.raises(err): - tdser - other - with pytest.raises(err): - other - tdser - - @pytest.mark.parametrize( - "dtype", - [ - "int64", - "int32", - "int16", - "uint64", - "uint32", - "uint16", - "uint8", - "float64", - "float32", - "float16", - ], - ) @pytest.mark.parametrize( "vec", [ np.array([1, 2, 3]), pd.Index([1, 2, 3]), - Series([1, 2, 3]) - # TODO: Add DataFrame in here? 
+ Series([1, 2, 3]), + DataFrame([[1, 2, 3]]), ], ids=lambda x: type(x).__name__, ) - def test_td64arr_add_sub_numeric_arr_invalid(self, box_with_array, vec, dtype): - box = box_with_array + def test_td64arr_addsub_numeric_arr_invalid( + self, box_with_array, vec, any_real_dtype + ): tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") - tdser = tm.box_expected(tdser, box) - err = TypeError - if box in [pd.Index, tm.to_array] and not dtype.startswith("float"): - err = NullFrequencyError - - vector = vec.astype(dtype) - with pytest.raises(err): - tdser + vector - with pytest.raises(err): - vector + tdser - with pytest.raises(err): - tdser - vector - with pytest.raises(err): - vector - tdser + tdarr = tm.box_expected(tdser, box_with_array) + + vector = vec.astype(any_real_dtype) + assert_invalid_addsub_type(tdarr, vector) + + def test_td64arr_add_sub_int(self, box_with_array, one): + # Variants of `one` for #19012, deprecated GH#22535 + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) + tdarr = tm.box_expected(rng, box_with_array) + + msg = "Addition/subtraction of integers" + assert_invalid_addsub_type(tdarr, one, msg) + + # TOOD: get inplace ops into assert_invalid_addsub_type + with pytest.raises(TypeError, match=msg): + tdarr += one + with pytest.raises(TypeError, match=msg): + tdarr -= one + + def test_td64arr_add_sub_integer_array(self, box_with_array): + # GH#19959, deprecated GH#22535 + rng = timedelta_range("1 days 09:00:00", freq="H", periods=3) + tdarr = tm.box_expected(rng, box_with_array) + other = tm.box_expected([4, 3, 2], box_with_array) + + msg = "Addition/subtraction of integers and integer-arrays" + assert_invalid_addsub_type(tdarr, other, msg) + + def test_td64arr_addsub_integer_array_no_freq(self, box_with_array): + # GH#19959 + tdi = TimedeltaIndex(["1 Day", "NaT", "3 Hours"]) + tdarr = tm.box_expected(tdi, box_with_array) + other = tm.box_expected([14, -1, 16], box_with_array) + + msg = "Addition/subtraction of integers" + assert_invalid_addsub_type(tdarr, other, msg) # ------------------------------------------------------------------ # Operations with timedelta-like others @@ -1444,6 +1512,40 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): with tm.assert_produces_warning(PerformanceWarning): anchored - tdi + # ------------------------------------------------------------------ + # Unsorted + + def test_td64arr_add_sub_object_array(self, box_with_array): + tdi = pd.timedelta_range("1 day", periods=3, freq="D") + tdarr = tm.box_expected(tdi, box_with_array) + + other = np.array( + [pd.Timedelta(days=1), pd.offsets.Day(2), pd.Timestamp("2000-01-04")] + ) + + warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + with tm.assert_produces_warning(warn): + result = tdarr + other + + expected = pd.Index( + [pd.Timedelta(days=2), pd.Timedelta(days=4), pd.Timestamp("2000-01-07")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + with pytest.raises(TypeError): + with tm.assert_produces_warning(warn): + tdarr - other + + with tm.assert_produces_warning(warn): + result = other - tdarr + + expected = pd.Index( + [pd.Timedelta(0), pd.Timedelta(0), pd.Timestamp("2000-01-01")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + class TestTimedeltaArraylikeMulDivOps: # Tests for timedelta64[ns] @@ -1869,7 +1971,6 @@ def test_td64arr_mul_td64arr_raises(self, box_with_array): # 
------------------------------------------------------------------ # Operations with numeric others - @pytest.mark.parametrize("one", [1, np.array(1), 1.0, np.array(1.0)]) def test_td64arr_mul_numeric_scalar(self, box_with_array, one): # GH#4521 # divide/multiply by integers @@ -1908,33 +2009,18 @@ def test_td64arr_div_numeric_scalar(self, box_with_array, two): with pytest.raises(TypeError, match="Cannot divide"): two / tdser - @pytest.mark.parametrize( - "dtype", - [ - "int64", - "int32", - "int16", - "uint64", - "uint32", - "uint16", - "uint8", - "float64", - "float32", - "float16", - ], - ) @pytest.mark.parametrize( "vector", [np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])], ids=lambda x: type(x).__name__, ) - def test_td64arr_rmul_numeric_array(self, box_with_array, vector, dtype): + def test_td64arr_rmul_numeric_array(self, box_with_array, vector, any_real_dtype): # GH#4521 # divide/multiply by integers xbox = get_upcast_box(box_with_array, vector) tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") - vector = vector.astype(dtype) + vector = vector.astype(any_real_dtype) expected = Series(["1180 Days", "1770 Days", "NaT"], dtype="timedelta64[ns]") @@ -1947,32 +2033,19 @@ def test_td64arr_rmul_numeric_array(self, box_with_array, vector, dtype): result = vector * tdser tm.assert_equal(result, expected) - @pytest.mark.parametrize( - "dtype", - [ - "int64", - "int32", - "int16", - "uint64", - "uint32", - "uint16", - "uint8", - "float64", - "float32", - "float16", - ], - ) @pytest.mark.parametrize( "vector", [np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])], ids=lambda x: type(x).__name__, ) - def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): + def test_td64arr_div_numeric_array(self, box_with_array, vector, any_real_dtype): # GH#4521 # divide/multiply by integers xbox = get_upcast_box(box_with_array, vector) + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") - vector = vector.astype(dtype) + vector = vector.astype(any_real_dtype) + expected = Series(["2.95D", "1D 23H 12m", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 1508fef86ae62..52640044565fc 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("ordered", [True, False]) @@ -11,23 +11,23 @@ def test_factorize(categories, ordered): cat = pd.Categorical( ["b", "b", "a", "c", None], categories=categories, ordered=ordered ) - labels, uniques = pd.factorize(cat) - expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp) + codes, uniques = pd.factorize(cat) + expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp) expected_uniques = pd.Categorical( ["b", "a", "c"], categories=categories, ordered=ordered ) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_categorical_equal(uniques, expected_uniques) def test_factorized_sort(): cat = pd.Categorical(["b", "b", None, "a"]) - labels, uniques = pd.factorize(cat, sort=True) - expected_labels = np.array([1, 1, -1, 0], dtype=np.intp) + codes, uniques = pd.factorize(cat, sort=True) + expected_codes = np.array([1, 1, -1, 0], dtype=np.intp) expected_uniques = pd.Categorical(["a", "b"]) - 
tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_categorical_equal(uniques, expected_uniques) @@ -36,13 +36,13 @@ def test_factorized_sort_ordered(): ["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True ) - labels, uniques = pd.factorize(cat, sort=True) - expected_labels = np.array([0, 0, -1, 1], dtype=np.intp) + codes, uniques = pd.factorize(cat, sort=True) + expected_codes = np.array([0, 0, -1, 1], dtype=np.intp) expected_uniques = pd.Categorical( ["b", "a"], categories=["c", "b", "a"], ordered=True ) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_categorical_equal(uniques, expected_uniques) @@ -59,7 +59,29 @@ def test_isin_cats(): tm.assert_numpy_array_equal(expected, result) -@pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])]) +@pytest.mark.parametrize( + "to_replace, value, result, expected_error_msg", + [ + ("b", "c", ["a", "c"], "Categorical.categories are different"), + ("c", "d", ["a", "b"], None), + ("b", None, ["a", None], "Categorical.categories length are different"), + ], +) +def test_replace(to_replace, value, result, expected_error_msg): + # GH 26988 + cat = pd.Categorical(["a", "b"]) + expected = pd.Categorical(result) + result = cat.replace(to_replace, value) + tm.assert_categorical_equal(result, expected) + if to_replace == "b": # the "c" test is supposed to be unchanged + with pytest.raises(AssertionError, match=expected_error_msg): + # ensure non-inplace call does not affect original + tm.assert_categorical_equal(cat, expected) + cat.replace(to_replace, value, inplace=True) + tm.assert_categorical_equal(cat, expected) + + +@pytest.mark.parametrize("empty", [[], pd.Series(dtype=object), np.array([])]) def test_isin_empty(empty): s = pd.Categorical(["a", "b"]) expected = np.array([False, False], dtype=bool) @@ -71,10 +93,12 @@ def test_isin_empty(empty): class TestTake: # https://github.com/pandas-dev/pandas/issues/20664 - def test_take_warns(self): + def test_take_default_allow_fill(self): cat = pd.Categorical(["a", "b"]) - with tm.assert_produces_warning(FutureWarning): - cat.take([0, -1]) + with tm.assert_produces_warning(None): + result = cat.take([0, -1]) + + assert result.equals(cat) def test_take_positive_no_warning(self): cat = pd.Categorical(["a", "b"]) @@ -84,13 +108,21 @@ def test_take_positive_no_warning(self): def test_take_bounds(self, allow_fill): # https://github.com/pandas-dev/pandas/issues/20664 cat = pd.Categorical(["a", "b", "a"]) - with pytest.raises(IndexError): + if allow_fill: + msg = "indices are out-of-bounds" + else: + msg = "index 4 is out of bounds for size 3" + with pytest.raises(IndexError, match=msg): cat.take([4, 5], allow_fill=allow_fill) def test_take_empty(self, allow_fill): # https://github.com/pandas-dev/pandas/issues/20664 cat = pd.Categorical([], categories=["a", "b"]) - with pytest.raises(IndexError): + if allow_fill: + msg = "indices are out-of-bounds" + else: + msg = "cannot do a non-empty take from an empty axes" + with pytest.raises(IndexError, match=msg): cat.take([0], allow_fill=allow_fill) def test_positional_take(self, ordered_fixture): @@ -140,3 +172,12 @@ def test_take_fill_value_new_raises(self): xpr = r"'fill_value' \('d'\) is not in this Categorical's categories." 
with pytest.raises(TypeError, match=xpr): cat.take([0, 1, -1], fill_value="d", allow_fill=True) + + def test_take_nd_deprecated(self): + cat = pd.Categorical(["a", "b", "c"]) + with tm.assert_produces_warning(FutureWarning): + cat.take_nd([0, 1]) + + ci = pd.Index(cat) + with tm.assert_produces_warning(FutureWarning): + ci.take_nd([0, 1]) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 279f1492d7dad..90fcf12093909 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -5,22 +5,23 @@ from pandas.compat import PYPY -from pandas import Categorical, Index, Series +from pandas import Categorical, Index, NaT, Series, date_range +import pandas._testing as tm from pandas.api.types import is_scalar -import pandas.util.testing as tm class TestCategoricalAnalytics: - def test_min_max(self): - + @pytest.mark.parametrize("aggregation", ["min", "max"]) + def test_min_max_not_ordered_raises(self, aggregation): # unordered cats have no min/max cat = Categorical(["a", "b", "c", "d"], ordered=False) msg = "Categorical is not ordered for operation {}" - with pytest.raises(TypeError, match=msg.format("min")): - cat.min() - with pytest.raises(TypeError, match=msg.format("max")): - cat.max() + agg_func = getattr(cat, aggregation) + + with pytest.raises(TypeError, match=msg.format(aggregation)): + agg_func() + def test_min_max_ordered(self): cat = Categorical(["a", "b", "c", "d"], ordered=True) _min = cat.min() _max = cat.max() @@ -35,31 +36,66 @@ def test_min_max(self): assert _min == "d" assert _max == "a" + @pytest.mark.parametrize( + "categories,expected", + [ + (list("ABC"), np.NaN), + ([1, 2, 3], np.NaN), + pytest.param( + Series(date_range("2020-01-01", periods=3), dtype="category"), + NaT, + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/29962" + ), + ), + ], + ) + @pytest.mark.parametrize("aggregation", ["min", "max"]) + def test_min_max_ordered_empty(self, categories, expected, aggregation): + # GH 30227 + cat = Categorical([], categories=list("ABC"), ordered=True) + + agg_func = getattr(cat, aggregation) + result = agg_func() + assert result is expected + + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_with_nan(self, skipna): + # GH 25303 cat = Categorical( [np.nan, "b", "c", np.nan], categories=["d", "c", "b", "a"], ordered=True ) - _min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == "b" + _min = cat.min(skipna=skipna) + _max = cat.max(skipna=skipna) - _min = cat.min(numeric_only=True) - assert _min == "c" - _max = cat.max(numeric_only=True) - assert _max == "b" + if skipna is False: + assert np.isnan(_min) + assert np.isnan(_max) + else: + assert _min == "c" + assert _max == "b" cat = Categorical( [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True ) - _min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == 1 - - _min = cat.min(numeric_only=True) - assert _min == 2 - _max = cat.max(numeric_only=True) - assert _max == 1 + _min = cat.min(skipna=skipna) + _max = cat.max(skipna=skipna) + + if skipna is False: + assert np.isnan(_min) + assert np.isnan(_max) + else: + assert _min == 2 + assert _max == 1 + + @pytest.mark.parametrize("method", ["min", "max"]) + def test_deprecate_numeric_only_min_max(self, method): + # GH 25303 + cat = Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + with 
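A short sketch of the skipna semantics the reworked min/max tests describe, assuming a pandas version that accepts the skipna keyword; ordering follows the declared categories, and skipna=False propagates missing values:

import numpy as np
import pandas as pd

cat = pd.Categorical(
    [np.nan, "b", "c", np.nan], categories=["d", "c", "b", "a"], ordered=True
)
assert cat.min(skipna=True) == "c"      # "c" precedes "b" in the category order
assert cat.max(skipna=True) == "b"
assert np.isnan(cat.min(skipna=False))  # NaN propagates when not skipped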
tm.assert_produces_warning(expected_warning=FutureWarning): + getattr(cat, method)(numeric_only=True) @pytest.mark.parametrize( "values,categories,exp_mode", @@ -259,40 +295,42 @@ def test_map(self): # GH 12766: Return an index not an array tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64))) - def test_validate_inplace(self): + @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) + def test_validate_inplace_raises(self, value): cat = Categorical(["A", "B", "B", "C", "A"]) - invalid_values = [1, "True", [1, 2, 3], 5.0] - - for value in invalid_values: - with pytest.raises(ValueError): - cat.set_ordered(value=True, inplace=value) + msg = ( + 'For argument "inplace" expected type bool, ' + f"received type {type(value).__name__}" + ) + with pytest.raises(ValueError, match=msg): + cat.set_ordered(value=True, inplace=value) - with pytest.raises(ValueError): - cat.as_ordered(inplace=value) + with pytest.raises(ValueError, match=msg): + cat.as_ordered(inplace=value) - with pytest.raises(ValueError): - cat.as_unordered(inplace=value) + with pytest.raises(ValueError, match=msg): + cat.as_unordered(inplace=value) - with pytest.raises(ValueError): - cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value) + with pytest.raises(ValueError, match=msg): + cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value) - with pytest.raises(ValueError): - cat.rename_categories(["X", "Y", "Z"], inplace=value) + with pytest.raises(ValueError, match=msg): + cat.rename_categories(["X", "Y", "Z"], inplace=value) - with pytest.raises(ValueError): - cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value) + with pytest.raises(ValueError, match=msg): + cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value) - with pytest.raises(ValueError): - cat.add_categories(new_categories=["D", "E", "F"], inplace=value) + with pytest.raises(ValueError, match=msg): + cat.add_categories(new_categories=["D", "E", "F"], inplace=value) - with pytest.raises(ValueError): - cat.remove_categories(removals=["D", "E", "F"], inplace=value) + with pytest.raises(ValueError, match=msg): + cat.remove_categories(removals=["D", "E", "F"], inplace=value) - with pytest.raises(ValueError): - cat.remove_unused_categories(inplace=value) + with pytest.raises(ValueError, match=msg): + cat.remove_unused_categories(inplace=value) - with pytest.raises(ValueError): - cat.sort_values(inplace=value) + with pytest.raises(ValueError, match=msg): + cat.sort_values(inplace=value) def test_isna(self): exp = np.array([False, False, True]) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 42087b89a19b5..f49f70f5acf77 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -4,9 +4,9 @@ import pytest from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series +import pandas._testing as tm from pandas.core.arrays.categorical import _recode_for_categories from pandas.tests.arrays.categorical.common import TestCategorical -import pandas.util.testing as tm class TestCategoricalAPI: @@ -83,13 +83,15 @@ def test_rename_categories(self): ) tm.assert_index_equal(cat.categories, Index([1, 2, 3])) - # Lengthen - with pytest.raises(ValueError): - cat.rename_categories([1, 2, 3, 4]) - - # Shorten - with pytest.raises(ValueError): - cat.rename_categories([1, 2]) + @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]]) + def test_rename_categories_wrong_length_raises(self, 
new_categories): + cat = Categorical(["a", "b", "c", "a"]) + msg = ( + "new categories need to have the same number of items as the " + "old categories!" + ) + with pytest.raises(ValueError, match=msg): + cat.rename_categories(new_categories) def test_rename_categories_series(self): # https://github.com/pandas-dev/pandas/issues/17981 @@ -149,19 +151,19 @@ def test_reorder_categories(self): assert res is None tm.assert_categorical_equal(cat, new) - # not all "old" included in "new" + @pytest.mark.parametrize( + "new_categories", + [ + ["a"], # not all "old" included in "new" + ["a", "b", "d"], # still not all "old" in "new" + ["a", "b", "c", "d"], # all "old" included in "new", but too long + ], + ) + def test_reorder_categories_raises(self, new_categories): cat = Categorical(["a", "b", "c", "a"], ordered=True) - - with pytest.raises(ValueError): - cat.reorder_categories(["a"]) - - # still not all "old" in "new" - with pytest.raises(ValueError): - cat.reorder_categories(["a", "b", "d"]) - - # all "old" included in "new", but too long - with pytest.raises(ValueError): - cat.reorder_categories(["a", "b", "c", "d"]) + msg = "items in new_categories are not the same as in old categories" + with pytest.raises(ValueError, match=msg): + cat.reorder_categories(new_categories) def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) @@ -184,10 +186,6 @@ def test_add_categories(self): tm.assert_categorical_equal(cat, new) assert res is None - # new is in old categories - with pytest.raises(ValueError): - cat.add_categories(["d"]) - # GH 9927 cat = Categorical(list("abc"), ordered=True) expected = Categorical(list("abc"), categories=list("abcde"), ordered=True) @@ -201,6 +199,13 @@ def test_add_categories(self): res = cat.add_categories(["d", "e"]) tm.assert_categorical_equal(res, expected) + def test_add_categories_existing_raises(self): + # new is in old categories + cat = Categorical(["a", "b", "c", "d"], ordered=True) + msg = re.escape("new categories must not include old categories: {'d'}") + with pytest.raises(ValueError, match=msg): + cat.add_categories(["d"]) + def test_set_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) exp_categories = Index(["c", "b", "a"]) @@ -453,13 +458,13 @@ def test_codes_immutable(self): tm.assert_numpy_array_equal(c.codes, exp) # Assignments to codes should raise - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="cannot set Categorical codes directly"): c.codes = np.array([0, 1, 2, 0, 1], dtype="int8") # changes in the codes array should raise codes = c.codes - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="assignment destination is read-only"): codes[4] = 1 # But even after getting the codes, the original array should still be @@ -504,9 +509,3 @@ def test_recode_to_categories_large(self): new = Index(expected) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected) - - def test_deprecated_get_values(self): - cat = Categorical(["a", "b", "c", "a"]) - with tm.assert_produces_warning(FutureWarning): - res = cat.get_values() - tm.assert_numpy_array_equal(res, np.array(cat)) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 237ec17f56974..70a23e9748dd1 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -16,6 +16,7 @@ Index, Interval, IntervalIndex, + MultiIndex, NaT, Series, 
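A sketch of the validation that the category-mutation tests above now match on explicitly: renaming must preserve the number of categories, and added categories must be genuinely new. Values here are illustrative only:

import pandas as pd

cat = pd.Categorical(["a", "b", "c", "a"])

renamed = cat.rename_categories(["x", "y", "z"])   # same length: fine
print(renamed.categories.tolist())                 # ['x', 'y', 'z']

try:
    cat.rename_categories(["x", "y"])              # wrong length -> ValueError
except ValueError as err:
    print(err)

try:
    cat.add_categories(["a"])                      # already a category -> ValueError
except ValueError as err:
    print(err)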
Timestamp, @@ -23,7 +24,7 @@ period_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalConstructors: @@ -276,23 +277,19 @@ def test_constructor_with_index(self): def test_constructor_with_generator(self): # This was raising an Error in isna(single_val).any() because isna # returned a scalar for a generator - xrange = range exp = Categorical([0, 1, 2]) cat = Categorical((x for x in [0, 1, 2])) tm.assert_categorical_equal(cat, exp) - cat = Categorical(xrange(3)) + cat = Categorical(range(3)) tm.assert_categorical_equal(cat, exp) - # This uses xrange internally - from pandas.core.index import MultiIndex - MultiIndex.from_product([range(5), ["a", "b", "c"]]) # check that categories accept generators and sequences cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2])) tm.assert_categorical_equal(cat, exp) - cat = Categorical([0, 1, 2], categories=xrange(3)) + cat = Categorical([0, 1, 2], categories=range(3)) tm.assert_categorical_equal(cat, exp) @pytest.mark.parametrize( @@ -311,7 +308,7 @@ def test_constructor_with_datetimelike(self, dtl): c = Categorical(s) expected = type(dtl)(s) - expected.freq = None + expected._data.freq = None tm.assert_index_equal(c.categories, expected) tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8")) @@ -322,7 +319,7 @@ def test_constructor_with_datetimelike(self, dtl): c = Categorical(s2) expected = type(dtl)(s2.dropna()) - expected.freq = None + expected._data.freq = None tm.assert_index_equal(c.categories, expected) @@ -526,13 +523,14 @@ def test_from_codes_with_float(self): codes = [1.0, 2.0, 0] # integer, but in float dtype dtype = CategoricalDtype(categories=["a", "b", "c"]) - with tm.assert_produces_warning(FutureWarning): - cat = Categorical.from_codes(codes, dtype.categories) - tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="i1")) + # empty codes should not raise for floats + Categorical.from_codes([], dtype.categories) + + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype.categories) - with tm.assert_produces_warning(FutureWarning): - cat = Categorical.from_codes(codes, dtype=dtype) - tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="i1")) + with pytest.raises(ValueError, match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype=dtype) codes = [1.1, 2.0, 0] # non-integer with pytest.raises(ValueError, match="codes need to be array-like integers"): diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index c08ad1da38671..19746d7d72162 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -4,7 +4,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical, CategoricalIndex, Index, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalDtypes: @@ -161,14 +161,6 @@ def test_astype_category(self, dtype_ordered, cat_ordered): expected = cat tm.assert_categorical_equal(result, expected) - def test_astype_category_ordered_none_deprecated(self): - # GH 26336 - cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) - cdt2 = CategoricalDtype(categories=list("cedafb")) - cat = Categorical(list("abcdaba"), dtype=cdt1) - with tm.assert_produces_warning(FutureWarning): - cat.astype(cdt2) - def test_iter_python_types(self): # GH-19909 cat = Categorical([1, 2]) diff 
--git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 6edd7fd00b707..85d5a6a3dc3ac 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -3,9 +3,9 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series +import pandas._testing as tm import pandas.core.common as com from pandas.tests.arrays.categorical.common import TestCategorical -import pandas.util.testing as tm class TestCategoricalIndexingWithFactor(TestCategorical): @@ -63,7 +63,8 @@ def test_setitem_different_unordered_raises(self, other): # GH-24142 target = pd.Categorical(["a", "b"], categories=["a", "b"]) mask = np.array([True, False]) - with pytest.raises(ValueError): + msg = "Cannot set a Categorical with another, without identical categories" + with pytest.raises(ValueError, match=msg): target[mask] = other[mask] @pytest.mark.parametrize( @@ -78,8 +79,8 @@ def test_setitem_same_ordered_rasies(self, other): # Gh-24142 target = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=True) mask = np.array([True, False]) - - with pytest.raises(ValueError): + msg = "Cannot set a Categorical with another, without identical categories" + with pytest.raises(ValueError, match=msg): target[mask] = other[mask] @@ -152,13 +153,15 @@ def test_categories_assigments(self): tm.assert_numpy_array_equal(s.__array__(), exp) tm.assert_index_equal(s.categories, Index([1, 2, 3])) - # lengthen - with pytest.raises(ValueError): - s.categories = [1, 2, 3, 4] - - # shorten - with pytest.raises(ValueError): - s.categories = [1, 2] + @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]]) + def test_categories_assigments_wrong_length_raises(self, new_categories): + cat = Categorical(["a", "b", "c", "a"]) + msg = ( + "new categories need to have the same number of items " + "as the old categories!" 
+ ) + with pytest.raises(ValueError, match=msg): + cat.categories = new_categories # Combinations of sorted/unique: @pytest.mark.parametrize( @@ -206,13 +209,11 @@ def test_where_other_categorical(self): expected = pd.Series(Categorical(["a", "c", "c"], dtype=ser.dtype)) tm.assert_series_equal(result, expected) - def test_where_warns(self): + def test_where_new_category_raises(self): ser = pd.Series(Categorical(["a", "b", "c"])) - with tm.assert_produces_warning(FutureWarning): - result = ser.where([True, False, True], "d") - - expected = pd.Series(np.array(["a", "d", "c"], dtype="object")) - tm.assert_series_equal(result, expected) + msg = "Cannot setitem on a Categorical with a new category" + with pytest.raises(ValueError, match=msg): + ser.where([True, False, True], "d") def test_where_ordered_differs_rasies(self): ser = pd.Series( @@ -221,11 +222,8 @@ def test_where_ordered_differs_rasies(self): other = Categorical( ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True ) - with tm.assert_produces_warning(FutureWarning): - result = ser.where([True, False, True], other) - - expected = pd.Series(np.array(["a", "c", "c"], dtype=object)) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match="without identical categories"): + ser.where([True, False, True], other) @pytest.mark.parametrize("index", [True, False]) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 3037ac79cd592..211bf091ee17d 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical, Index, Series, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalMissing: diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 22c1d5373372a..8643e7f6f89c1 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import Categorical, DataFrame, Series, date_range +import pandas._testing as tm from pandas.tests.arrays.categorical.common import TestCategorical -import pandas.util.testing as tm class TestCategoricalOpsWithFactor(TestCategorical): @@ -48,7 +48,7 @@ def test_comparisons(self): tm.assert_numpy_array_equal(result, expected) result = self.factor == "d" - expected = np.repeat(False, len(self.factor)) + expected = np.zeros(len(self.factor), dtype=bool) tm.assert_numpy_array_equal(result, expected) # comparisons with categoricals @@ -73,19 +73,25 @@ def test_comparisons(self): tm.assert_numpy_array_equal(res, exp) # Only categories with same categories can be compared - with pytest.raises(TypeError): + msg = "Categoricals can only be compared if 'categories' are the same" + with pytest.raises(TypeError, match=msg): cat > cat_rev cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"]) - with pytest.raises(TypeError): + msg = ( + "Categoricals can only be compared if 'categories' are the same. 
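The comparison rules these hunks spell out in their match= strings, as a runnable sketch with illustrative data; equality against a non-category scalar is simply all-False, while ordering comparisons require identical categories:

import pandas as pd

a = pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"], ordered=True)
b = pd.Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True)

print(a > a[0])   # element-wise, by category order: [False  True  True]
print(a == "d")   # "d" is not a category: all False

try:
    a > b         # different category order -> TypeError
except TypeError as err:
    print(err)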
" + "Categories are different lengths" + ) + with pytest.raises(TypeError, match=msg): cat_rev > cat_rev_base2 # Only categories with same ordering information can be compared cat_unorderd = cat.set_ordered(False) assert not (cat > cat).any() - with pytest.raises(TypeError): + msg = "Categoricals can only be compared if 'ordered' is the same" + with pytest.raises(TypeError, match=msg): cat > cat_unorderd # comparison (in both directions) with Series will raise @@ -131,18 +137,6 @@ def test_compare_frame(self): df = DataFrame(cat) - for op in [ - operator.eq, - operator.ne, - operator.ge, - operator.gt, - operator.le, - operator.lt, - ]: - with pytest.raises(ValueError): - # alignment raises unless we transpose - op(cat, df) - result = cat == df.T expected = DataFrame([[True, True, True, True]]) tm.assert_frame_equal(result, expected) @@ -151,6 +145,15 @@ def test_compare_frame(self): expected = DataFrame([[False, True, True, False]]) tm.assert_frame_equal(result, expected) + def test_compare_frame_raises(self, all_compare_operators): + # alignment raises unless we transpose + op = getattr(operator, all_compare_operators) + cat = Categorical(["a", "b", 2, "a"]) + df = DataFrame(cat) + msg = "Unable to coerce to Series, length must be 1: given 4" + with pytest.raises(ValueError, match=msg): + op(cat, df) + def test_datetime_categorical_comparison(self): dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True) tm.assert_numpy_array_equal(dt_cat > dt_cat[0], np.array([False, True, True])) @@ -169,8 +172,8 @@ def test_comparison_with_unknown_scalars(self): cat = Categorical([1, 2, 3], ordered=True) msg = ( - "Cannot compare a Categorical for op __{}__ with a scalar," - " which is not a category" + "Cannot compare a Categorical for op __{}__ with a scalar, " + "which is not a category" ) with pytest.raises(TypeError, match=msg.format("lt")): cat < 4 @@ -255,7 +258,8 @@ def test_comparisons(self, data, reverse, base): tm.assert_numpy_array_equal(res_rev.values, exp_rev2) # Only categories with same categories can be compared - with pytest.raises(TypeError): + msg = "Categoricals can only be compared if 'categories' are the same" + with pytest.raises(TypeError, match=msg): cat > cat_rev # categorical cannot be compared to Series or numpy array, and also @@ -367,7 +371,9 @@ def test_numeric_like_ops(self): # numpy ops s = Series(Categorical([1, 2, 3, 4])) - with pytest.raises(TypeError): + with pytest.raises( + TypeError, match="Categorical cannot perform the operation sum" + ): np.sum(s) # numeric ops on a Series @@ -384,7 +390,8 @@ def test_numeric_like_ops(self): getattr(s, op)(2) # invalid ufunc - with pytest.raises(TypeError): + msg = "Object with dtype category cannot perform the numpy op log" + with pytest.raises(TypeError, match=msg): np.log(s) def test_contains(self): @@ -394,7 +401,7 @@ def test_contains(self): assert "b" in c assert "z" not in c assert np.nan not in c - with pytest.raises(TypeError): + with pytest.raises(TypeError, match="unhashable type: 'list'"): assert [1] in c # assert codes NOT in index diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 9321813b42b33..d08c4b47dd3cb 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -147,8 +147,6 @@ def test_categorical_repr_datetime(self): idx = date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx) - # TODO(wesm): exceeding 80 characters in the console is not good - # behavior exp 
= ( "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, " "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n" diff --git a/pandas/tests/arrays/categorical/test_sorting.py b/pandas/tests/arrays/categorical/test_sorting.py index a0b09e19ece6e..2a0ef043bf9a9 100644 --- a/pandas/tests/arrays/categorical/test_sorting.py +++ b/pandas/tests/arrays/categorical/test_sorting.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, Index -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalSort: diff --git a/pandas/tests/arrays/categorical/test_subclass.py b/pandas/tests/arrays/categorical/test_subclass.py index cfc7b8541302f..b80d0ff41aba6 100644 --- a/pandas/tests/arrays/categorical/test_subclass.py +++ b/pandas/tests/arrays/categorical/test_subclass.py @@ -1,5 +1,5 @@ from pandas import Categorical -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalSubclassing: diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index 53733770ed954..f66c327e9967d 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -1,29 +1,19 @@ import pytest -import pandas as pd -import pandas.util.testing as tm +from pandas.util._test_decorators import async_mark + +import pandas._testing as tm class TestCategoricalWarnings: - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; c = Categorical([])" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("c.", 1)) - - def test_CategoricalAccessor_categorical_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - pd.Series(["a", "b"], dtype="category").cat.categorical - - def test_CategoricalAccessor_name_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - pd.Series(["a", "b"], dtype="category").cat.name - - def test_CategoricalAccessor_index_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - pd.Series(["a", "b"], dtype="category").cat.index diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 655a6e717119b..e046d87780bb4 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Index, @@ -11,8 +13,8 @@ date_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import IntervalArray -import pandas.util.testing as tm @pytest.fixture( @@ -103,3 +105,110 @@ def test_repr(): "Length: 2, closed: right, dtype: interval[int64]" ) assert result == expected + + +# ---------------------------------------------------------------------------- +# Arrow interaction + + +pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.15.1.dev") + + +@pyarrow_skip +def test_arrow_extension_type(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + p1 = ArrowIntervalType(pa.int64(), "left") + p2 = ArrowIntervalType(pa.int64(), "left") + p3 = ArrowIntervalType(pa.int64(), "right") + + assert p1.closed == 
"left" + assert p1 == p2 + assert not p1 == p3 + assert hash(p1) == hash(p2) + assert not hash(p1) == hash(p3) + + +@pyarrow_skip +def test_arrow_array(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + intervals = pd.interval_range(1, 5, freq=1).array + + result = pa.array(intervals) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == intervals.closed + assert result.type.subtype == pa.int64() + assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) + + expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(intervals, type=expected.type) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError): + pa.array(intervals, type="float64") + + with pytest.raises(TypeError, match="different 'subtype'"): + pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) + + +@pyarrow_skip +def test_arrow_array_missing(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + arr = IntervalArray.from_breaks([0, 1, 2, 3]) + arr[1] = None + + result = pa.array(arr) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == arr.closed + assert result.type.subtype == pa.float64() + + # fields have missing values (not NaN) + left = pa.array([0.0, None, 2.0], type="float64") + right = pa.array([1.0, None, 3.0], type="float64") + assert result.storage.field("left").equals(left) + assert result.storage.field("right").equals(right) + + # structarray itself also has missing values on the array level + vals = [ + {"left": 0.0, "right": 1.0}, + {"left": None, "right": None}, + {"left": 2.0, "right": 3.0}, + ] + expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + assert result.storage.equals(expected) + + +@pyarrow_skip +@pytest.mark.parametrize( + "breaks", + [[0, 1, 2, 3], pd.date_range("2017", periods=4, freq="D")], + ids=["int", "datetime64[ns]"], +) +def test_arrow_table_roundtrip(breaks): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowIntervalType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/interval/test_ops.py b/pandas/tests/arrays/interval/test_ops.py index 43601ea301568..b4de80dc00a4e 100644 --- a/pandas/tests/arrays/interval/test_ops.py +++ b/pandas/tests/arrays/interval/test_ops.py @@ -3,8 +3,8 @@ import pytest from pandas import Interval, IntervalIndex, Timedelta, Timestamp +import pandas._testing as tm from pandas.core.arrays import IntervalArray -import pandas.util.testing as tm @pytest.fixture(params=[IntervalArray, IntervalIndex]) @@ -83,8 +83,6 @@ def test_overlaps_na(self, constructor, start_shift): ) def test_overlaps_invalid_type(self, constructor, other): interval_container = constructor.from_breaks(range(5)) - msg = "`other` must be Interval-like, got {other}".format( - 
other=type(other).__name__ - ) + msg = f"`other` must be Interval-like, got {type(other).__name__}" with pytest.raises(TypeError, match=msg): interval_container.overlaps(other) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index eab174862818c..d8a1831cd61ec 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -6,7 +6,8 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype class TestSeriesAccessor: @@ -31,7 +32,7 @@ def test_accessor_raises(self): def test_from_spmatrix(self, format, labels, dtype): import scipy.sparse - sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item()) + sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) mat = scipy.sparse.eye(10, format=format, dtype=dtype) result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) @@ -48,7 +49,7 @@ def test_from_spmatrix(self, format, labels, dtype): def test_from_spmatrix_columns(self, columns): import scipy.sparse - dtype = pd.SparseDtype("float64", 0.0) + dtype = SparseDtype("float64", 0.0) mat = scipy.sparse.random(10, 2, density=0.5) result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) @@ -67,9 +68,9 @@ def test_to_coo(self): def test_to_dense(self): df = pd.DataFrame( { - "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 0)), - "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 1)), - "C": pd.SparseArray([1.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)), + "A": SparseArray([1, 0], dtype=SparseDtype("int64", 0)), + "B": SparseArray([1, 0], dtype=SparseDtype("int64", 1)), + "C": SparseArray([1.0, 0.0], dtype=SparseDtype("float64", 0.0)), }, index=["b", "a"], ) @@ -82,8 +83,8 @@ def test_to_dense(self): def test_density(self): df = pd.DataFrame( { - "A": pd.SparseArray([1, 0, 2, 1], fill_value=0), - "B": pd.SparseArray([0, 1, 1, 1], fill_value=0), + "A": SparseArray([1, 0, 2, 1], fill_value=0), + "B": SparseArray([0, 1, 1, 1], fill_value=0), } ) res = df.sparse.density @@ -99,9 +100,7 @@ def test_series_from_coo(self, dtype, dense_index): A = scipy.sparse.eye(3, format="coo", dtype=dtype) result = pd.Series.sparse.from_coo(A, dense_index=dense_index) index = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) - expected = pd.Series( - pd.SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index - ) + expected = pd.Series(SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index) if dense_index: expected = expected.reindex(pd.MultiIndex.from_product(index.levels)) @@ -117,3 +116,8 @@ def test_series_from_coo_incorrect_format_raises(self): TypeError, match="Expected coo_matrix. Got csr_matrix instead." 
): pd.Series.sparse.from_coo(m) + + def test_with_column_named_sparse(self): + # https://github.com/pandas-dev/pandas/issues/30758 + df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) + assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index f1d2803ce5505..76442a63ccb0f 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -4,9 +4,9 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core import ops -from pandas.core.arrays.sparse import SparseDtype -import pandas.util.testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype @pytest.fixture(params=["integer", "block"]) @@ -24,7 +24,7 @@ def mix(request): class TestSparseArrayArithmetics: _base = np.array - _klass = pd.SparseArray + _klass = SparseArray def _assert(self, a, b): tm.assert_numpy_array_equal(a, b) @@ -391,15 +391,15 @@ def test_mixed_array_comparison(self, kind): @pytest.mark.parametrize("op", [operator.eq, operator.add]) def test_with_list(op): - arr = pd.SparseArray([0, 1], fill_value=0) + arr = SparseArray([0, 1], fill_value=0) result = op(arr, [0, 1]) - expected = op(arr, pd.SparseArray([0, 1])) + expected = op(arr, SparseArray([0, 1])) tm.assert_sp_array_equal(result, expected) def test_with_dataframe(): # GH#27910 - arr = pd.SparseArray([0, 1], fill_value=0) + arr = SparseArray([0, 1], fill_value=0) df = pd.DataFrame([[1, 2], [3, 4]]) result = arr.__add__(df) assert result is NotImplemented @@ -407,7 +407,7 @@ def test_with_dataframe(): def test_with_zerodim_ndarray(): # GH#27910 - arr = pd.SparseArray([0, 1], fill_value=0) + arr = SparseArray([0, 1], fill_value=0) result = arr * np.array(2) expected = arr * 2 @@ -416,23 +416,23 @@ def test_with_zerodim_ndarray(): @pytest.mark.parametrize("ufunc", [np.abs, np.exp]) @pytest.mark.parametrize( - "arr", [pd.SparseArray([0, 0, -1, 1]), pd.SparseArray([None, None, -1, 1])] + "arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])] ) def test_ufuncs(ufunc, arr): result = ufunc(arr) fill_value = ufunc(arr.fill_value) - expected = pd.SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value) + expected = SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value) tm.assert_sp_array_equal(result, expected) @pytest.mark.parametrize( "a, b", [ - (pd.SparseArray([0, 0, 0]), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0]), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), ], ) @pytest.mark.parametrize("ufunc", [np.add, np.greater]) @@ -440,12 +440,12 @@ def test_binary_ufuncs(ufunc, a, b): # can't say anything about fill value here. 
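For the SparseArray arithmetic covered here, a compact sketch of how fill_value interacts with NumPy ufuncs; the import path matches the one used in the diff, and the data is illustrative:

import numpy as np
from pandas.core.arrays.sparse import SparseArray

arr = SparseArray([0, 0, -1, 1], fill_value=0)
print(arr.sp_values)   # only the non-fill entries are stored: [-1  1]
print(arr.density)     # 0.5

# A ufunc is applied to the stored values and to fill_value itself,
# so the result stays sparse with fill_value == abs(0) == 0.
print(np.abs(arr))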
result = ufunc(a, b) expected = ufunc(np.asarray(a), np.asarray(b)) - assert isinstance(result, pd.SparseArray) + assert isinstance(result, SparseArray) tm.assert_numpy_array_equal(np.asarray(result), expected) def test_ndarray_inplace(): - sparray = pd.SparseArray([0, 2, 0, 0]) + sparray = SparseArray([0, 2, 0, 0]) ndarray = np.array([0, 1, 2, 3]) ndarray += sparray expected = np.array([0, 3, 2, 3]) @@ -453,19 +453,19 @@ def test_ndarray_inplace(): def test_sparray_inplace(): - sparray = pd.SparseArray([0, 2, 0, 0]) + sparray = SparseArray([0, 2, 0, 0]) ndarray = np.array([0, 1, 2, 3]) sparray += ndarray - expected = pd.SparseArray([0, 3, 2, 3], fill_value=0) + expected = SparseArray([0, 3, 2, 3], fill_value=0) tm.assert_sp_array_equal(sparray, expected) @pytest.mark.parametrize("fill_value", [True, False]) def test_invert(fill_value): arr = np.array([True, False, False, True]) - sparray = pd.SparseArray(arr, fill_value=fill_value) + sparray = SparseArray(arr, fill_value=fill_value) result = ~sparray - expected = pd.SparseArray(~arr, fill_value=not fill_value) + expected = SparseArray(~arr, fill_value=not fill_value) tm.assert_sp_array_equal(result, expected) @@ -473,7 +473,7 @@ def test_invert(fill_value): @pytest.mark.parametrize("op", [operator.pos, operator.neg]) def test_unary_op(op, fill_value): arr = np.array([0, 1, np.nan, 2]) - sparray = pd.SparseArray(arr, fill_value=fill_value) + sparray = SparseArray(arr, fill_value=fill_value) result = op(sparray) - expected = pd.SparseArray(op(arr), fill_value=op(fill_value)) + expected = SparseArray(op(arr), fill_value=op(fill_value)) tm.assert_sp_array_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index f9bb4981df7df..baca18239b929 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -10,8 +10,8 @@ import pandas as pd from pandas import isna +import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray, SparseDtype -import pandas.util.testing as tm @pytest.fixture(params=["integer", "block"]) @@ -307,11 +307,12 @@ def test_take_filling(self): with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -5]), allow_fill=True) - with pytest.raises(IndexError): + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): sparse.take(np.array([1, -6])) - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): sparse.take(np.array([1, 5])) - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): sparse.take(np.array([1, 5]), allow_fill=True) def test_take_filling_fill_value(self): @@ -340,11 +341,12 @@ def test_take_filling_fill_value(self): with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -5]), allow_fill=True) - with pytest.raises(IndexError): + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, match=msg): sparse.take(np.array([1, -6])) - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): sparse.take(np.array([1, 5])) - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): sparse.take(np.array([1, 5]), fill_value=True) def test_take_filling_all_nan(self): @@ -358,11 +360,12 @@ def test_take_filling_all_nan(self): expected = SparseArray([np.nan, np.nan, np.nan], kind="block") tm.assert_sp_array_equal(result, expected) - with pytest.raises(IndexError): + msg = "out of bounds value in 'indices'" + with pytest.raises(IndexError, 
match=msg): sparse.take(np.array([1, -6])) - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): sparse.take(np.array([1, 5])) - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): sparse.take(np.array([1, 5]), fill_value=True) def test_set_item(self): @@ -467,7 +470,7 @@ def test_astype(self): arr.astype("Sparse[i8]") def test_astype_bool(self): - a = pd.SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) + a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) result = a.astype(bool) expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0)) tm.assert_sp_array_equal(result, expected) @@ -617,8 +620,7 @@ def test_dense_repr(self, vals, fill_value): res = arr.to_dense() tm.assert_numpy_array_equal(res, vals) - with tm.assert_produces_warning(FutureWarning): - res2 = arr.get_values() + res2 = arr._internal_get_values() tm.assert_numpy_array_equal(res2, vals) @@ -658,24 +660,29 @@ def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) - res = sparse[4:,] # noqa: E231 + res = sparse[ + 4:, + ] # noqa: E231 exp = SparseArray(dense[4:,]) # noqa: E231 tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) - res = sparse[4:,] # noqa: E231 + res = sparse[ + 4:, + ] # noqa: E231 exp = SparseArray(dense[4:,], fill_value=0) # noqa: E231 tm.assert_sp_array_equal(res, exp) - with pytest.raises(IndexError): + msg = "too many indices for array" + with pytest.raises(IndexError, match=msg): sparse[4:, :] - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): # check numpy compat dense[4:, :] def test_boolean_slice_empty(self): - arr = pd.SparseArray([0, 1, 2]) + arr = SparseArray([0, 1, 2]) res = arr[[False, False, False]] assert res.dtype == arr.dtype @@ -821,13 +828,13 @@ def test_fillna_overlap(self): def test_nonzero(self): # Tests regression #21172. - sa = pd.SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + sa = SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) expected = np.array([2, 5, 9], dtype=np.int32) - result, = sa.nonzero() + (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) - sa = pd.SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) - result, = sa.nonzero() + sa = SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) @@ -1003,7 +1010,7 @@ def test_cumsum(self, data, expected, numpy): np.cumsum(SparseArray(data), out=out) else: axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid. 
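A small sketch of the cumsum axis check whose message the test above now matches on; SparseArray is one-dimensional, so only axis=0 is accepted (data here is illustrative):

import numpy as np
from pandas.core.arrays.sparse import SparseArray

arr = SparseArray([1, 0, 0, 2], fill_value=0)
print(arr.cumsum())    # stays a SparseArray

try:
    arr.cumsum(axis=1)  # only axis=0 is valid for a 1-D array
except ValueError as err:
    print(err)          # "axis(=1) out of bounds"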
- msg = "axis\\(={axis}\\) out of bounds".format(axis=axis) + msg = re.escape(f"axis(={axis}) out of bounds") with pytest.raises(ValueError, match=msg): SparseArray(data).cumsum(axis=axis) @@ -1079,11 +1086,11 @@ def test_ufunc_args(self): @pytest.mark.parametrize("fill_value", [0.0, np.nan]) def test_modf(self, fill_value): # https://github.com/pandas-dev/pandas/issues/26946 - sparse = pd.SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) + sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) r1, r2 = np.modf(sparse) e1, e2 = np.modf(np.asarray(sparse)) - tm.assert_sp_array_equal(r1, pd.SparseArray(e1, fill_value=fill_value)) - tm.assert_sp_array_equal(r2, pd.SparseArray(e2, fill_value=fill_value)) + tm.assert_sp_array_equal(r1, SparseArray(e1, fill_value=fill_value)) + tm.assert_sp_array_equal(r2, SparseArray(e2, fill_value=fill_value)) def test_nbytes_integer(self): arr = SparseArray([1, 0, 0, 0, 2], kind="integer") @@ -1099,7 +1106,7 @@ def test_nbytes_block(self): assert result == 24 def test_asarray_datetime64(self): - s = pd.SparseArray(pd.to_datetime(["2012", None, None, "2013"])) + s = SparseArray(pd.to_datetime(["2012", None, None, "2013"])) np.asarray(s) def test_density(self): @@ -1201,7 +1208,7 @@ def test_first_fill_value_loc(arr, loc): ) @pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) def test_unique_na_fill(arr, fill_value): - a = pd.SparseArray(arr, fill_value=fill_value).unique() + a = SparseArray(arr, fill_value=fill_value).unique() b = pd.Series(arr).unique() assert isinstance(a, SparseArray) a = np.asarray(a) @@ -1240,12 +1247,3 @@ def test_map_missing(): result = arr.map({0: 10, 1: 11}) tm.assert_sp_array_equal(result, expected) - - -def test_deprecated_values(): - arr = SparseArray([0, 1, 2]) - - with tm.assert_produces_warning(FutureWarning): - result = arr.values - - tm.assert_numpy_array_equal(result, arr.to_dense()) diff --git a/pandas/tests/arrays/sparse/test_combine_concat.py b/pandas/tests/arrays/sparse/test_combine_concat.py index 4ad1aa60e7b4f..f1697dc9ff7ce 100644 --- a/pandas/tests/arrays/sparse/test_combine_concat.py +++ b/pandas/tests/arrays/sparse/test_combine_concat.py @@ -1,17 +1,17 @@ import numpy as np import pytest -import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray class TestSparseArrayConcat: @pytest.mark.parametrize("kind", ["integer", "block"]) def test_basic(self, kind): - a = pd.SparseArray([1, 0, 0, 2], kind=kind) - b = pd.SparseArray([1, 0, 2, 2], kind=kind) + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=kind) - result = pd.SparseArray._concat_same_type([a, b]) + result = SparseArray._concat_same_type([a, b]) # Can't make any assertions about the sparse index itself # since we aren't don't merge sparse blocs across arrays # in to_concat @@ -22,10 +22,10 @@ def test_basic(self, kind): @pytest.mark.parametrize("kind", ["integer", "block"]) def test_uses_first_kind(self, kind): other = "integer" if kind == "block" else "block" - a = pd.SparseArray([1, 0, 0, 2], kind=kind) - b = pd.SparseArray([1, 0, 2, 2], kind=other) + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=other) - result = pd.SparseArray._concat_same_type([a, b]) + result = SparseArray._concat_same_type([a, b]) expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind diff --git 
a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index aa8d2afca11e6..5e9e2d854f577 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -80,7 +82,9 @@ def test_not_equal(a, b): def test_construct_from_string_raises(): - with pytest.raises(TypeError): + with pytest.raises( + TypeError, match="Cannot construct a 'SparseDtype' from 'not a dtype'" + ): SparseDtype.construct_from_string("not a dtype") @@ -175,9 +179,20 @@ def test_update_dtype(original, dtype, expected): @pytest.mark.parametrize( - "original, dtype", - [(SparseDtype(float, np.nan), int), (SparseDtype(str, "abc"), int)], + "original, dtype, expected_error_msg", + [ + ( + SparseDtype(float, np.nan), + int, + re.escape("Cannot convert non-finite values (NA or inf) to integer"), + ), + ( + SparseDtype(str, "abc"), + int, + re.escape("invalid literal for int() with base 10: 'abc'"), + ), + ], ) -def test_update_dtype_raises(original, dtype): - with pytest.raises(ValueError): +def test_update_dtype_raises(original, dtype, expected_error_msg): + with pytest.raises(ValueError, match=expected_error_msg): original.update_dtype(dtype) diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py index a6836c58348b3..a2f861d378e67 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -7,8 +7,8 @@ import pandas.util._test_decorators as td from pandas import Series +import pandas._testing as tm from pandas.core.arrays.sparse import BlockIndex, IntIndex, _make_index -import pandas.util.testing as tm TEST_LENGTH = 20 @@ -596,6 +596,6 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"]) def test_op(self, opname): - sparse_op = getattr(splib, "sparse_{opname}_float64".format(opname=opname)) + sparse_op = getattr(splib, f"sparse_{opname}_float64") python_op = getattr(operator, opname) self._op_tests(sparse_op, python_op) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index efe2b4e0b2deb..33e68f029922e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -6,13 +6,25 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm + + +def test_repr(): + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype="string")}) + expected = " A\n0 a\n1 \n2 b" + assert repr(df) == expected + + expected = "0 a\n1 \n2 b\nName: A, dtype: string" + assert repr(df.A) == expected + + expected = "\n['a', , 'b']\nLength: 3, dtype: string" + assert repr(df.A.array) == expected def test_none_to_nan(): a = pd.arrays.StringArray._from_sequence(["a", None, "b"]) assert a[1] is not None - assert np.isnan(a[1]) + assert a[1] is pd.NA def test_setitem_validates(): @@ -24,6 +36,15 @@ def test_setitem_validates(): a[:] = np.array([1, 2]) +def test_setitem_with_scalar_string(): + # is_float_dtype considers some strings, like 'd', to be floats + # which can cause issues. 
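A minimal sketch of the StringDtype behaviour these string_ tests rely on: missing entries are the scalar pd.NA rather than np.nan or None, and render as <NA>:

import pandas as pd

arr = pd.array(["a", None, "b"], dtype="string")
assert arr[1] is pd.NA   # not np.nan, not None

s = pd.Series(arr, name="A")
print(s.dtype)           # string
print(s)                 # the missing entry renders as <NA>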
+ arr = pd.array(["a", "c"], dtype="string") + arr[0] = "d" + expected = pd.array(["d", "c"], dtype="string") + tm.assert_extension_array_equal(arr, expected) + + @pytest.mark.parametrize( "input, method", [ @@ -135,6 +156,37 @@ def test_add_frame(): tm.assert_frame_equal(result, expected) +def test_comparison_methods_scalar(all_compare_operators): + op_name = all_compare_operators + + a = pd.array(["a", None, "c"], dtype="string") + other = "a" + result = getattr(a, op_name)(other) + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_comparison_methods_array(all_compare_operators): + op_name = all_compare_operators + + a = pd.array(["a", None, "c"], dtype="string") + other = [None, None, "c"] + result = getattr(a, op_name)(other) + expected = np.empty_like(a, dtype="object") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + def test_constructor_raises(): with pytest.raises(ValueError, match="sequence of strings"): pd.arrays.StringArray(np.array(["a", "b"], dtype="S1")) @@ -171,3 +223,30 @@ def test_arrow_array(): arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + data = pd.array(["a", "b", None], dtype="string") + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "string" + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.StringDtype) + tm.assert_frame_equal(result, df) + # ensure the missing value is represented by NA and not np.nan or None + assert result.loc[2, "a"] is pd.NA + + +def test_value_counts_na(): + arr = pd.array(["a", "b", "a", pd.NA], dtype="string") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index e8d9ecfac61e4..b1b5a9482e34f 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -8,25 +8,34 @@ from pandas.core.dtypes.dtypes import registry import pandas as pd +import pandas._testing as tm from pandas.api.extensions import register_extension_dtype from pandas.api.types import is_scalar +from pandas.arrays import ( + BooleanArray, + DatetimeArray, + IntegerArray, + IntervalArray, + SparseArray, + StringArray, + TimedeltaArray, +) from pandas.core.arrays import PandasArray, integer_array, period_array from pandas.tests.extension.decimal import DecimalArray, DecimalDtype, to_decimal -import pandas.util.testing as tm @pytest.mark.parametrize( "data, dtype, expected", [ # Basic NumPy defaults. 
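To illustrate the comparison and value_counts semantics the new string tests assert, a short sketch; comparisons propagate pd.NA and come back with the nullable "boolean" dtype, and value_counts uses the nullable Int64 dtype:

import pandas as pd

arr = pd.array(["a", None, "c"], dtype="string")

print(arr == "a")                      # [True, <NA>, False], dtype: boolean
print(arr.value_counts(dropna=False))  # includes a pd.NA bucket
print(arr.value_counts(dropna=True))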
- ([1, 2], None, PandasArray(np.array([1, 2]))), + ([1, 2], None, IntegerArray._from_sequence([1, 2])), ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), ( [1, 2], np.dtype("float32"), PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), ), - (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), + (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2]),), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias @@ -45,37 +54,33 @@ ( [1, 2], np.dtype("datetime64[ns]"), - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype="datetime64[ns]") - ), + DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), ), ( np.array([1, 2], dtype="datetime64[ns]"), None, - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype="datetime64[ns]") - ), + DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), ), ( pd.DatetimeIndex(["2000", "2001"]), np.dtype("datetime64[ns]"), - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( pd.DatetimeIndex(["2000", "2001"]), None, - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( ["2000", "2001"], np.dtype("datetime64[ns]"), - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), # Datetime (tz-aware) ( ["2000", "2001"], pd.DatetimeTZDtype(tz="CET"), - pd.arrays.DatetimeArray._from_sequence( + DatetimeArray._from_sequence( ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") ), ), @@ -83,17 +88,17 @@ ( ["1H", "2H"], np.dtype("timedelta64[ns]"), - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), np.dtype("timedelta64[ns]"), - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), None, - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), # Category (["a", "b"], "category", pd.Categorical(["a", "b"])), @@ -106,13 +111,19 @@ ( [pd.Interval(1, 2), pd.Interval(3, 4)], "interval", - pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)]), + IntervalArray.from_tuples([(1, 2), (3, 4)]), ), # Sparse - ([0, 1], "Sparse[int64]", pd.SparseArray([0, 1], dtype="int64")), + ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")), # IntegerNA ([1, None], "Int16", integer_array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), + # String + (["a", None], "string", StringArray._from_sequence(["a", None])), + (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None]),), + # Boolean + ([True, None], "boolean", BooleanArray._from_sequence([True, None])), + ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None]),), # Index (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -139,15 +150,15 @@ def test_array(data, dtype, expected): def test_array_copy(): a = np.array([1, 2]) # default is to copy - b = pd.array(a) + b = pd.array(a, dtype=a.dtype) assert np.shares_memory(a, b._ndarray) is False # copy=True - b = pd.array(a, copy=True) + b = pd.array(a, dtype=a.dtype, copy=True) assert np.shares_memory(a, b._ndarray) is False # copy=False - b = pd.array(a, copy=False) + b = pd.array(a, 
dtype=a.dtype, copy=False) assert np.shares_memory(a, b._ndarray) is True @@ -163,31 +174,28 @@ def test_array_copy(): period_array(["2000", "2001"], freq="D"), ), # interval - ( - [pd.Interval(0, 1), pd.Interval(1, 2)], - pd.arrays.IntervalArray.from_breaks([0, 1, 2]), - ), + ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2]),), # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( np.array([1, 2], dtype="M8[ns]"), - pd.arrays.DatetimeArray(np.array([1, 2], dtype="M8[ns]")), + DatetimeArray(np.array([1, 2], dtype="M8[ns]")), ), ( np.array([1, 2], dtype="M8[us]"), - pd.arrays.DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")), + DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")), ), # datetimetz ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], - pd.arrays.DatetimeArray._from_sequence( + DatetimeArray._from_sequence( ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") ), ), @@ -196,21 +204,30 @@ def test_array_copy(): datetime.datetime(2000, 1, 1, tzinfo=cet), datetime.datetime(2001, 1, 1, tzinfo=cet), ], - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"], tz=cet), + DatetimeArray._from_sequence(["2000", "2001"], tz=cet), ), # timedelta ( [pd.Timedelta("1H"), pd.Timedelta("2H")], - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), ( np.array([1, 2], dtype="m8[ns]"), - pd.arrays.TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), + TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), ), ( np.array([1, 2], dtype="m8[us]"), - pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), + TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), ), + # integer + ([1, 2], IntegerArray._from_sequence([1, 2])), + ([1, None], IntegerArray._from_sequence([1, None])), + # string + (["a", "b"], StringArray._from_sequence(["a", "b"])), + (["a", None], StringArray._from_sequence(["a", None])), + # Boolean + ([True, False], BooleanArray._from_sequence([True, False])), + ([True, None], BooleanArray._from_sequence([True, None])), ], ) def test_array_inference(data, expected): @@ -241,7 +258,7 @@ def test_array_inference_fails(data): @pytest.mark.parametrize("data", [np.array([[1, 2], [3, 4]]), [[1, 2], [3, 4]]]) def test_nd_raises(data): with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"): - pd.array(data) + pd.array(data, dtype="int64") def test_scalar_raises(): @@ -260,6 +277,13 @@ class DecimalDtype2(DecimalDtype): @classmethod def construct_array_type(cls): + """ + Return the array type associated with this dtype. 
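The inference change exercised by the parametrizations above, in one short sketch: with no dtype, pd.array returns the nullable extension arrays, while an explicit NumPy dtype still yields a PandasArray wrapper:

import pandas as pd

print(pd.array([1, 2]).dtype)         # Int64
print(pd.array(["a", None]).dtype)    # string
print(pd.array([True, None]).dtype)   # boolean

print(type(pd.array([1, 2], dtype="float32")).__name__)  # PandasArray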
+ + Returns + ------- + type + """ return DecimalArray2 @@ -272,8 +296,9 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): return super()._from_sequence(scalars, dtype=dtype, copy=copy) -@pytest.mark.parametrize("box", [pd.Series, pd.Index]) -def test_array_unboxes(box): +def test_array_unboxes(index_or_series): + box = index_or_series + data = box([decimal.Decimal("1"), decimal.Decimal("2")]) # make sure it works with pytest.raises(TypeError): diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py new file mode 100644 index 0000000000000..cc8d0cdcb518d --- /dev/null +++ b/pandas/tests/arrays/test_boolean.py @@ -0,0 +1,881 @@ +import operator + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import BooleanArray +from pandas.core.arrays.boolean import coerce_to_array +from pandas.tests.extension.base import BaseOpsUtil + + +def make_data(): + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + + +@pytest.fixture +def dtype(): + return pd.BooleanDtype() + + +@pytest.fixture +def data(dtype): + return pd.array(make_data(), dtype=dtype) + + +def test_boolean_array_constructor(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.tolist(), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, mask.tolist()) + + with pytest.raises(TypeError, match="values should be boolean numpy array"): + BooleanArray(values.astype(int), mask) + + with pytest.raises(TypeError, match="mask should be boolean numpy array"): + BooleanArray(values, None) + + with pytest.raises(ValueError, match="values must be a 1D array"): + BooleanArray(values.reshape(1, -1), mask) + + with pytest.raises(ValueError, match="mask must be a 1D array"): + BooleanArray(values, mask.reshape(1, -1)) + + +def test_boolean_array_constructor_copy(): + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + result = BooleanArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = BooleanArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_boolean_array(): + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, False]) + ) + + result = pd.array([True, False, True], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True]), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + expected = BooleanArray( + np.array([True, False, True]), np.array([False, False, True]) + ) + + result = pd.array([True, False, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_all_none(): + 
expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True])) + + result = pd.array([None, None, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]), + ([True, np.nan], [True, None]), + ([True, pd.NA], [True, None]), + ([np.nan, np.nan], [None, None]), + (np.array([np.nan, np.nan], dtype=float), [None, None]), + ], +) +def test_to_boolean_array_missing_indicators(a, b): + result = pd.array(a, dtype="boolean") + expected = pd.array(b, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + # "foo", + [1, 2], + [1.0, 2.0], + pd.date_range("20130101", periods=2), + np.array(["foo"]), + np.array([1, 2]), + np.array([1.0, 2.0]), + [np.nan, {"a": 1}], + ], +) +def test_to_boolean_array_error(values): + # error in converting existing arrays to BooleanArray + with pytest.raises(TypeError): + pd.array(values, dtype="boolean") + + +def test_to_boolean_array_from_integer_array(): + result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1, 0, 1, None]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_from_float_array(): + result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_to_boolean_array_integer_like(): + # integers of 0's and 1's + result = pd.array([1, 0, 1, 0], dtype="boolean") + expected = pd.array([True, False, True, False], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + # with missing values + result = pd.array([1, 0, 1, None], dtype="boolean") + expected = pd.array([True, False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_coerce_to_array(): + # TODO this is currently not public API + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is values + assert result._mask is mask + result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True)) + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + assert result._data is not values + assert result._mask is not mask + + # mixed missing from values and mask + values = [True, False, None, False] + mask = np.array([False, False, False, True], dtype="bool") + result = BooleanArray(*coerce_to_array(values, mask=mask)) + expected = BooleanArray( + np.array([True, False, True, True]), np.array([False, False, True, True]) + ) + tm.assert_extension_array_equal(result, 
expected) + result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask)) + tm.assert_extension_array_equal(result, expected) + result = BooleanArray(*coerce_to_array(values, mask=mask.tolist())) + tm.assert_extension_array_equal(result, expected) + + # raise errors for wrong dimension + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + + with pytest.raises(ValueError, match="values must be a 1D list-like"): + coerce_to_array(values.reshape(1, -1)) + + with pytest.raises(ValueError, match="mask must be a 1D list-like"): + coerce_to_array(values, mask=mask.reshape(1, -1)) + + +def test_coerce_to_array_from_boolean_array(): + # passing BooleanArray to coerce_to_array + values = np.array([True, False, True, False], dtype="bool") + mask = np.array([False, False, False, True], dtype="bool") + arr = BooleanArray(values, mask) + result = BooleanArray(*coerce_to_array(arr)) + tm.assert_extension_array_equal(result, arr) + # no copy + assert result._data is arr._data + assert result._mask is arr._mask + + result = BooleanArray(*coerce_to_array(arr), copy=True) + tm.assert_extension_array_equal(result, arr) + assert result._data is not arr._data + assert result._mask is not arr._mask + + with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"): + coerce_to_array(arr, mask=mask) + + +def test_coerce_to_numpy_array(): + # with missing values -> object dtype + arr = pd.array([True, False, None], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # also with no missing values -> object dtype + arr = pd.array([True, False, True], dtype="boolean") + result = np.array(arr) + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # force bool dtype + result = np.array(arr, dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + # with missing values will raise error + arr = pd.array([True, False, None], dtype="boolean") + with pytest.raises(ValueError): + np.array(arr, dtype="bool") + + +def test_repr(): + df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) + expected = " A\n0 True\n1 False\n2 <NA>" + assert repr(df) == expected + + expected = "0 True\n1 False\n2 <NA>\nName: A, dtype: boolean" + assert repr(df.A) == expected + + expected = "<BooleanArray>\n[True, False, <NA>]\nLength: 3, dtype: boolean" + assert repr(df.A.array) == expected + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + # default (with or without missing values) -> object dtype + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype="str") + expected = np.array([True, False, pd.NA], dtype="<U5") + tm.assert_numpy_array_equal(result, expected) + + # no missing values -> can convert to bool, otherwise raises + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = 
con([True, False, None], dtype="boolean") + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): + result = arr.to_numpy(dtype="bool") + + # specify dtype and na_value + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([True, False, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([1, 0, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # converting to int or float without specifying na_value raises + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + arr.to_numpy(dtype="int64") + with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): + arr.to_numpy(dtype="float64") + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool) + result[0] = False + tm.assert_extension_array_equal( + arr, pd.array([False, False, True], dtype="boolean") + ) + + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool, copy=True) + result[0] = False + tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) + + +def test_astype(): + # with missing values + arr = pd.array([True, False, None], dtype="boolean") + + with pytest.raises(ValueError, match="cannot convert NA to integer"): + arr.astype("int64") + + with pytest.raises(ValueError, match="cannot convert float NaN to"): + arr.astype("bool") + + result = arr.astype("float64") + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("str") + expected = np.array(["True", "False", ""], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.astype("int64") + expected = np.array([1, 0, 1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("boolean") + tm.assert_extension_array_equal(result, arr) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, arr) + + +def test_astype_to_integer_array(): + # astype to IntegerArray + arr = pd.array([True, False, None], dtype="boolean") + + result = arr.astype("Int64") + expected = pd.array([1, 0, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("na", [None, np.nan, pd.NA]) +def test_setitem_missing_values(na): + arr = pd.array([True, False, None], dtype="boolean") + expected = pd.array([True, None, None], dtype="boolean") + arr[1] = na + tm.assert_extension_array_equal(arr, expected) + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] +) +def test_ufuncs_binary(ufunc): + # two BooleanArrays + a = 
pd.array([True, False, None], dtype="boolean") + result = ufunc(a, a) + expected = pd.array(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s, a) + expected = pd.Series(ufunc(a._data, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + # Boolean with numpy array + arr = np.array([True, True, False]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a._data, arr), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # BooleanArray with scalar + result = ufunc(a, True) + expected = pd.array(ufunc(a._data, True), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + result = ufunc(True, a) + expected = pd.array(ufunc(True, a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + # not handled types + with pytest.raises(TypeError): + ufunc(a, "test") + + +@pytest.mark.parametrize("ufunc", [np.logical_not]) +def test_ufuncs_unary(ufunc): + a = pd.array([True, False, None], dtype="boolean") + result = ufunc(a) + expected = pd.array(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(ufunc(a._data), dtype="boolean") + expected[a._mask] = np.nan + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("values", [[True, False], [True, None]]) +def test_ufunc_reduce_raises(values): + a = pd.array(values, dtype="boolean") + with pytest.raises(NotImplementedError): + np.add.reduce(a) + + +class TestLogicalOps(BaseOpsUtil): + def test_numpy_scalars_ok(self, all_logical_operators): + a = pd.array([True, False, None], dtype="boolean") + op = getattr(a, all_logical_operators) + + tm.assert_extension_array_equal(op(True), op(np.bool(True))) + tm.assert_extension_array_equal(op(False), op(np.bool(False))) + + def get_op_from_name(self, op_name): + short_opname = op_name.strip("_") + short_opname = short_opname if "xor" in short_opname else short_opname + "_" + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op + + def test_empty_ok(self, all_logical_operators): + a = pd.array([], dtype="boolean") + op_name = all_logical_operators + result = getattr(a, op_name)(True) + tm.assert_extension_array_equal(a, result) + + result = getattr(a, op_name)(False) + tm.assert_extension_array_equal(a, result) + + # TODO: pd.NA + # result = getattr(a, op_name)(pd.NA) + # tm.assert_extension_array_equal(a, result) + + def test_logical_length_mismatch_raises(self, all_logical_operators): + op_name = all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Lengths must match to compare" + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)([True, False]) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(np.array([True, False])) + + with pytest.raises(ValueError, match=msg): + getattr(a, op_name)(pd.array([True, False], dtype="boolean")) + + def test_logical_nan_raises(self, all_logical_operators): + op_name = 
all_logical_operators + a = pd.array([True, False, None], dtype="boolean") + msg = "Got float instead" + + with pytest.raises(TypeError, match=msg): + getattr(a, op_name)(np.nan) + + @pytest.mark.parametrize("other", ["a", 1]) + def test_non_bool_or_na_other_raises(self, other, all_logical_operators): + a = pd.array([True, False], dtype="boolean") + with pytest.raises(TypeError, match=str(type(other).__name__)): + getattr(a, all_logical_operators)(other) + + def test_kleene_or(self): + # A clear test of behavior. + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a | b + expected = pd.array( + [True, True, True, True, False, None, True, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [True, None, None]), + (True, [True, True, True]), + (np.bool_(True), [True, True, True]), + (False, [True, False, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_or_scalar(self, other, expected): + # TODO: test True & False + a = pd.array([True, False, None], dtype="boolean") + result = a | other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other | a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_and(self): + # A clear test of behavior. 
+ a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a & b + expected = pd.array( + [True, False, None, False, False, False, None, False, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, False, None]), + (True, [True, False, None]), + (False, [False, False, False]), + (np.bool_(True), [True, False, None]), + (np.bool_(False), [False, False, False]), + ], + ) + def test_kleene_and_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a & other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other & a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_kleene_xor(self): + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + result = a ^ b + expected = pd.array( + [False, True, None, True, False, None, None, None, None], dtype="boolean" + ) + tm.assert_extension_array_equal(result, expected) + + result = b ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + @pytest.mark.parametrize( + "other, expected", + [ + (pd.NA, [None, None, None]), + (True, [False, True, None]), + (np.bool_(True), [False, True, None]), + (np.bool_(False), [True, False, None]), + ], + ) + def test_kleene_xor_scalar(self, other, expected): + a = pd.array([True, False, None], dtype="boolean") + result = a ^ other + expected = pd.array(expected, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + result = other ^ a + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + @pytest.mark.parametrize( + "other", [True, False, pd.NA, [True, False, None] * 3], + ) + def test_no_masked_assumptions(self, other, all_logical_operators): + # The logical operations should not assume that masked values are False! 
+ a = pd.arrays.BooleanArray( + np.array([True, True, True, False, False, False, True, False, True]), + np.array([False] * 6 + [True, True, True]), + ) + b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + if isinstance(other, list): + other = pd.array(other, dtype="boolean") + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) + + if isinstance(other, BooleanArray): + other._data[other._mask] = True + a._data[a._mask] = False + + result = getattr(a, all_logical_operators)(other) + expected = getattr(b, all_logical_operators)(other) + tm.assert_extension_array_equal(result, expected) + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + # propagate NAs + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = pd.Series(data._data) + expected = op(expected, other) + expected = expected.astype("boolean") + # propagate NAs + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + def test_compare_scalar(self, data, all_compare_operators): + op_name = all_compare_operators + self._compare_other(data, op_name, True) + + def test_compare_array(self, data, all_compare_operators): + op_name = all_compare_operators + other = pd.array([True] * len(data), dtype="boolean") + self._compare_other(data, op_name, other) + other = np.array([True] * len(data)) + self._compare_other(data, op_name, other) + other = pd.Series([True] * len(data)) + self._compare_other(data, op_name, other) + + @pytest.mark.parametrize("other", [True, False, pd.NA]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None], dtype="boolean") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True, False, None], dtype="boolean") + ) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + b = pd.array([True, False, None] * 3, dtype="boolean") + + result = op(a, b) + + values = op(a._data, b._data) + mask = a._mask | b._mask + expected = BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = None + tm.assert_extension_array_equal( + a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") + ) + tm.assert_extension_array_equal( + b, pd.array([True, False, None] * 3, dtype="boolean") + ) + + +class TestArithmeticOps(BaseOpsUtil): + def test_error(self, data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + opa = getattr(data, op) + + # invalid scalars + with pytest.raises(TypeError): + ops("foo") + with pytest.raises(TypeError): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + if op not in ("__mul__", "__rmul__"): + # 
TODO(extension) numpy's mul with object array sees booleans as numbers + with pytest.raises(TypeError): + ops(pd.Series("foo", index=s.index)) + + # 2d + result = opa(pd.DataFrame({"A": s})) + assert result is NotImplemented + + with pytest.raises(NotImplementedError): + opa(np.arange(len(s)).reshape(-1, len(s))) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_reductions_return_types(dropna, data, all_numeric_reductions): + op = all_numeric_reductions + s = pd.Series(data) + if dropna: + s = s.dropna() + + if op in ("sum", "prod"): + assert isinstance(getattr(s, op)(), np.int64) + elif op in ("min", "max"): + assert isinstance(getattr(s, op)(), np.bool_) + else: + # "mean", "std", "var", "median", "kurt", "skew" + assert isinstance(getattr(s, op)(), np.float64) + + +@pytest.mark.parametrize( + "values, exp_any, exp_all, exp_any_noskip, exp_all_noskip", + [ + ([True, pd.NA], True, True, True, pd.NA), + ([False, pd.NA], False, False, pd.NA, False), + ([pd.NA], False, True, pd.NA, pd.NA), + ([], False, True, False, True), + ], +) +def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip): + # the methods return numpy scalars + exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any) + exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all) + exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip) + exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip) + + for con in [pd.array, pd.Series]: + a = con(values, dtype="boolean") + assert a.any() is exp_any + assert a.all() is exp_all + assert a.any(skipna=False) is exp_any_noskip + assert a.all(skipna=False) is exp_all_noskip + + assert np.any(a.any()) is exp_any + assert np.all(a.all()) is exp_all + + +# TODO when BooleanArray coerces to object dtype numpy array, need to do conversion +# manually in the indexing code +# def test_indexing_boolean_mask(): +# arr = pd.array([1, 2, 3, 4], dtype="Int64") +# mask = pd.array([True, False, True, False], dtype="boolean") +# result = arr[mask] +# expected = pd.array([1, 3], dtype="Int64") +# tm.assert_extension_array_equal(result, expected) + +# # missing values -> error +# mask = pd.array([True, False, True, None], dtype="boolean") +# with pytest.raises(IndexError): +# result = arr[mask] + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + + # TODO use to_numpy(na_value=None) here + data_object = np.array(data, dtype=object) + data_object[data.isna()] = None + expected = pa.array(data_object, type=pa.bool_(), from_pandas=True) + assert arr.equals(expected) + + +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + data = pd.array([True, False, None], dtype="boolean") + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "bool" + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.BooleanDtype) + tm.assert_frame_equal(result, df) + + +def test_value_counts_na(): + arr = pd.array([True, False, pd.NA], dtype="boolean") + result = arr.value_counts(dropna=False) + expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 117a19acbfc3a..fa45db93c6102 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,13 +4,14 @@ import pytest from pandas._libs import OutOfBoundsDatetime +from pandas.compat.numpy import _np_version_under1p18 import pandas as pd +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -import pandas.util.testing as tm # TODO: more freq variants @@ -40,8 +41,8 @@ def datetime_index(request): """ freqstr = request.param # TODO: non-monotone indexes; NaTs, different start dates, timezones - pi = pd.date_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) - return pi + dti = pd.date_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) + return dti @pytest.fixture @@ -57,7 +58,7 @@ def timedelta_index(request): class SharedTests: - index_cls = None # type: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] + index_cls: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]] def test_compare_len1_raises(self): # make sure we raise when comparing with different lengths, specific @@ -225,6 +226,19 @@ def test_setitem_raises(self): with pytest.raises(TypeError, match="'value' should be a.* 'object'"): arr[0] = object() + def test_inplace_arithmetic(self): + # GH#24115 check that iadd and isub are actually in-place + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + + expected = arr + pd.Timedelta(days=1) + arr += pd.Timedelta(days=1) + tm.assert_equal(arr, expected) + + expected = arr - pd.Timedelta(days=1) + arr -= pd.Timedelta(days=1) + tm.assert_equal(arr, expected) + class TestDatetimeArray(SharedTests): index_cls = pd.DatetimeIndex @@ -473,7 +487,15 @@ def test_strftime(self, datetime_index): arr = DatetimeArray(datetime_index) result = arr.strftime("%Y %b") - expected = np.array(datetime_index.strftime("%Y %b")) + expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_nat(self): + # GH 29578 + arr = DatetimeArray(DatetimeIndex(["2019-01-01", pd.NaT])) + + result = arr.strftime("%Y-%m-%d") + expected = np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -679,7 +701,15 @@ def test_strftime(self, period_index): arr = PeriodArray(period_index) result = arr.strftime("%Y") - expected = np.array(period_index.strftime("%Y")) + expected = np.array([per.strftime("%Y") for per in arr], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_strftime_nat(self): + # GH 29578 + arr = PeriodArray(PeriodIndex(["2019-01-01", pd.NaT], dtype="period[D]")) + + result = arr.strftime("%Y-%m-%d") + expected = np.array(["2019-01-01", np.nan], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -729,3 +759,38 @@ def test_invalid_nat_setitem_array(array, non_casting_nats): for nat in non_casting_nats: with pytest.raises(TypeError): array[0] = nat + + +@pytest.mark.parametrize( + "array", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("2000", periods=4).array, + ], +) +def test_to_numpy_extra(array): + if _np_version_under1p18: + # np.isnan(NaT) raises, so use pandas' + isnan = pd.isna + else: + isnan = np.isnan 
+ + array[0] = pd.NaT + original = array.copy() + + result = array.to_numpy() + assert isnan(result[0]) + + result = array.to_numpy(dtype="int64") + assert result[0] == -9223372036854775808 + + result = array.to_numpy(dtype="int64", na_value=0) + assert result[0] == 0 + + result = array.to_numpy(na_value=array[1].to_numpy()) + assert result[0] == result[1] + + result = array.to_numpy(na_value=array[1].to_numpy(copy=False)) + assert result[0] == result[1] + + tm.assert_equal(array, original) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index c3cda22497ecb..5608ab5fbd9db 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -9,9 +9,9 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import sequence_to_dt64ns -import pandas.util.testing as tm class TestDatetimeArrayConstructor: @@ -24,8 +24,8 @@ def test_only_1dim_accepted(self): arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]") with pytest.raises(ValueError, match="Only 1-dimensional"): - # 2-dim - DatetimeArray(arr.reshape(2, 2)) + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + DatetimeArray(arr.reshape(2, 2, 1)) with pytest.raises(ValueError, match="Only 1-dimensional"): # 0-dim @@ -173,7 +173,7 @@ def test_tz_setter_raises(self): def test_setitem_different_tz_raises(self): data = np.array([1, 2, 3], dtype="M8[ns]") arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")) - with pytest.raises(ValueError, match="None"): + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): arr[0] = pd.Timestamp("2000") with pytest.raises(ValueError, match="US/Central"): @@ -282,6 +282,77 @@ def test_array_interface(self): ) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_different_tz(self, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D").tz_localize("Asia/Tokyo") + if index: + arr = pd.Index(arr) + + expected = arr.searchsorted(arr[2]) + result = arr.searchsorted(arr[2].tz_convert("UTC")) + assert result == expected + + expected = arr.searchsorted(arr[2:6]) + result = arr.searchsorted(arr[2:6].tz_convert("UTC")) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_tzawareness_compat(self, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D") + if index: + arr = pd.Index(arr) + + mismatch = arr.tz_localize("Asia/Tokyo") + + msg = "Cannot compare tz-naive and tz-aware datetime-like objects" + with pytest.raises(TypeError, match=msg): + arr.searchsorted(mismatch[0]) + with pytest.raises(TypeError, match=msg): + arr.searchsorted(mismatch) + + with pytest.raises(TypeError, match=msg): + mismatch.searchsorted(arr[0]) + with pytest.raises(TypeError, match=msg): + mismatch.searchsorted(arr) + + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + 1.0, + np.timedelta64("NaT"), + pd.Timedelta(days=2), + "invalid", + np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9, + np.arange(10).view("timedelta64[ns]") * 24 * 3600 * 10 ** 9, + pd.Timestamp.now().to_period("D"), + ], + ) + @pytest.mark.parametrize( + "index", + [ + True, + pytest.param( + False, + marks=pytest.mark.xfail( + reason="Raises ValueError instead of TypeError", 
raises=ValueError + ), + ), + ], + ) + def test_searchsorted_invalid_types(self, other, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D") + if index: + arr = pd.Index(arr) + + msg = "searchsorted requires compatible dtype or scalar" + with pytest.raises(TypeError, match=msg): + arr.searchsorted(other) + class TestSequenceToDT64NS: def test_tz_dtype_mismatch_raises(self): diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 793de66767cc3..0c8980c43c370 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -6,6 +6,7 @@ from pandas.core.dtypes.generic import ABCIndexClass import pandas as pd +import pandas._testing as tm from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar from pandas.core.arrays import IntegerArray, integer_array from pandas.core.arrays.integer import ( @@ -19,7 +20,6 @@ UInt64Dtype, ) from pandas.tests.extension.base import BaseOpsUtil -import pandas.util.testing as tm def make_data(): @@ -90,7 +90,7 @@ def test_repr_dtype(dtype, expected): def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = "<IntegerArray>\n[1, NaN, 3]\nLength: 3, dtype: Int64" + expected = "<IntegerArray>\n[1, <NA>, 3]\nLength: 3, dtype: Int64" assert result == expected @@ -98,9 +98,9 @@ def test_repr_array_long(): data = integer_array([1, 2, None] * 1000) expected = ( "<IntegerArray>\n" - "[ 1, 2, NaN, 1, 2, NaN, 1, 2, NaN, 1,\n" + "[ 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>, 1,\n" " ...\n" - " NaN, 1, 2, NaN, 1, 2, NaN, 1, 2, NaN]\n" + " <NA>, 1, 2, <NA>, 1, 2, <NA>, 1, 2, <NA>]\n" "Length: 3000, dtype: Int64" ) result = repr(data) @@ -108,13 +108,19 @@ class TestConstructors: + def test_uses_pandas_na(self): + a = pd.array([1, None], dtype=pd.Int64Dtype()) + assert a[1] is pd.NA + def test_from_dtype_from_float(self, data): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(np.array(data).astype("float"), dtype=str(dtype)) + result = pd.Series( + data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype) + ) tm.assert_series_equal(result, expected) # from int / list @@ -156,10 +162,13 @@ def _check_op(self, s, op_name, other, exc=None): # 1 ** na is na, so need to unmask those if op_name == "__pow__": - mask = np.where(s == 1, False, mask) + mask = np.where(~s.isna() & (s == 1), False, mask) elif op_name == "__rpow__": - mask = np.where(other == 1, False, mask) + other_is_one = other == 1 + if isinstance(other_is_one, pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) # float result type or float op if ( @@ -193,7 +202,7 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): # to compare properly, we convert the expected # to float, mask to nans and convert infs # if we have uints then we process as uints - # then conert to float + # then convert to float # and we ultimately want to create a IntArray # for comparisons @@ -208,20 +217,27 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): else: expected = expected.fillna(0) else: - expected[(s.values == 0) & ((expected == 0) | expected.isna())] = 0 + expected[ + (s.values == 0).fillna(False) + & ((expected == 0).fillna(False) | expected.isna()) + ] = 0 try: - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = 
expected.astype(s.dtype) except ValueError: expected = expected.astype(float) - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) - expected[mask] = np.nan + expected[mask] = pd.NA # assert that the expected astype is ok # (skip for unsigned as they have wrap around) @@ -255,21 +271,18 @@ def test_arith_integer_array(self, data, all_arithmetic_operators): def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # scalar op = all_arithmetic_operators - s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators - df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op = all_arithmetic_operators - s = pd.Series(data) other = np.ones(len(s), dtype=s.dtype.type) self._check_op(s, op, other, exc=TypeError) @@ -339,16 +352,61 @@ def test_error(self, data, all_arithmetic_operators): with pytest.raises(NotImplementedError): opa(np.arange(len(s)).reshape(-1, len(s))) - def test_pow(self): - # https://github.com/pandas-dev/pandas/issues/22022 - a = integer_array([1, np.nan, np.nan, 1]) - b = integer_array([1, np.nan, 1, np.nan]) + @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) + def test_divide_by_zero(self, zero, negative): + # https://github.com/pandas-dev/pandas/issues/27398 + a = pd.array([0, 1, -1, None], dtype="Int64") + result = a / zero + expected = np.array([np.nan, np.inf, -np.inf, np.nan]) + if negative: + expected *= -1 + tm.assert_numpy_array_equal(result, expected) + + def test_pow_scalar(self): + a = pd.array([0, 1, None, 2], dtype="Int64") + result = a ** 0 + expected = pd.array([1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** 1 + expected = pd.array([0, 1, None, 2], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** pd.NA + expected = pd.array([None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** np.nan + expected = np.array([np.nan, 1, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # reversed + result = 0 ** a + expected = pd.array([1, 0, None, 0], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = 1 ** a + expected = pd.array([1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = np.nan ** a + expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + def test_pow_array(self): + a = integer_array([0, 0, 0, 1, 1, 1, None, None, None]) + b = integer_array([0, 1, None, 0, 1, None, 0, 1, None]) result = a ** b - expected = pd.core.arrays.integer_array([1, np.nan, np.nan, 1]) + expected = integer_array([1, 0, None, 1, 1, 1, 1, None, None]) tm.assert_extension_array_equal(result, expected) def test_rpow_one_to_na(self): # https://github.com/pandas-dev/pandas/issues/22022 + # https://github.com/pandas-dev/pandas/issues/29997 arr = integer_array([np.nan, np.nan]) result = np.array([1.0, 2.0]) ** arr expected = 
np.array([1.0, np.nan]) @@ -361,10 +419,10 @@ def _compare_other(self, data, op_name, other): # array result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -372,22 +430,87 @@ def _compare_other(self, data, op_name, other): s = pd.Series(data) result = op(s, other) - expected = pd.Series(data._data) - expected = op(expected, other) + expected = op(pd.Series(data._data), other) # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA + expected = expected.astype("boolean") tm.assert_series_equal(result, expected) - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - self._compare_other(data, op_name, 0) + @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([1, 0, None], dtype="Int64") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, 2, None, None, None], dtype="Int64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask + + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Int64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") + ) + + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64") + other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + + def test_no_shared_mask(self, data): + result = data + 1 + assert np.shares_memory(result._mask, data._mask) is False - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - other = pd.Series([0] * len(data)) - self._compare_other(data, op_name, other) + def test_compare_to_string(self, any_nullable_int_dtype): + # GH 28930 + s = pd.Series([1, None], dtype=any_nullable_int_dtype) + result = s == "a" + expected = pd.Series([False, pd.NA], dtype="boolean") + + self.assert_series_equal(result, expected) + + def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators): + # GH 28930 + s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype) + s2 = pd.Series([1, None, 3], dtype="float") + + method = getattr(s1, all_compare_operators) + result = method(2) + + method = getattr(s2, all_compare_operators) + expected = method(2).astype("boolean") + 
expected[s2.isna()] = pd.NA + + self.assert_series_equal(result, expected) class TestCasting: @@ -473,6 +596,17 @@ def test_astype(self, all_data): expected = pd.Series(np.asarray(mixed)) tm.assert_series_equal(result, expected) + def test_astype_to_larger_numpy(self): + a = pd.array([1, 2], dtype="Int32") + result = a.astype("int64") + expected = np.array([1, 2], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + a = pd.array([1, 2], dtype="UInt32") + result = a.astype("uint64") + expected = np.array([1, 2], dtype="uint64") + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) def test_astype_specific_casting(self, dtype): s = pd.Series([1, 2, 3], dtype="Int64") @@ -502,12 +636,54 @@ def test_construct_cast_invalid(self, dtype): with pytest.raises(TypeError, match=msg): pd.Series(arr).astype(dtype) + @pytest.mark.parametrize("in_series", [True, False]) + def test_to_numpy_na_nan(self, in_series): + a = pd.array([0, 1, None], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.0, 1.0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="int64", na_value=-1) + expected = np.array([0, 1, -1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="bool", na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("in_series", [True, False]) + @pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) + def test_to_numpy_dtype(self, dtype, in_series): + a = pd.array([0, 1], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) + def test_to_numpy_na_raises(self, dtype): + a = pd.array([0, 1, None], dtype="Int64") + with pytest.raises(ValueError, match=dtype): + a.to_numpy(dtype=dtype) + + def test_astype_str(self): + a = pd.array([1, 2, None], dtype="Int64") + expected = np.array(["1", "2", "<NA>"], dtype=object) + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) + def test_frame_repr(data_missing): df = pd.DataFrame({"A": data_missing}) result = repr(df) - expected = " A\n0 NaN\n1 1" + expected = " A\n0 <NA>\n1 1" assert result == expected @@ -523,7 +699,7 @@ def test_conversions(data_missing): # we assert that we are exactly equal # including type conversions of scalars result = df["A"].astype("object").values - expected = np.array([np.nan, 1], dtype=object) + expected = np.array([pd.NA, 1], dtype=object) tm.assert_numpy_array_equal(result, expected) for r, e in zip(result, expected): @@ -686,7 +862,7 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) result = (df.A + df.C) * 3 == 12 - expected = pd.Series([False, True, False]) + expected = pd.Series([False, True, None], dtype="boolean") tm.assert_series_equal(result, expected) result = df.A + df.B @@ -750,7 +926,7 @@ def test_reduce_to_float(op): def test_astype_nansafe(): # see gh-22343 arr = integer_array([np.nan, 1, 2], dtype="Int8") - msg = "cannot convert float NaN to integer" + msg = "cannot convert to 'uint32'-dtype NumPy array with missing values." 
with pytest.raises(ValueError, match=msg): arr.astype("uint32") @@ -825,10 +1001,55 @@ def test_arrow_array(data): import pyarrow as pa arr = pa.array(data) - expected = pa.array(list(data), type=data.dtype.name.lower(), from_pandas=True) + expected = np.array(data, dtype=object) + expected[data.isna()] = None + expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) assert arr.equals(expected) +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_roundtrip(data): + # roundtrip possible from arrow 1.0.0 + import pyarrow as pa + + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + result = table.to_pandas() + tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize( + "pandasmethname, kwargs", + [ + ("var", {"ddof": 0}), + ("var", {"ddof": 1}), + ("kurtosis", {}), + ("skew", {}), + ("sem", {}), + ], +) +def test_stat_method(pandasmethname, kwargs): + s = pd.Series(data=[1, 2, 3, 4, 5, 6, np.nan, np.nan], dtype="Int64") + pandasmeth = getattr(s, pandasmethname) + result = pandasmeth(**kwargs) + s2 = pd.Series(data=[1, 2, 3, 4, 5, 6], dtype="Int64") + pandasmeth = getattr(s2, pandasmethname) + expected = pandasmeth(**kwargs) + assert expected == result + + +def test_value_counts_na(): + arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") + tm.assert_series_equal(result, expected) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 7a150c35fea09..86793c4ec50dd 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -6,9 +6,9 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.arrays import PandasArray from pandas.core.arrays.numpy_ import PandasDtype -import pandas.util.testing as tm @pytest.fixture( @@ -226,3 +226,25 @@ def test_setitem_no_coercion(): arr = PandasArray(np.array([1, 2, 3])) with pytest.raises(ValueError, match="int"): arr[0] = "a" + + # With a value that we do coerce, check that we coerce the value + # and not the underlying array. 
+ arr[0] = 2.5 + assert isinstance(arr[0], (int, np.integer)), type(arr[0]) + + +def test_setitem_preserves_views(): + # GH#28150, see also extension test of the same name + arr = PandasArray(np.array([1, 2, 3])) + view1 = arr.view() + view2 = arr[:] + view3 = np.asarray(arr) + + arr[0] = 9 + assert view1[0] == 9 + assert view2[0] == 9 + assert view3[0] == 9 + + arr[-1] = 2.5 + view1[-1] = 5 + assert arr[-1] == 5 diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 252f278242fcc..1f4351c7e20ee 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -3,12 +3,13 @@ from pandas._libs.tslibs import iNaT from pandas._libs.tslibs.period import IncompatibleFrequency +import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import PeriodDtype, registry import pandas as pd +import pandas._testing as tm from pandas.core.arrays import PeriodArray, period_array -import pandas.util.testing as tm # ---------------------------------------------------------------------------- # Dtype @@ -323,3 +324,91 @@ def test_min_max_empty(self, skipna): result = arr.max(skipna=skipna) assert result is pd.NaT + + +# ---------------------------------------------------------------------------- +# Arrow interaction + +pyarrow_skip = pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.15.1.dev") + + +@pyarrow_skip +def test_arrow_extension_type(): + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + p1 = ArrowPeriodType("D") + p2 = ArrowPeriodType("D") + p3 = ArrowPeriodType("M") + + assert p1.freq == "D" + assert p1 == p2 + assert not p1 == p3 + assert hash(p1) == hash(p2) + assert not hash(p1) == hash(p3) + + +@pyarrow_skip +@pytest.mark.parametrize( + "data, freq", + [ + (pd.date_range("2017", periods=3), "D"), + (pd.date_range("2017", periods=3, freq="A"), "A-DEC"), + ], +) +def test_arrow_array(data, freq): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + periods = period_array(data, freq=freq) + result = pa.array(periods) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == freq + expected = pa.array(periods.asi8, type="int64") + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(periods, type=pa.int64()) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError): + pa.array(periods, type="float64") + + with pytest.raises(TypeError, match="different 'freq'"): + pa.array(periods, type=ArrowPeriodType("T")) + + +@pyarrow_skip +def test_arrow_array_missing(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], freq="D") + arr[1] = pd.NaT + + result = pa.array(arr) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == "D" + expected = pa.array([1, None, 3], type="int64") + assert result.storage.equals(expected) + + +@pyarrow_skip +def test_arrow_table_roundtrip(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], freq="D") + arr[1] = pd.NaT + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowPeriodType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 42e7bee97e671..62cb4766171a4 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -2,8 +2,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.arrays import TimedeltaArray -import pandas.util.testing as tm class TestTimedeltaArrayConstructor: @@ -12,8 +12,8 @@ def test_only_1dim_accepted(self): arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]") with pytest.raises(ValueError, match="Only 1-dimensional"): - # 2-dim - TimedeltaArray(arr.reshape(2, 2)) + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + TimedeltaArray(arr.reshape(2, 2, 1)) with pytest.raises(ValueError, match="Only 1-dimensional"): # 0-dim @@ -41,13 +41,12 @@ def test_other_type_raises(self): def test_incorrect_dtype_raises(self): # TODO: why TypeError for 'category' but ValueError for i8? with pytest.raises( - ValueError, match=r"category cannot be converted " r"to timedelta64\[ns\]" + ValueError, match=r"category cannot be converted to timedelta64\[ns\]" ): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category") with pytest.raises( - ValueError, - match=r"dtype int64 cannot be converted " r"to timedelta64\[ns\]", + ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]", ): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64")) @@ -141,6 +140,42 @@ def test_setitem_objects(self, obj): arr[0] = obj assert arr[0] == pd.Timedelta(seconds=1) + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + 1.0, + np.datetime64("NaT"), + pd.Timestamp.now(), + "invalid", + np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9, + (np.arange(10) * 24 * 3600 * 10 ** 9).view("datetime64[ns]"), + pd.Timestamp.now().to_period("D"), + ], + ) + @pytest.mark.parametrize( + "index", + [ + True, + pytest.param( + False, + marks=pytest.mark.xfail( + reason="Raises ValueError instead of TypeError", raises=ValueError + ), + ), + ], + ) + def test_searchsorted_invalid_types(self, other, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = TimedeltaArray(data, freq="D") + if index: + arr = pd.Index(arr) + + msg = "searchsorted requires compatible dtype or scalar" + with pytest.raises(TypeError, match=msg): + arr.searchsorted(other) + class TestReductions: @pytest.mark.parametrize("name", ["sum", "std", "min", "max", "median"]) diff --git a/pandas/tests/base/__init__.py b/pandas/tests/base/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py new file mode 100644 index 0000000000000..0b7274399aafc --- /dev/null +++ b/pandas/tests/base/test_constructors.py @@ -0,0 +1,142 @@ +from datetime import datetime +import sys + +import numpy as np +import pytest + +from pandas.compat import PYPY + +import pandas as pd +from pandas import DataFrame, Index, Series +import pandas._testing as tm +from pandas.core.accessor import PandasDelegate +from pandas.core.base import NoNewAttributesMixin, PandasObject + + +class TestPandasDelegate: + class Delegator: + _properties = ["foo"] + _methods = ["bar"] + + def _set_foo(self, value): + self.foo = value + + def _get_foo(self): + return self.foo + + foo = property(_get_foo, _set_foo, doc="foo property") + + def bar(self, *args, **kwargs): + """ a test bar method """ + pass + + class Delegate(PandasDelegate, 
PandasObject): + def __init__(self, obj): + self.obj = obj + + def setup_method(self, method): + pass + + def test_invalid_delegation(self): + # these show that in order for the delegation to work + # the _delegate_* methods need to be overridden to not raise + # a TypeError + + self.Delegate._add_delegate_accessors( + delegate=self.Delegator, + accessors=self.Delegator._properties, + typ="property", + ) + self.Delegate._add_delegate_accessors( + delegate=self.Delegator, accessors=self.Delegator._methods, typ="method" + ) + + delegate = self.Delegate(self.Delegator()) + + with pytest.raises(TypeError): + delegate.foo + + with pytest.raises(TypeError): + delegate.foo = 5 + + with pytest.raises(TypeError): + delegate.foo() + + @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") + def test_memory_usage(self): + # Delegate does not implement memory_usage. + # Check that we fall back to in-built `__sizeof__` + # GH 12924 + delegate = self.Delegate(self.Delegator()) + sys.getsizeof(delegate) + + +class TestNoNewAttributesMixin: + def test_mixin(self): + class T(NoNewAttributesMixin): + pass + + t = T() + assert not hasattr(t, "__frozen") + + t.a = "test" + assert t.a == "test" + + t._freeze() + assert "__frozen" in dir(t) + assert getattr(t, "__frozen") + + with pytest.raises(AttributeError): + t.b = "test" + + assert not hasattr(t, "b") + + +class TestConstruction: + # test certain constructor behaviours on dtype inference across Series, + # Index and DataFrame + + @pytest.mark.parametrize( + "klass", + [ + Series, + lambda x, **kwargs: DataFrame({"a": x}, **kwargs)["a"], + pytest.param( + lambda x, **kwargs: DataFrame(x, **kwargs)[0], marks=pytest.mark.xfail + ), + Index, + ], + ) + @pytest.mark.parametrize( + "a", + [ + np.array(["2263-01-01"], dtype="datetime64[D]"), + np.array([datetime(2263, 1, 1)], dtype=object), + np.array([np.datetime64("2263-01-01", "D")], dtype=object), + np.array(["2263-01-01"], dtype=object), + ], + ids=[ + "datetime64[D]", + "object-datetime.datetime", + "object-numpy-scalar", + "object-string", + ], + ) + def test_constructor_datetime_outofbound(self, a, klass): + # GH-26853 (+ bug GH-26206 out of bound non-ns unit) + + # No dtype specified (dtype inference) + # datetime64[non-ns] raise error, other cases result in object dtype + # and preserve original data + if a.dtype.kind == "M": + with pytest.raises(pd.errors.OutOfBoundsDatetime): + klass(a) + else: + result = klass(a) + assert result.dtype == "object" + tm.assert_numpy_array_equal(result.to_numpy(), a) + + # Explicit dtype specified + # Forced conversion fails for all -> all cases raise error + with pytest.raises(pd.errors.OutOfBoundsDatetime): + klass(a, dtype="datetime64[ns]") diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py new file mode 100644 index 0000000000000..07a15d0619bb6 --- /dev/null +++ b/pandas/tests/base/test_conversion.py @@ -0,0 +1,439 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_datetime64_dtype, is_timedelta64_dtype +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +from pandas import CategoricalIndex, Series, Timedelta, Timestamp +import pandas._testing as tm +from pandas.core.arrays import ( + DatetimeArray, + IntervalArray, + PandasArray, + PeriodArray, + SparseArray, + TimedeltaArray, +) + + +class TestToIterable: + # test that we convert an iterable to python types + + dtypes = [ + ("int8", int), + ("int16", int), + ("int32", int), + ("int64", int), + ("uint8", int), + 
("uint16", int), + ("uint32", int), + ("uint64", int), + ("float16", float), + ("float32", float), + ("float64", float), + ("datetime64[ns]", Timestamp), + ("datetime64[ns, US/Eastern]", Timestamp), + ("timedelta64[ns]", Timedelta), + ] + + @pytest.mark.parametrize("dtype, rdtype", dtypes) + @pytest.mark.parametrize( + "method", + [ + lambda x: x.tolist(), + lambda x: x.to_list(), + lambda x: list(x), + lambda x: list(x.__iter__()), + ], + ids=["tolist", "to_list", "list", "iter"], + ) + @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") + # TODO(GH-24559): Remove the filterwarnings + def test_iterable(self, index_or_series, method, dtype, rdtype): + # gh-10904 + # gh-13258 + # coerce iteration to underlying python / pandas types + typ = index_or_series + s = typ([1], dtype=dtype) + result = method(s)[0] + assert isinstance(result, rdtype) + + @pytest.mark.parametrize( + "dtype, rdtype, obj", + [ + ("object", object, "a"), + ("object", int, 1), + ("category", object, "a"), + ("category", int, 1), + ], + ) + @pytest.mark.parametrize( + "method", + [ + lambda x: x.tolist(), + lambda x: x.to_list(), + lambda x: list(x), + lambda x: list(x.__iter__()), + ], + ids=["tolist", "to_list", "list", "iter"], + ) + def test_iterable_object_and_category( + self, index_or_series, method, dtype, rdtype, obj + ): + # gh-10904 + # gh-13258 + # coerce iteration to underlying python / pandas types + typ = index_or_series + s = typ([obj], dtype=dtype) + result = method(s)[0] + assert isinstance(result, rdtype) + + @pytest.mark.parametrize("dtype, rdtype", dtypes) + def test_iterable_items(self, dtype, rdtype): + # gh-13258 + # test if items yields the correct boxed scalars + # this only applies to series + s = Series([1], dtype=dtype) + _, result = list(s.items())[0] + assert isinstance(result, rdtype) + + _, result = list(s.items())[0] + assert isinstance(result, rdtype) + + @pytest.mark.parametrize( + "dtype, rdtype", dtypes + [("object", int), ("category", int)] + ) + @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") + # TODO(GH-24559): Remove the filterwarnings + def test_iterable_map(self, index_or_series, dtype, rdtype): + # gh-13236 + # coerce iteration to underlying python / pandas types + typ = index_or_series + s = typ([1], dtype=dtype) + result = s.map(type)[0] + if not isinstance(rdtype, tuple): + rdtype = tuple([rdtype]) + assert result in rdtype + + @pytest.mark.parametrize( + "method", + [ + lambda x: x.tolist(), + lambda x: x.to_list(), + lambda x: list(x), + lambda x: list(x.__iter__()), + ], + ids=["tolist", "to_list", "list", "iter"], + ) + def test_categorial_datetimelike(self, method): + i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")]) + + result = method(i)[0] + assert isinstance(result, Timestamp) + + def test_iter_box(self): + vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")] + s = Series(vals) + assert s.dtype == "datetime64[ns]" + for res, exp in zip(s, vals): + assert isinstance(res, Timestamp) + assert res.tz is None + assert res == exp + + vals = [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + ] + s = Series(vals) + + assert s.dtype == "datetime64[ns, US/Eastern]" + for res, exp in zip(s, vals): + assert isinstance(res, Timestamp) + assert res.tz == exp.tz + assert res == exp + + # timedelta + vals = [Timedelta("1 days"), Timedelta("2 days")] + s = Series(vals) + assert s.dtype == "timedelta64[ns]" + for res, exp in zip(s, vals): + assert isinstance(res, Timedelta) + assert res 
== exp + + # period + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] + s = Series(vals) + assert s.dtype == "Period[M]" + for res, exp in zip(s, vals): + assert isinstance(res, pd.Period) + assert res.freq == "M" + assert res == exp + + +@pytest.mark.parametrize( + "array, expected_type, dtype", + [ + (np.array([0, 1], dtype=np.int64), np.ndarray, "int64"), + (np.array(["a", "b"]), np.ndarray, "object"), + (pd.Categorical(["a", "b"]), pd.Categorical, "category"), + ( + pd.DatetimeIndex(["2017", "2018"], tz="US/Central"), + DatetimeArray, + "datetime64[ns, US/Central]", + ), + ( + pd.PeriodIndex([2018, 2019], freq="A"), + PeriodArray, + pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), + ), + (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval",), + # This test is currently failing for datetime64[ns] and timedelta64[ns]. + # The NumPy type system is sufficient for representing these types, so + # we just use NumPy for Series / DataFrame columns of these types (so + # we get consolidation and so on). + # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray + # abstraction to for code reuse. + # At the moment, we've judged that allowing this test to fail is more + # practical that overriding Series._values to special case + # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray. + pytest.param( + pd.DatetimeIndex(["2017", "2018"]), + np.ndarray, + "datetime64[ns]", + marks=[pytest.mark.xfail(reason="datetime _values", strict=True)], + ), + pytest.param( + pd.TimedeltaIndex([10 ** 10]), + np.ndarray, + "m8[ns]", + marks=[pytest.mark.xfail(reason="timedelta _values", strict=True)], + ), + ], +) +def test_values_consistent(array, expected_type, dtype): + l_values = pd.Series(array)._values + r_values = pd.Index(array)._values + assert type(l_values) is expected_type + assert type(l_values) is type(r_values) + + tm.assert_equal(l_values, r_values) + + +@pytest.mark.parametrize( + "array, expected", + [ + (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), + (np.array(["0", "1"]), np.array(["0", "1"], dtype=object)), + (pd.Categorical(["a", "a"]), np.array([0, 0], dtype="int8")), + ( + pd.DatetimeIndex(["2017-01-01T00:00:00"]), + np.array(["2017-01-01T00:00:00"], dtype="M8[ns]"), + ), + ( + pd.DatetimeIndex(["2017-01-01T00:00:00"], tz="US/Eastern"), + np.array(["2017-01-01T05:00:00"], dtype="M8[ns]"), + ), + (pd.TimedeltaIndex([10 ** 10]), np.array([10 ** 10], dtype="m8[ns]")), + ( + pd.PeriodIndex(["2017", "2018"], freq="D"), + np.array([17167, 17532], dtype=np.int64), + ), + ], +) +def test_ndarray_values(array, expected): + l_values = pd.Series(array)._ndarray_values + r_values = pd.Index(array)._ndarray_values + tm.assert_numpy_array_equal(l_values, r_values) + tm.assert_numpy_array_equal(l_values, expected) + + +@pytest.mark.parametrize("arr", [np.array([1, 2, 3])]) +def test_numpy_array(arr): + ser = pd.Series(arr) + result = ser.array + expected = PandasArray(arr) + tm.assert_extension_array_equal(result, expected) + + +def test_numpy_array_all_dtypes(any_numpy_dtype): + ser = pd.Series(dtype=any_numpy_dtype) + result = ser.array + if is_datetime64_dtype(any_numpy_dtype): + assert isinstance(result, DatetimeArray) + elif is_timedelta64_dtype(any_numpy_dtype): + assert isinstance(result, TimedeltaArray) + else: + assert isinstance(result, PandasArray) + + +@pytest.mark.parametrize( + "array, attr", + [ + (pd.Categorical(["a", "b"]), "_codes"), + (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"), + 
(pd.core.arrays.integer_array([0, np.nan]), "_data"), + (IntervalArray.from_breaks([0, 1]), "_left"), + (SparseArray([0, 1]), "_sparse_values"), + (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), + # tz-aware Datetime + ( + DatetimeArray( + np.array( + ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]" + ), + dtype=DatetimeTZDtype(tz="US/Central"), + ), + "_data", + ), + ], +) +def test_array(array, attr, index_or_series): + box = index_or_series + if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: + pytest.skip(f"No index type for {array.dtype}") + result = box(array, copy=False).array + + if attr: + array = getattr(array, attr) + result = getattr(result, attr) + + assert result is array + + +def test_array_multiindex_raises(): + idx = pd.MultiIndex.from_product([["A"], ["a", "b"]]) + with pytest.raises(ValueError, match="MultiIndex"): + idx.array + + +@pytest.mark.parametrize( + "array, expected", + [ + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + ( + pd.core.arrays.period_array(["2000", "2001"], freq="D"), + np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + ), + ( + pd.core.arrays.integer_array([0, np.nan]), + np.array([0, pd.NA], dtype=object), + ), + ( + IntervalArray.from_breaks([0, 1, 2]), + np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + ), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + # tz-naive datetime + ( + DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")), + np.array(["2000", "2001"], dtype="M8[ns]"), + ), + # tz-aware stays tz`-aware + ( + DatetimeArray( + np.array( + ["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]" + ), + dtype=DatetimeTZDtype(tz="US/Central"), + ), + np.array( + [ + pd.Timestamp("2000-01-01", tz="US/Central"), + pd.Timestamp("2000-01-02", tz="US/Central"), + ] + ), + ), + # Timedelta + ( + TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"), + np.array([0, 3600000000000], dtype="m8[ns]"), + ), + ], +) +def test_to_numpy(array, expected, index_or_series): + box = index_or_series + thing = box(array) + + if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: + pytest.skip(f"No index type for {array.dtype}") + + result = thing.to_numpy() + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("as_series", [True, False]) +@pytest.mark.parametrize( + "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] +) +def test_to_numpy_copy(arr, as_series): + obj = pd.Index(arr, copy=False) + if as_series: + obj = pd.Series(obj.values, copy=False) + + # no copy by default + result = obj.to_numpy() + assert np.shares_memory(arr, result) is True + + result = obj.to_numpy(copy=False) + assert np.shares_memory(arr, result) is True + + # copy=True + result = obj.to_numpy(copy=True) + assert np.shares_memory(arr, result) is False + + +@pytest.mark.parametrize("as_series", [True, False]) +def test_to_numpy_dtype(as_series): + tz = "US/Eastern" + obj = pd.DatetimeIndex(["2000", "2001"], tz=tz) + if as_series: + obj = pd.Series(obj) + + # preserve tz by default + result = obj.to_numpy() + expected = np.array( + [pd.Timestamp("2000", tz=tz), pd.Timestamp("2001", tz=tz)], dtype=object + ) + tm.assert_numpy_array_equal(result, expected) + + result = obj.to_numpy(dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = obj.to_numpy(dtype="M8[ns]") + expected 
= np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype, na_value, expected", + [ + ([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]), + ( + [pd.Timestamp("2000"), pd.Timestamp("2000"), pd.NaT], + None, + pd.Timestamp("2000"), + [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + ), + ], +) +@pytest.mark.parametrize("container", [pd.Series, pd.Index]) # type: ignore +def test_to_numpy_na_value_numpy_dtype(container, values, dtype, na_value, expected): + s = container(values) + result = s.to_numpy(dtype=dtype, na_value=na_value) + expected = np.array(expected) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_kwargs_raises(): + # numpy + s = pd.Series([1, 2, 3]) + match = r"to_numpy\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) + + # extension + s = pd.Series([1, 2, 3], dtype="Int64") + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) diff --git a/pandas/tests/test_base.py b/pandas/tests/base/test_ops.py similarity index 60% rename from pandas/tests/test_base.py rename to pandas/tests/base/test_ops.py index 1f19f58e80f26..2693eb12dda71 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/base/test_ops.py @@ -1,6 +1,5 @@ from datetime import datetime, timedelta from io import StringIO -import re import sys import numpy as np @@ -14,14 +13,11 @@ is_datetime64_dtype, is_datetime64tz_dtype, is_object_dtype, - is_timedelta64_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd from pandas import ( - CategoricalIndex, DataFrame, DatetimeIndex, Index, @@ -33,127 +29,8 @@ TimedeltaIndex, Timestamp, ) -from pandas.core.accessor import PandasDelegate -from pandas.core.arrays import DatetimeArray, PandasArray, TimedeltaArray -from pandas.core.base import NoNewAttributesMixin, PandasObject +import pandas._testing as tm from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm - - -class CheckStringMixin: - def test_string_methods_dont_fail(self): - repr(self.container) - str(self.container) - bytes(self.container) - - def test_tricky_container(self): - if not hasattr(self, "unicode_container"): - pytest.skip("Need unicode_container to test with this") - repr(self.unicode_container) - str(self.unicode_container) - - -class CheckImmutable: - mutable_regex = re.compile("does not support mutable operations") - - def check_mutable_error(self, *args, **kwargs): - # Pass whatever function you normally would to pytest.raises - # (after the Exception kind). 
- with pytest.raises(TypeError): - self.mutable_regex(*args, **kwargs) - - def test_no_mutable_funcs(self): - def setitem(): - self.container[0] = 5 - - self.check_mutable_error(setitem) - - def setslice(): - self.container[1:2] = 3 - - self.check_mutable_error(setslice) - - def delitem(): - del self.container[0] - - self.check_mutable_error(delitem) - - def delslice(): - del self.container[0:3] - - self.check_mutable_error(delslice) - mutable_methods = getattr(self, "mutable_methods", []) - - for meth in mutable_methods: - self.check_mutable_error(getattr(self.container, meth)) - - def test_slicing_maintains_type(self): - result = self.container[1:2] - expected = self.lst[1:2] - self.check_result(result, expected) - - def check_result(self, result, expected, klass=None): - klass = klass or self.klass - assert isinstance(result, klass) - assert result == expected - - -class TestPandasDelegate: - class Delegator: - _properties = ["foo"] - _methods = ["bar"] - - def _set_foo(self, value): - self.foo = value - - def _get_foo(self): - return self.foo - - foo = property(_get_foo, _set_foo, doc="foo property") - - def bar(self, *args, **kwargs): - """ a test bar method """ - pass - - class Delegate(PandasDelegate, PandasObject): - def __init__(self, obj): - self.obj = obj - - def setup_method(self, method): - pass - - def test_invalid_delegation(self): - # these show that in order for the delegation to work - # the _delegate_* methods need to be overridden to not raise - # a TypeError - - self.Delegate._add_delegate_accessors( - delegate=self.Delegator, - accessors=self.Delegator._properties, - typ="property", - ) - self.Delegate._add_delegate_accessors( - delegate=self.Delegator, accessors=self.Delegator._methods, typ="method" - ) - - delegate = self.Delegate(self.Delegator()) - - with pytest.raises(TypeError): - delegate.foo - - with pytest.raises(TypeError): - delegate.foo = 5 - - with pytest.raises(TypeError): - delegate.foo() - - @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") - def test_memory_usage(self): - # Delegate does not implement memory_usage. 
- # Check that we fall back to in-built `__sizeof__` - # GH 12924 - delegate = self.Delegate(self.Delegator()) - sys.getsizeof(delegate) class Ops: @@ -179,14 +56,14 @@ def setup_method(self, method): self.int_series = Series(arr, index=self.int_index, name="a") self.float_series = Series(arr, index=self.float_index, name="a") self.dt_series = Series(arr, index=self.dt_index, name="a") - self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True) + self.dt_tz_series = self.dt_tz_index.to_series() self.period_series = Series(arr, index=self.period_index, name="a") self.string_series = Series(arr, index=self.string_index, name="a") self.unicode_series = Series(arr, index=self.unicode_index, name="a") types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"] - self.indexes = [getattr(self, "{}_index".format(t)) for t in types] - self.series = [getattr(self, "{}_series".format(t)) for t in types] + self.indexes = [getattr(self, f"{t}_index") for t in types] + self.series = [getattr(self, f"{t}_series") for t in types] # To test narrow dtypes, we use narrower *data* elements, not *index* elements index = self.int_index @@ -202,7 +79,7 @@ def setup_method(self, method): self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a") nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"] - self.narrow_series = [getattr(self, "{}_series".format(t)) for t in nrw_types] + self.narrow_series = [getattr(self, f"{t}_series") for t in nrw_types] self.objs = self.indexes + self.series + self.narrow_series @@ -276,6 +153,28 @@ def test_binary_ops_docs(self, klass): assert expected_str in getattr(klass, "r" + op_name).__doc__ +class TestTranspose(Ops): + errmsg = "the 'axes' parameter is not supported" + + def test_transpose(self): + for obj in self.objs: + tm.assert_equal(obj.transpose(), obj) + + def test_transpose_non_default_axes(self): + for obj in self.objs: + with pytest.raises(ValueError, match=self.errmsg): + obj.transpose(1) + with pytest.raises(ValueError, match=self.errmsg): + obj.transpose(axes=1) + + def test_numpy_transpose(self): + for obj in self.objs: + tm.assert_equal(np.transpose(obj), obj) + + with pytest.raises(ValueError, match=self.errmsg): + np.transpose(obj, axes=1) + + class TestIndexOps(Ops): def setup_method(self, method): super().setup_method(method) @@ -333,31 +232,17 @@ def test_ndarray_compat_properties(self): assert getattr(o, p, None) is not None # deprecated properties - for p in ["flags", "strides", "itemsize"]: - with tm.assert_produces_warning(FutureWarning): - assert getattr(o, p, None) is not None - - with tm.assert_produces_warning(FutureWarning): - assert hasattr(o, "base") - - # If we have a datetime-like dtype then needs a view to work - # but the user is responsible for that - try: - with tm.assert_produces_warning(FutureWarning): - assert o.data is not None - except ValueError: - pass + for p in ["flags", "strides", "itemsize", "base", "data"]: + assert not hasattr(o, p) with pytest.raises(ValueError): - with tm.assert_produces_warning(FutureWarning): - o.item() # len > 1 + o.item() # len > 1 assert o.ndim == 1 assert o.size == len(o) - with tm.assert_produces_warning(FutureWarning): - assert Index([1]).item() == 1 - assert Series([1]).item() == 1 + assert Index([1]).item() == 1 + assert Series([1]).item() == 1 def test_value_counts_unique_nunique(self): for orig in self.objs: @@ -400,7 +285,7 @@ def test_value_counts_unique_nunique(self): result = o.unique() if isinstance(o, Index): - assert 
isinstance(result, o.__class__) + assert isinstance(result, type(o)) tm.assert_index_equal(result, orig) assert result.dtype == orig.dtype elif is_datetime64tz_dtype(o): @@ -516,8 +401,8 @@ def test_value_counts_unique_nunique_null(self, null_obj): assert o.nunique() == 8 assert o.nunique(dropna=False) == 9 - @pytest.mark.parametrize("klass", [Index, Series]) - def test_value_counts_inferred(self, klass): + def test_value_counts_inferred(self, index_or_series): + klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) @@ -547,8 +432,8 @@ def test_value_counts_inferred(self, klass): expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) tm.assert_series_equal(hist, expected) - @pytest.mark.parametrize("klass", [Index, Series]) - def test_value_counts_bins(self, klass): + def test_value_counts_bins(self, index_or_series): + klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) @@ -601,7 +486,7 @@ def test_value_counts_bins(self, klass): tm.assert_numpy_array_equal(s.unique(), exp) assert s.nunique() == 3 - s = klass({}) + s = klass({}) if klass is dict else klass({}, dtype=object) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original @@ -612,8 +497,8 @@ def test_value_counts_bins(self, klass): assert s.nunique() == 0 - @pytest.mark.parametrize("klass", [Index, Series]) - def test_value_counts_datetime64(self, klass): + def test_value_counts_datetime64(self, index_or_series): + klass = index_or_series # GH 3002, datetime64[ns] # don't test names though @@ -653,7 +538,7 @@ def test_value_counts_datetime64(self, klass): # with NaT s = df["dt"].copy() - s = klass([v for v in s.values] + [pd.NaT]) + s = klass(list(s.values) + [pd.NaT]) result = s.value_counts() assert result.index.dtype == "datetime64[ns]" @@ -707,9 +592,9 @@ def test_factorize(self): else: exp_arr = np.array(range(len(o)), dtype=np.intp) exp_uniques = o - labels, uniques = o.factorize() + codes, uniques = o.factorize() - tm.assert_numpy_array_equal(labels, exp_arr) + tm.assert_numpy_array_equal(codes, exp_arr) if isinstance(o, Series): tm.assert_index_equal(uniques, Index(orig), check_names=False) else: @@ -736,9 +621,9 @@ def test_factorize_repeated(self): exp_arr = np.array( [5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp ) - labels, uniques = n.factorize(sort=True) + codes, uniques = n.factorize(sort=True) - tm.assert_numpy_array_equal(labels, exp_arr) + tm.assert_numpy_array_equal(codes, exp_arr) if isinstance(o, Series): tm.assert_index_equal( uniques, Index(orig).sort_values(), check_names=False @@ -747,8 +632,8 @@ def test_factorize_repeated(self): tm.assert_index_equal(uniques, o, check_names=False) exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], np.intp) - labels, uniques = n.factorize(sort=False) - tm.assert_numpy_array_equal(labels, exp_arr) + codes, uniques = n.factorize(sort=False) + tm.assert_numpy_array_equal(codes, exp_arr) if isinstance(o, Series): expected = Index(o.iloc[5:10].append(o.iloc[:5])) @@ -813,9 +698,7 @@ def test_duplicated_drop_duplicates_index(self): with pytest.raises( TypeError, - match=( - r"drop_duplicates\(\) got an " r"unexpected keyword argument" - ), + match=r"drop_duplicates\(\) got an unexpected keyword argument", ): idx.drop_duplicates(inplace=True) @@ -1014,487 +897,3 @@ def 
test_get_indexer_non_unique_dtype_mismatch(self): indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) - - -class TestTranspose(Ops): - errmsg = "the 'axes' parameter is not supported" - - def test_transpose(self): - for obj in self.objs: - tm.assert_equal(obj.transpose(), obj) - - def test_transpose_non_default_axes(self): - for obj in self.objs: - with pytest.raises(ValueError, match=self.errmsg): - obj.transpose(1) - with pytest.raises(ValueError, match=self.errmsg): - obj.transpose(axes=1) - - def test_numpy_transpose(self): - for obj in self.objs: - tm.assert_equal(np.transpose(obj), obj) - - with pytest.raises(ValueError, match=self.errmsg): - np.transpose(obj, axes=1) - - -class TestNoNewAttributesMixin: - def test_mixin(self): - class T(NoNewAttributesMixin): - pass - - t = T() - assert not hasattr(t, "__frozen") - - t.a = "test" - assert t.a == "test" - - t._freeze() - assert "__frozen" in dir(t) - assert getattr(t, "__frozen") - - with pytest.raises(AttributeError): - t.b = "test" - - assert not hasattr(t, "b") - - -class TestToIterable: - # test that we convert an iterable to python types - - dtypes = [ - ("int8", int), - ("int16", int), - ("int32", int), - ("int64", int), - ("uint8", int), - ("uint16", int), - ("uint32", int), - ("uint64", int), - ("float16", float), - ("float32", float), - ("float64", float), - ("datetime64[ns]", Timestamp), - ("datetime64[ns, US/Eastern]", Timestamp), - ("timedelta64[ns]", Timedelta), - ] - - @pytest.mark.parametrize("dtype, rdtype", dtypes) - @pytest.mark.parametrize( - "method", - [ - lambda x: x.tolist(), - lambda x: x.to_list(), - lambda x: list(x), - lambda x: list(x.__iter__()), - ], - ids=["tolist", "to_list", "list", "iter"], - ) - @pytest.mark.parametrize("typ", [Series, Index]) - @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") - # TODO(GH-24559): Remove the filterwarnings - def test_iterable(self, typ, method, dtype, rdtype): - # gh-10904 - # gh-13258 - # coerce iteration to underlying python / pandas types - s = typ([1], dtype=dtype) - result = method(s)[0] - assert isinstance(result, rdtype) - - @pytest.mark.parametrize( - "dtype, rdtype, obj", - [ - ("object", object, "a"), - ("object", int, 1), - ("category", object, "a"), - ("category", int, 1), - ], - ) - @pytest.mark.parametrize( - "method", - [ - lambda x: x.tolist(), - lambda x: x.to_list(), - lambda x: list(x), - lambda x: list(x.__iter__()), - ], - ids=["tolist", "to_list", "list", "iter"], - ) - @pytest.mark.parametrize("typ", [Series, Index]) - def test_iterable_object_and_category(self, typ, method, dtype, rdtype, obj): - # gh-10904 - # gh-13258 - # coerce iteration to underlying python / pandas types - s = typ([obj], dtype=dtype) - result = method(s)[0] - assert isinstance(result, rdtype) - - @pytest.mark.parametrize("dtype, rdtype", dtypes) - def test_iterable_items(self, dtype, rdtype): - # gh-13258 - # test if items yields the correct boxed scalars - # this only applies to series - s = Series([1], dtype=dtype) - _, result = list(s.items())[0] - assert isinstance(result, rdtype) - - _, result = list(s.items())[0] - assert isinstance(result, rdtype) - - @pytest.mark.parametrize( - "dtype, rdtype", dtypes + [("object", int), ("category", int)] - ) - @pytest.mark.parametrize("typ", [Series, Index]) - @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") - # TODO(GH-24559): Remove the 
filterwarnings - def test_iterable_map(self, typ, dtype, rdtype): - # gh-13236 - # coerce iteration to underlying python / pandas types - s = typ([1], dtype=dtype) - result = s.map(type)[0] - if not isinstance(rdtype, tuple): - rdtype = tuple([rdtype]) - assert result in rdtype - - @pytest.mark.parametrize( - "method", - [ - lambda x: x.tolist(), - lambda x: x.to_list(), - lambda x: list(x), - lambda x: list(x.__iter__()), - ], - ids=["tolist", "to_list", "list", "iter"], - ) - def test_categorial_datetimelike(self, method): - i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")]) - - result = method(i)[0] - assert isinstance(result, Timestamp) - - def test_iter_box(self): - vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")] - s = Series(vals) - assert s.dtype == "datetime64[ns]" - for res, exp in zip(s, vals): - assert isinstance(res, Timestamp) - assert res.tz is None - assert res == exp - - vals = [ - Timestamp("2011-01-01", tz="US/Eastern"), - Timestamp("2011-01-02", tz="US/Eastern"), - ] - s = Series(vals) - - assert s.dtype == "datetime64[ns, US/Eastern]" - for res, exp in zip(s, vals): - assert isinstance(res, Timestamp) - assert res.tz == exp.tz - assert res == exp - - # timedelta - vals = [Timedelta("1 days"), Timedelta("2 days")] - s = Series(vals) - assert s.dtype == "timedelta64[ns]" - for res, exp in zip(s, vals): - assert isinstance(res, Timedelta) - assert res == exp - - # period - vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] - s = Series(vals) - assert s.dtype == "Period[M]" - for res, exp in zip(s, vals): - assert isinstance(res, pd.Period) - assert res.freq == "M" - assert res == exp - - -@pytest.mark.parametrize( - "array, expected_type, dtype", - [ - (np.array([0, 1], dtype=np.int64), np.ndarray, "int64"), - (np.array(["a", "b"]), np.ndarray, "object"), - (pd.Categorical(["a", "b"]), pd.Categorical, "category"), - ( - pd.DatetimeIndex(["2017", "2018"], tz="US/Central"), - DatetimeArray, - "datetime64[ns, US/Central]", - ), - ( - pd.PeriodIndex([2018, 2019], freq="A"), - pd.core.arrays.PeriodArray, - pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), - ), - ( - pd.IntervalIndex.from_breaks([0, 1, 2]), - pd.core.arrays.IntervalArray, - "interval", - ), - # This test is currently failing for datetime64[ns] and timedelta64[ns]. - # The NumPy type system is sufficient for representing these types, so - # we just use NumPy for Series / DataFrame columns of these types (so - # we get consolidation and so on). - # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray - # abstraction to for code reuse. - # At the moment, we've judged that allowing this test to fail is more - # practical that overriding Series._values to special case - # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray. 
- pytest.param( - pd.DatetimeIndex(["2017", "2018"]), - np.ndarray, - "datetime64[ns]", - marks=[pytest.mark.xfail(reason="datetime _values", strict=True)], - ), - pytest.param( - pd.TimedeltaIndex([10 ** 10]), - np.ndarray, - "m8[ns]", - marks=[pytest.mark.xfail(reason="timedelta _values", strict=True)], - ), - ], -) -def test_values_consistent(array, expected_type, dtype): - l_values = pd.Series(array)._values - r_values = pd.Index(array)._values - assert type(l_values) is expected_type - assert type(l_values) is type(r_values) - - tm.assert_equal(l_values, r_values) - - -@pytest.mark.parametrize( - "array, expected", - [ - (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), - (np.array(["0", "1"]), np.array(["0", "1"], dtype=object)), - (pd.Categorical(["a", "a"]), np.array([0, 0], dtype="int8")), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"]), - np.array(["2017-01-01T00:00:00"], dtype="M8[ns]"), - ), - ( - pd.DatetimeIndex(["2017-01-01T00:00:00"], tz="US/Eastern"), - np.array(["2017-01-01T05:00:00"], dtype="M8[ns]"), - ), - (pd.TimedeltaIndex([10 ** 10]), np.array([10 ** 10], dtype="m8[ns]")), - ( - pd.PeriodIndex(["2017", "2018"], freq="D"), - np.array([17167, 17532], dtype=np.int64), - ), - ], -) -def test_ndarray_values(array, expected): - l_values = pd.Series(array)._ndarray_values - r_values = pd.Index(array)._ndarray_values - tm.assert_numpy_array_equal(l_values, r_values) - tm.assert_numpy_array_equal(l_values, expected) - - -@pytest.mark.parametrize("arr", [np.array([1, 2, 3])]) -def test_numpy_array(arr): - ser = pd.Series(arr) - result = ser.array - expected = PandasArray(arr) - tm.assert_extension_array_equal(result, expected) - - -def test_numpy_array_all_dtypes(any_numpy_dtype): - ser = pd.Series(dtype=any_numpy_dtype) - result = ser.array - if is_datetime64_dtype(any_numpy_dtype): - assert isinstance(result, DatetimeArray) - elif is_timedelta64_dtype(any_numpy_dtype): - assert isinstance(result, TimedeltaArray) - else: - assert isinstance(result, PandasArray) - - -@pytest.mark.parametrize( - "array, attr", - [ - (pd.Categorical(["a", "b"]), "_codes"), - (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"), - (pd.core.arrays.integer_array([0, np.nan]), "_data"), - (pd.core.arrays.IntervalArray.from_breaks([0, 1]), "_left"), - (pd.SparseArray([0, 1]), "_sparse_values"), - (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), - # tz-aware Datetime - ( - DatetimeArray( - np.array( - ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]" - ), - dtype=DatetimeTZDtype(tz="US/Central"), - ), - "_data", - ), - ], -) -@pytest.mark.parametrize("box", [pd.Series, pd.Index]) -def test_array(array, attr, box): - if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: - pytest.skip("No index type for {}".format(array.dtype)) - result = box(array, copy=False).array - - if attr: - array = getattr(array, attr) - result = getattr(result, attr) - - assert result is array - - -def test_array_multiindex_raises(): - idx = pd.MultiIndex.from_product([["A"], ["a", "b"]]) - with pytest.raises(ValueError, match="MultiIndex"): - idx.array - - -@pytest.mark.parametrize( - "array, expected", - [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), - ( - pd.core.arrays.period_array(["2000", "2001"], freq="D"), - np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), - ), - ( - pd.core.arrays.integer_array([0, np.nan]), - 
np.array([0, np.nan], dtype=object), - ), - ( - pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), - np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), - ), - (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), - # tz-naive datetime - ( - DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")), - np.array(["2000", "2001"], dtype="M8[ns]"), - ), - # tz-aware stays tz`-aware - ( - DatetimeArray( - np.array( - ["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]" - ), - dtype=DatetimeTZDtype(tz="US/Central"), - ), - np.array( - [ - pd.Timestamp("2000-01-01", tz="US/Central"), - pd.Timestamp("2000-01-02", tz="US/Central"), - ] - ), - ), - # Timedelta - ( - TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"), - np.array([0, 3600000000000], dtype="m8[ns]"), - ), - ], -) -@pytest.mark.parametrize("box", [pd.Series, pd.Index]) -def test_to_numpy(array, expected, box): - thing = box(array) - - if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: - pytest.skip("No index type for {}".format(array.dtype)) - - result = thing.to_numpy() - tm.assert_numpy_array_equal(result, expected) - - -@pytest.mark.parametrize("as_series", [True, False]) -@pytest.mark.parametrize( - "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] -) -def test_to_numpy_copy(arr, as_series): - obj = pd.Index(arr, copy=False) - if as_series: - obj = pd.Series(obj.values, copy=False) - - # no copy by default - result = obj.to_numpy() - assert np.shares_memory(arr, result) is True - - result = obj.to_numpy(copy=False) - assert np.shares_memory(arr, result) is True - - # copy=True - result = obj.to_numpy(copy=True) - assert np.shares_memory(arr, result) is False - - -@pytest.mark.parametrize("as_series", [True, False]) -def test_to_numpy_dtype(as_series): - tz = "US/Eastern" - obj = pd.DatetimeIndex(["2000", "2001"], tz=tz) - if as_series: - obj = pd.Series(obj) - - # preserve tz by default - result = obj.to_numpy() - expected = np.array( - [pd.Timestamp("2000", tz=tz), pd.Timestamp("2001", tz=tz)], dtype=object - ) - tm.assert_numpy_array_equal(result, expected) - - result = obj.to_numpy(dtype="object") - tm.assert_numpy_array_equal(result, expected) - - result = obj.to_numpy(dtype="M8[ns]") - expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]") - tm.assert_numpy_array_equal(result, expected) - - -class TestConstruction: - # test certain constructor behaviours on dtype inference across Series, - # Index and DataFrame - - @pytest.mark.parametrize( - "klass", - [ - Series, - lambda x, **kwargs: DataFrame({"a": x}, **kwargs)["a"], - pytest.param( - lambda x, **kwargs: DataFrame(x, **kwargs)[0], marks=pytest.mark.xfail - ), - Index, - ], - ) - @pytest.mark.parametrize( - "a", - [ - np.array(["2263-01-01"], dtype="datetime64[D]"), - np.array([datetime(2263, 1, 1)], dtype=object), - np.array([np.datetime64("2263-01-01", "D")], dtype=object), - np.array(["2263-01-01"], dtype=object), - ], - ids=[ - "datetime64[D]", - "object-datetime.datetime", - "object-numpy-scalar", - "object-string", - ], - ) - def test_constructor_datetime_outofbound(self, a, klass): - # GH-26853 (+ bug GH-26206 out of bound non-ns unit) - - # No dtype specified (dtype inference) - # datetime64[non-ns] raise error, other cases result in object dtype - # and preserve original data - if a.dtype.kind == "M": - with pytest.raises(pd.errors.OutOfBoundsDatetime): - klass(a) - else: - result = klass(a) - assert result.dtype == "object" - 
tm.assert_numpy_array_equal(result.to_numpy(), a) - - # Explicit dtype specified - # Forced conversion fails for all -> all cases raise error - with pytest.raises(pd.errors.OutOfBoundsDatetime): - klass(a, dtype="datetime64[ns]") diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index a075521b67561..7f68abb92ba43 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -16,6 +16,7 @@ import pandas as pd from pandas import DataFrame, Series, compat, date_range +import pandas._testing as tm from pandas.core.computation import pytables from pandas.core.computation.check import _NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines @@ -33,7 +34,6 @@ _special_case_arith_ops_syms, _unary_math_ops, ) -import pandas.util.testing as tm @pytest.fixture( @@ -42,10 +42,8 @@ engine, marks=pytest.mark.skipif( engine == "numexpr" and not _USE_NUMEXPR, - reason="numexpr enabled->{enabled}, " - "installed->{installed}".format( - enabled=_USE_NUMEXPR, installed=_NUMEXPR_INSTALLED - ), + reason=f"numexpr enabled->{_USE_NUMEXPR}, " + f"installed->{_NUMEXPR_INSTALLED}", ), ) for engine in _engines @@ -189,9 +187,7 @@ def test_complex_cmp_ops(self, cmp1, cmp2): rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) - ex = "(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)".format( - cmp1=cmp1, binop=binop, cmp2=cmp2 - ) + ex = f"(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)" result = pd.eval(ex, engine=self.engine, parser=self.parser) self.check_equal(result, expected) @@ -265,9 +261,9 @@ def check_operands(left, right, cmp_op): rhs_new = check_operands(mid, rhs, cmp2) if lhs_new is not None and rhs_new is not None: - ex1 = "lhs {0} mid {1} rhs".format(cmp1, cmp2) - ex2 = "lhs {0} mid and mid {1} rhs".format(cmp1, cmp2) - ex3 = "(lhs {0} mid) & (mid {1} rhs)".format(cmp1, cmp2) + ex1 = f"lhs {cmp1} mid {cmp2} rhs" + ex2 = f"lhs {cmp1} mid and mid {cmp2} rhs" + ex3 = f"(lhs {cmp1} mid) & (mid {cmp2} rhs)" expected = _eval_single_bin(lhs_new, "&", rhs_new, self.engine) for ex in (ex1, ex2, ex3): @@ -276,7 +272,7 @@ def check_operands(left, right, cmp_op): tm.assert_almost_equal(result, expected) def check_simple_cmp_op(self, lhs, cmp1, rhs): - ex = "lhs {0} rhs".format(cmp1) + ex = f"lhs {cmp1} rhs" msg = ( r"only list-like( or dict-like)? 
objects are allowed to be" r" passed to (DataFrame\.)?isin\(\), you passed a" @@ -297,12 +293,12 @@ def check_simple_cmp_op(self, lhs, cmp1, rhs): self.check_equal(result, expected) def check_binary_arith_op(self, lhs, arith1, rhs): - ex = "lhs {0} rhs".format(arith1) + ex = f"lhs {arith1} rhs" result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = _eval_single_bin(lhs, arith1, rhs, self.engine) tm.assert_almost_equal(result, expected) - ex = "lhs {0} rhs {0} rhs".format(arith1) + ex = f"lhs {arith1} rhs {arith1} rhs" result = pd.eval(ex, engine=self.engine, parser=self.parser) nlhs = _eval_single_bin(lhs, arith1, rhs, self.engine) self.check_alignment(result, nlhs, rhs, arith1) @@ -317,25 +313,25 @@ def check_alignment(self, result, nlhs, ghs, op): else: # direct numpy comparison - expected = self.ne.evaluate("nlhs {0} ghs".format(op)) + expected = self.ne.evaluate(f"nlhs {op} ghs") tm.assert_numpy_array_equal(result.values, expected) # modulus, pow, and floor division require special casing def check_modulus(self, lhs, arith1, rhs): - ex = "lhs {0} rhs".format(arith1) + ex = f"lhs {arith1} rhs" result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs tm.assert_almost_equal(result, expected) - expected = self.ne.evaluate("expected {0} rhs".format(arith1)) + expected = self.ne.evaluate(f"expected {arith1} rhs") if isinstance(result, (DataFrame, Series)): tm.assert_almost_equal(result.values, expected) else: tm.assert_almost_equal(result, expected.item()) def check_floor_division(self, lhs, arith1, rhs): - ex = "lhs {0} rhs".format(arith1) + ex = f"lhs {arith1} rhs" if self.engine == "python": res = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -343,8 +339,8 @@ def check_floor_division(self, lhs, arith1, rhs): self.check_equal(res, expected) else: msg = ( - r"unsupported operand type\(s\) for //: 'VariableNode' and" - " 'VariableNode'" + r"unsupported operand type\(s\) for //: 'VariableNode' and " + "'VariableNode'" ) with pytest.raises(TypeError, match=msg): pd.eval( @@ -370,7 +366,7 @@ def get_expected_pow_result(self, lhs, rhs): return expected def check_pow(self, lhs, arith1, rhs): - ex = "lhs {0} rhs".format(arith1) + ex = f"lhs {arith1} rhs" expected = self.get_expected_pow_result(lhs, rhs) result = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -384,7 +380,7 @@ def check_pow(self, lhs, arith1, rhs): else: tm.assert_almost_equal(result, expected) - ex = "(lhs {0} rhs) {0} rhs".format(arith1) + ex = f"(lhs {arith1} rhs) {arith1} rhs" result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = self.get_expected_pow_result( self.get_expected_pow_result(lhs, rhs), rhs @@ -409,7 +405,7 @@ def check_single_invert_op(self, lhs, cmp1, rhs): def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = ["in", "not in"] - ex = "~(lhs {0} rhs)".format(cmp1) + ex = f"~(lhs {cmp1} rhs)" msg = ( r"only list-like( or dict-like)? 
objects are allowed to be" @@ -443,7 +439,7 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): tm.assert_almost_equal(ev, result) def ex(self, op, var_name="lhs"): - return "{0}{1}".format(op, var_name) + return f"{op}{var_name}" def test_frame_invert(self): expr = self.ex("~") @@ -733,16 +729,16 @@ def test_float_truncation(self): df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) cutoff = 1000000000.0006 - result = df.query("A < {cutoff:.4f}".format(cutoff=cutoff)) + result = df.query(f"A < {cutoff:.4f}") assert result.empty cutoff = 1000000000.0010 - result = df.query("A > {cutoff:.4f}".format(cutoff=cutoff)) + result = df.query(f"A > {cutoff:.4f}") expected = df.loc[[1, 2], :] tm.assert_frame_equal(expected, result) exact = 1000000000.0011 - result = df.query("A == {exact:.4f}".format(exact=exact)) + result = df.query(f"A == {exact:.4f}") expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) @@ -781,7 +777,7 @@ def setup_ops(self): self.unary_ops = "+", "-", "~" def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): - ex1 = "lhs {0} mid {1} rhs".format(cmp1, cmp2) + ex1 = f"lhs {cmp1} mid {cmp2} rhs" with pytest.raises(NotImplementedError): pd.eval(ex1, engine=self.engine, parser=self.parser) @@ -794,7 +790,7 @@ def setup_class(cls): cls.parser = "python" def check_modulus(self, lhs, arith1, rhs): - ex = "lhs {0} rhs".format(arith1) + ex = f"lhs {arith1} rhs" result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs @@ -811,7 +807,7 @@ def check_alignment(self, result, nlhs, ghs, op): # TypeError, AttributeError: series or frame with scalar align pass else: - expected = eval("nlhs {0} ghs".format(op)) + expected = eval(f"nlhs {op} ghs") tm.assert_almost_equal(result, expected) @@ -840,13 +836,13 @@ class TestTypeCasting: @pytest.mark.parametrize("dt", [np.float32, np.float64]) def test_binop_typecasting(self, engine, parser, op, dt): df = tm.makeCustomDataframe(5, 3, data_gen_f=f, dtype=dt) - s = "df {} 3".format(op) + s = f"df {op} 3" res = pd.eval(s, engine=engine, parser=parser) assert df.values.dtype == dt assert res.values.dtype == dt tm.assert_frame_equal(res, eval(s)) - s = "3 {} df".format(op) + s = f"3 {op} df" res = pd.eval(s, engine=engine, parser=parser) assert df.values.dtype == dt assert res.values.dtype == dt @@ -1013,8 +1009,8 @@ def test_series_frame_commutativity(self, engine, parser): index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) - lhs = "s {0} df".format(op) - rhs = "df {0} s".format(op) + lhs = f"s {op} df" + rhs = f"df {op} s" if should_warn(df.index, s.index): with tm.assert_produces_warning(RuntimeWarning): a = pd.eval(lhs, engine=engine, parser=parser) @@ -1114,11 +1110,11 @@ def test_performance_warning_for_poor_alignment(self, engine, parser): if not is_python_engine: assert len(w) == 1 msg = str(w[0].message) + loged = np.log10(s.size - df.shape[1]) expected = ( - "Alignment difference on axis {0} is larger" - " than an order of magnitude on term {1!r}, " - "by more than {2:.4g}; performance may suffer" - "".format(1, "df", np.log10(s.size - df.shape[1])) + f"Alignment difference on axis 1 is larger " + f"than an order of magnitude on term 'df', " + f"by more than {loged:.4g}; performance may suffer" ) assert msg == expected @@ -1149,9 +1145,9 @@ def test_simple_arith_ops(self): ops = self.arith_ops for op in filter(lambda x: x != "//", ops): - ex = "1 {0} 1".format(op) - ex2 = "x {0} 1".format(op) - ex3 = "1 {0} (x + 1)".format(op) + ex = f"1 {op} 1" + ex2 = 
f"x {op} 1" + ex3 = f"1 {op} (x + 1)" if op in ("in", "not in"): msg = "argument of type 'int' is not iterable" @@ -1176,7 +1172,7 @@ def test_simple_arith_ops(self): def test_simple_bool_ops(self): for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): - ex = "{0} {1} {2}".format(lhs, op, rhs) + ex = f"{lhs} {op} {rhs}" res = self.eval(ex) exp = eval(ex) assert res == exp @@ -1185,7 +1181,7 @@ def test_bool_ops_with_constants(self): for op, lhs, rhs in product( expr._bool_ops_syms, ("True", "False"), ("True", "False") ): - ex = "{0} {1} {2}".format(lhs, op, rhs) + ex = f"{lhs} {op} {rhs}" res = self.eval(ex) exp = eval(ex) assert res == exp @@ -1210,25 +1206,33 @@ def test_truediv(self): ex = "s / 1" d = {"s": s} # noqa - res = self.eval(ex, truediv=False) + # FutureWarning: The `truediv` parameter in pd.eval is deprecated and will be + # removed in a future version. + with tm.assert_produces_warning(FutureWarning): + res = self.eval(ex, truediv=False) tm.assert_numpy_array_equal(res, np.array([1.0])) - res = self.eval(ex, truediv=True) + with tm.assert_produces_warning(FutureWarning): + res = self.eval(ex, truediv=True) tm.assert_numpy_array_equal(res, np.array([1.0])) - res = self.eval("1 / 2", truediv=True) + with tm.assert_produces_warning(FutureWarning): + res = self.eval("1 / 2", truediv=True) expec = 0.5 assert res == expec - res = self.eval("1 / 2", truediv=False) + with tm.assert_produces_warning(FutureWarning): + res = self.eval("1 / 2", truediv=False) expec = 0.5 assert res == expec - res = self.eval("s / 2", truediv=False) + with tm.assert_produces_warning(FutureWarning): + res = self.eval("s / 2", truediv=False) expec = 0.5 assert res == expec - res = self.eval("s / 2", truediv=True) + with tm.assert_produces_warning(FutureWarning): + res = self.eval("s / 2", truediv=True) expec = 0.5 assert res == expec @@ -1679,7 +1683,7 @@ def test_bool_ops_with_constants(self): for op, lhs, rhs in product( expr._bool_ops_syms, ("True", "False"), ("True", "False") ): - ex = "{0} {1} {2}".format(lhs, op, rhs) + ex = f"{lhs} {op} {rhs}" if op in ("and", "or"): with pytest.raises(NotImplementedError): self.eval(ex) @@ -1690,7 +1694,7 @@ def test_bool_ops_with_constants(self): def test_simple_bool_ops(self): for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): - ex = "lhs {0} rhs".format(op) + ex = f"lhs {op} rhs" if op in ("and", "or"): with pytest.raises(NotImplementedError): pd.eval(ex, engine=self.engine, parser=self.parser) @@ -1742,7 +1746,7 @@ def test_unary_functions(self, unary_fns_for_ne): a = df.a for fn in unary_fns_for_ne: - expr = "{0}(a)".format(fn) + expr = f"{fn}(a)" got = self.eval(expr) with np.errstate(all="ignore"): expect = getattr(np, fn)(a) @@ -1750,9 +1754,9 @@ def test_unary_functions(self, unary_fns_for_ne): def test_floor_and_ceil_functions_raise_error(self, ne_lt_2_6_9, unary_fns_for_ne): for fn in ("floor", "ceil"): - msg = '"{0}" is not a supported function'.format(fn) + msg = f'"{fn}" is not a supported function' with pytest.raises(ValueError, match=msg): - expr = "{0}(100)".format(fn) + expr = f"{fn}(100)" self.eval(expr) def test_binary_functions(self): @@ -1760,7 +1764,7 @@ def test_binary_functions(self): a = df.a b = df.b for fn in self.binary_fns: - expr = "{0}(a, b)".format(fn) + expr = f"{fn}(a, b)" got = self.eval(expr) with np.errstate(all="ignore"): expect = getattr(np, fn)(a, b) @@ -1889,11 +1893,11 @@ def test_invalid_parser(): pd.eval("x + y", local_dict={"x": 1, "y": 2}, parser="asdf") -_parsers 
= { +_parsers: Dict[str, Type[BaseExprVisitor]] = { "python": PythonExprVisitor, - "pytables": pytables.ExprVisitor, + "pytables": pytables.PyTablesExprVisitor, "pandas": PandasExprVisitor, -} # type: Dict[str, Type[BaseExprVisitor]] +} @pytest.mark.parametrize("engine", _engines) @@ -1971,9 +1975,9 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): lhs = gen[lhs]() # noqa rhs = gen[rhs]() # noqa - ex1 = "lhs {0} mid {1} rhs".format(cmp, cmp) - ex2 = "lhs {0} mid and mid {1} rhs".format(cmp, cmp) - ex3 = "(lhs {0} mid) & (mid {1} rhs)".format(cmp, cmp) + ex1 = f"lhs {cmp} mid {cmp} rhs" + ex2 = f"lhs {cmp} mid and mid {cmp} rhs" + ex3 = f"(lhs {cmp} mid) & (mid {cmp} rhs)" for ex in (ex1, ex2, ex3): with pytest.raises(NotImplementedError): pd.eval(ex, engine=engine, parser=parser) @@ -1990,7 +1994,7 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): ) def test_equals_various(other): df = DataFrame({"A": ["a", "b", "c"]}) - result = df.eval("A == {}".format(other)) + result = df.eval(f"A == {other}") expected = Series([False, False, False], name="A") if _USE_NUMEXPR: # https://github.com/pandas-dev/pandas/issues/10239 @@ -2006,6 +2010,23 @@ def test_inf(engine, parser): assert result == expected +def test_truediv_deprecated(engine, parser): + # GH#29182 + match = "The `truediv` parameter in pd.eval is deprecated" + + with tm.assert_produces_warning(FutureWarning) as m: + pd.eval("1+1", engine=engine, parser=parser, truediv=True) + + assert len(m) == 1 + assert match in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + pd.eval("1+1", engine=engine, parser=parser, truediv=False) + + assert len(m) == 1 + assert match in str(m[0].message) + + def test_negate_lt_eq_le(engine, parser): df = pd.DataFrame([[0, 10], [1, 20]], columns=["cat", "count"]) expected = df[~(df.cat > 0)] diff --git a/pandas/tests/config/test_localization.py b/pandas/tests/config/test_localization.py index 20a5be0c8a289..e815a90207a08 100644 --- a/pandas/tests/config/test_localization.py +++ b/pandas/tests/config/test_localization.py @@ -8,6 +8,8 @@ from pandas.compat import is_platform_windows +import pandas as pd + _all_locales = get_locales() or [] _current_locale = locale.getlocale() @@ -56,21 +58,21 @@ def test_get_locales_prefix(): @_skip_if_only_one_locale -def test_set_locale(): +@pytest.mark.parametrize( + "lang,enc", + [ + ("it_CH", "UTF-8"), + ("en_US", "ascii"), + ("zh_CN", "GB2312"), + ("it_IT", "ISO-8859-1"), + ], +) +def test_set_locale(lang, enc): if all(x is None for x in _current_locale): # Not sure why, but on some Travis runs with pytest, # getlocale() returned (None, None). pytest.skip("Current locale is not set.") - locale_override = os.environ.get("LOCALE_OVERRIDE", None) - - if locale_override is None: - lang, enc = "it_CH", "UTF-8" - elif locale_override == "C": - lang, enc = "en_US", "ascii" - else: - lang, enc = locale_override.split(".") - enc = codecs.lookup(enc).name new_locale = lang, enc @@ -91,3 +93,13 @@ def test_set_locale(): # Once we exit the "with" statement, locale should be back to what it was. 
current_locale = locale.getlocale() assert current_locale == _current_locale + + +def test_encoding_detected(): + system_locale = os.environ.get("LC_ALL") + system_encoding = system_locale.split(".")[-1] if system_locale else "utf-8" + + assert ( + codecs.lookup(pd.options.display.encoding).name + == codecs.lookup(system_encoding).name + ) diff --git a/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/pandas/tests/dtypes/cast/test_construct_from_scalar.py index 71f41fcf5b447..cc823a3d6e02c 100644 --- a/pandas/tests/dtypes/cast/test_construct_from_scalar.py +++ b/pandas/tests/dtypes/cast/test_construct_from_scalar.py @@ -2,7 +2,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical -import pandas.util.testing as tm +import pandas._testing as tm def test_cast_1d_array_like_from_scalar_categorical(): diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index 620e74f80d5fb..fe271392122a2 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -3,7 +3,7 @@ from pandas.core.dtypes.cast import construct_1d_ndarray_preserving_na -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 99afabfa42a04..d6e6ed3022b75 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas import DatetimeIndex, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index bf11b81af6f90..2744cfa8ddc62 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -10,8 +10,16 @@ ) from pandas.core.dtypes.common import is_dtype_equal -from pandas import Categorical, Period, Series, Timedelta, Timestamp, date_range -import pandas.util.testing as tm +from pandas import ( + Categorical, + Interval, + Period, + Series, + Timedelta, + Timestamp, + date_range, +) +import pandas._testing as tm @pytest.fixture(params=[True, False]) @@ -73,7 +81,7 @@ def test_infer_dtype_from_period(freq, pandas_dtype): dtype, val = infer_dtype_from_scalar(p, pandas_dtype=pandas_dtype) if pandas_dtype: - exp_dtype = "period[{0}]".format(freq) + exp_dtype = f"period[{freq}]" exp_val = p.ordinal else: exp_dtype = np.object_ @@ -97,7 +105,7 @@ def test_infer_from_scalar_tz(tz, pandas_dtype): dtype, val = infer_dtype_from_scalar(dt, pandas_dtype=pandas_dtype) if pandas_dtype: - exp_dtype = "datetime64[ns, {0}]".format(tz) + exp_dtype = f"datetime64[ns, {tz}]" exp_val = dt.value else: exp_dtype = np.object_ @@ -107,6 +115,25 @@ def test_infer_from_scalar_tz(tz, pandas_dtype): assert val == exp_val +@pytest.mark.parametrize( + "left, right, subtype", + [ + (0, 1, "int64"), + (0.0, 1.0, "float64"), + (Timestamp(0), Timestamp(1), "datetime64[ns]"), + (Timestamp(0, tz="UTC"), Timestamp(1, tz="UTC"), "datetime64[ns, UTC]"), + (Timedelta(0), Timedelta(1), "timedelta64[ns]"), + ], +) +def test_infer_from_interval(left, right, subtype, closed, pandas_dtype): + # GH 30337 + interval = Interval(left, right, closed) + result_dtype, result_value = infer_dtype_from_scalar(interval, pandas_dtype) + expected_dtype = 
f"interval[{subtype}]" if pandas_dtype else np.object_ + assert result_dtype == expected_dtype + assert result_value == interval + + def test_infer_dtype_from_scalar_errors(): msg = "invalid ndarray passed to infer_dtype_from_scalar" diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 5c61574eddb50..69f8f46356a4d 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -8,7 +8,6 @@ import pytest from pandas._libs.tslibs import NaT -from pandas.compat import is_platform_windows from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( @@ -19,7 +18,6 @@ is_integer_dtype, is_object_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -65,42 +63,7 @@ def any_numpy_dtype_reduced(request): return request.param -@pytest.fixture( - params=[(True, None), (True, object), (False, None)], - ids=["True-None", "True-object", "False-None"], -) -def box(request): - """ - Parametrized fixture determining whether/how to transform fill_value. - - Since fill_value is defined on a per-test basis, the actual transformation - (based on this fixture) is executed in _check_promote. - - Returns - ------- - boxed : Boolean - Whether fill_value should be wrapped in an np.array. - box_dtype : dtype - The dtype to pass to np.array([fill_value], dtype=box_dtype). If None, - then this is passed on unmodified, and corresponds to the numpy default - dtype for the given fill_value. - - * (True, None) # fill_value wrapped in array with default dtype - * (True, object) # fill_value wrapped in array with object dtype - * (False, None) # fill_value passed on as scalar - """ - return request.param - - -def _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar=None, - exp_val_for_array=None, -): +def _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar=None): """ Auxiliary function to unify testing of scalar/array promotion. @@ -109,13 +72,8 @@ def _check_promote( dtype : dtype The value to pass on as the first argument to maybe_promote. fill_value : scalar - The value to pass on as the second argument to maybe_promote, either as - a scalar, or boxed into an array (depending on the parameter `boxed`). - boxed : Boolean - Parameter whether fill_value should be passed to maybe_promote - directly, or wrapped in an array (of dtype box_dtype). - box_dtype : dtype - The dtype to enforce when wrapping fill_value into an np.array. + The value to pass on as the second argument to maybe_promote as + a scalar. expected_dtype : dtype The expected dtype returned by maybe_promote (by design this is the same regardless of whether fill_value was passed as a scalar or in an @@ -123,25 +81,14 @@ def _check_promote( exp_val_for_scalar : scalar The expected value for the (potentially upcast) fill_value returned by maybe_promote. - exp_val_for_array : scalar - The expected missing value marker for the expected_dtype (which is - returned by maybe_promote when it receives an array). """ assert is_scalar(fill_value) - if boxed: - # in this case, we pass on fill_value wrapped in an array of specified - # box_dtype; the expected value returned from maybe_promote is the - # missing value marker for the returned dtype. 
- fill_array = np.array([fill_value], dtype=box_dtype) - result_dtype, result_fill_value = maybe_promote(dtype, fill_array) - expected_fill_value = exp_val_for_array - else: - # here, we pass on fill_value as a scalar directly; the expected value - # returned from maybe_promote is fill_value, potentially upcast to the - # returned dtype. - result_dtype, result_fill_value = maybe_promote(dtype, fill_value) - expected_fill_value = exp_val_for_scalar + # here, we pass on fill_value as a scalar directly; the expected value + # returned from maybe_promote is fill_value, potentially upcast to the + # returned dtype. + result_dtype, result_fill_value = maybe_promote(dtype, fill_value) + expected_fill_value = exp_val_for_scalar assert result_dtype == expected_dtype _assert_match(result_fill_value, expected_fill_value) @@ -280,41 +227,19 @@ def _assert_match(result_fill_value, expected_fill_value): ("uint64", np.iinfo("int64").min - 1, "object"), ], ) -def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype, box): +def test_maybe_promote_int_with_int(dtype, fill_value, expected_dtype): dtype = np.dtype(dtype) expected_dtype = np.dtype(expected_dtype) - boxed, box_dtype = box # read from parametrized fixture - - if boxed: - if expected_dtype != object: - pytest.xfail("falsely casts to object") - if box_dtype is None and ( - fill_value > np.iinfo("int64").max or np.iinfo("int64").min < fill_value < 0 - ): - pytest.xfail("falsely casts to float instead of object") # output is not a generic int, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] - # no missing value marker for integers - exp_val_for_array = None if expected_dtype != "object" else np.nan - - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) - - -# override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize("box", [(True, None), (False, None)]) -def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, box): + + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) + + +def test_maybe_promote_int_with_float(any_int_dtype, float_dtype): dtype = np.dtype(any_int_dtype) fill_dtype = np.dtype(float_dtype) - boxed, box_dtype = box # read from parametrized fixture # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -323,26 +248,14 @@ def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, box): expected_dtype = np.float64 # fill_value can be different float type exp_val_for_scalar = np.float64(fill_value) - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -# override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize("box", [(True, None), (False, None)]) -def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): +def test_maybe_promote_float_with_int(float_dtype, any_int_dtype): dtype = np.dtype(float_dtype) fill_dtype = np.dtype(any_int_dtype) - boxed, box_dtype = box # read from parametrized fixture # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -352,17 +265,8 @@ def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): expected_dtype = dtype # output is not a generic float, but 
corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) @pytest.mark.parametrize( @@ -390,49 +294,20 @@ def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): ("complex128", np.finfo("float32").max * (1.1 + 1j), "complex128"), ], ) -def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype, box): +def test_maybe_promote_float_with_float(dtype, fill_value, expected_dtype): dtype = np.dtype(dtype) expected_dtype = np.dtype(expected_dtype) - boxed, box_dtype = box # read from parametrized fixture - - if box_dtype == object: - pytest.xfail("falsely upcasts to object") - elif boxed and is_float_dtype(dtype) and is_complex_dtype(expected_dtype): - pytest.xfail("does not upcast to complex") - elif boxed and (dtype, expected_dtype) in [ - ("float32", "float64"), - ("float32", "complex64"), - ("complex64", "complex128"), - ]: - pytest.xfail("does not upcast correctly depending on value") # output is not a generic float, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): +def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced): dtype = np.dtype(bool) fill_dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture - - if boxed and fill_dtype == bool: - pytest.xfail("falsely upcasts to object") - if boxed and box_dtype is None and fill_dtype.kind == "M": - pytest.xfail("wrongly casts fill_value") - if boxed and box_dtype is None and fill_dtype.kind == "m": - pytest.xfail("wrongly casts fill_value") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -440,50 +315,25 @@ def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): # filling bool with anything but bool casts to object expected_dtype = np.dtype(object) if fill_dtype != bool else fill_dtype exp_val_for_scalar = fill_value - exp_val_for_array = np.nan if fill_dtype != bool else None - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): +def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced): dtype = np.dtype(any_numpy_dtype_reduced) fill_value = True - boxed, box_dtype = box # read from parametrized fixture - - if boxed and dtype == bool: - pytest.xfail("falsely upcasts to object") - if boxed and dtype not in (str, object) and box_dtype is None: - pytest.xfail("falsely upcasts to object") # filling anything but bool with bool casts to object expected_dtype = np.dtype(object) if dtype != bool else dtype # output is not a generic bool, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] - exp_val_for_array = np.nan if dtype != bool else None - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - 
exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box): +def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced): dtype = np.dtype(bytes_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -491,78 +341,27 @@ def test_maybe_promote_bytes_with_any(bytes_dtype, any_numpy_dtype_reduced, box) # we never use bytes dtype internally, always promote to object expected_dtype = np.dtype(np.object_) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -# override parametrization of box to add special case for bytes -@pytest.mark.parametrize( - "box", - [ - (True, None), # fill_value wrapped in array with auto-dtype (fixed len) - (True, "bytes"), # fill_value wrapped in array with generic bytes-dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None), # fill_value directly - ], -) -def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype, box): +def test_maybe_promote_any_with_bytes(any_numpy_dtype_reduced, bytes_dtype): dtype = np.dtype(any_numpy_dtype_reduced) - fill_dtype = np.dtype(bytes_dtype) - boxed, box_dtype = box # read from parametrized fixture - - if not issubclass(dtype.type, np.bytes_): - if ( - boxed - and (box_dtype == "bytes" or box_dtype is None) - and not (is_string_dtype(dtype) or dtype == bool) - ): - pytest.xfail("does not upcast to object") # create array of given dtype fill_value = b"abc" - # special case for box_dtype (cannot use fixture in parametrization) - box_dtype = fill_dtype if box_dtype == "bytes" else box_dtype - # we never use bytes dtype internally, always promote to object expected_dtype = np.dtype(np.object_) # output is not a generic bytes, but corresponds to expected_dtype exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -def test_maybe_promote_datetime64_with_any( - datetime64_dtype, any_numpy_dtype_reduced, box -): +def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype_reduced): dtype = np.dtype(datetime64_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture - - if is_datetime64_dtype(fill_dtype): - if box_dtype == object: - pytest.xfail("falsely upcasts to object") - else: - if boxed and box_dtype is None: - pytest.xfail("does not upcast to object") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -572,34 +371,13 @@ def test_maybe_promote_datetime64_with_any( expected_dtype = dtype # for datetime dtypes, scalar values get cast to to_datetime64 exp_val_for_scalar = pd.Timestamp(fill_value).to_datetime64() - exp_val_for_array = np.datetime64("NaT", "ns") else: expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - 
boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -# override parametrization of box to add special case for dt_dtype -@pytest.mark.parametrize( - "box", - [ - (True, None), # fill_value wrapped in array with default dtype - # disabled due to too many xfails; see GH 23982 / 25425 - # (True, 'dt_dtype'), # fill_value in array with explicit datetime dtype - # (True, object), # fill_value wrapped in array with object dtype - (False, None), # fill_value passed on as scalar - ], -) @pytest.mark.parametrize( "fill_value", [ @@ -611,57 +389,27 @@ def test_maybe_promote_datetime64_with_any( ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], ) def test_maybe_promote_any_with_datetime64( - any_numpy_dtype_reduced, datetime64_dtype, fill_value, box + any_numpy_dtype_reduced, datetime64_dtype, fill_value ): dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture - - if is_datetime64_dtype(dtype): - if boxed and ( - box_dtype == object - or (box_dtype is None and not is_datetime64_dtype(type(fill_value))) - ): - pytest.xfail("falsely upcasts to object") - else: - if boxed and ( - box_dtype == "dt_dtype" - or (box_dtype is None and is_datetime64_dtype(type(fill_value))) - ): - pytest.xfail("mix of lack of upcasting, resp. wrong missing value") - - # special case for box_dtype - box_dtype = np.dtype(datetime64_dtype) if box_dtype == "dt_dtype" else box_dtype # filling datetime with anything but datetime casts to object if is_datetime64_dtype(dtype): expected_dtype = dtype # for datetime dtypes, scalar values get cast to pd.Timestamp.value exp_val_for_scalar = pd.Timestamp(fill_value).to_datetime64() - exp_val_for_array = np.datetime64("NaT", "ns") else: expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -# override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize("box", [(True, object)]) def test_maybe_promote_datetimetz_with_any_numpy_dtype( - tz_aware_fixture, any_numpy_dtype_reduced, box + tz_aware_fixture, any_numpy_dtype_reduced ): dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -669,34 +417,13 @@ def test_maybe_promote_datetimetz_with_any_numpy_dtype( # filling datetimetz with any numpy dtype casts to object expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) - - -# override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize("box", [(True, None), (True, object)]) -def test_maybe_promote_datetimetz_with_datetimetz( - tz_aware_fixture, tz_aware_fixture2, box -): - dtype = DatetimeTZDtype(tz=tz_aware_fixture) - fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) - boxed, box_dtype = box # read from parametrized fixture - from dateutil.tz import tzlocal + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) - if 
is_platform_windows() and tz_aware_fixture2 == tzlocal(): - pytest.xfail("Cannot process fill_value with this dtype, see GH 24310") - if dtype.tz == fill_dtype.tz and boxed: - pytest.xfail("falsely upcasts") + +def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, tz_aware_fixture2): + dtype = DatetimeTZDtype(tz=tz_aware_fixture) + fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) # create array of given dtype; casts "1" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] @@ -705,43 +432,21 @@ def test_maybe_promote_datetimetz_with_datetimetz( exp_val_for_scalar = fill_value if dtype.tz == fill_dtype.tz: expected_dtype = dtype - exp_val_for_array = NaT else: expected_dtype = np.dtype(object) - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) @pytest.mark.parametrize("fill_value", [None, np.nan, NaT]) -# override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize("box", [(False, None)]) -def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value, box): +def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value): dtype = DatetimeTZDtype(tz=tz_aware_fixture) - boxed, box_dtype = box # read from parametrized fixture expected_dtype = dtype exp_val_for_scalar = NaT - exp_val_for_array = NaT - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) @pytest.mark.parametrize( @@ -755,43 +460,23 @@ def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value, box): ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], ) def test_maybe_promote_any_numpy_dtype_with_datetimetz( - any_numpy_dtype_reduced, tz_aware_fixture, fill_value, box + any_numpy_dtype_reduced, tz_aware_fixture, fill_value ): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) - boxed, box_dtype = box # read from parametrized fixture fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] # filling any numpy dtype with datetimetz casts to object expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -def test_maybe_promote_timedelta64_with_any( - timedelta64_dtype, any_numpy_dtype_reduced, box -): +def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, any_numpy_dtype_reduced): dtype = np.dtype(timedelta64_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture - - if is_timedelta64_dtype(fill_dtype): - if box_dtype == object: - pytest.xfail("falsely upcasts to object") - else: - if boxed and box_dtype is None: - pytest.xfail("does not upcast to object") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -801,21 +486,11 @@ def test_maybe_promote_timedelta64_with_any( expected_dtype = dtype # for timedelta dtypes, scalar values get cast to pd.Timedelta.value exp_val_for_scalar = pd.Timedelta(fill_value).to_timedelta64() - exp_val_for_array = np.timedelta64("NaT", "ns") else: expected_dtype = 
np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) @pytest.mark.parametrize( @@ -823,62 +498,26 @@ def test_maybe_promote_timedelta64_with_any( [pd.Timedelta(days=1), np.timedelta64(24, "h"), datetime.timedelta(1)], ids=["pd.Timedelta", "np.timedelta64", "datetime.timedelta"], ) -# override parametrization of box to add special case for td_dtype -@pytest.mark.parametrize( - "box", - [ - (True, None), # fill_value wrapped in array with default dtype - # disabled due to too many xfails; see GH 23982 / 25425 - # (True, 'td_dtype'), # fill_value in array with explicit timedelta dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None), # fill_value passed on as scalar - ], -) def test_maybe_promote_any_with_timedelta64( - any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box + any_numpy_dtype_reduced, timedelta64_dtype, fill_value ): dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture - - if is_timedelta64_dtype(dtype): - if boxed and ( - box_dtype == object - or (box_dtype is None and not is_timedelta64_dtype(type(fill_value))) - ): - pytest.xfail("falsely upcasts to object") - else: - if boxed and box_dtype is None and is_timedelta64_dtype(type(fill_value)): - pytest.xfail("does not upcast correctly") - - # special case for box_dtype - box_dtype = np.dtype(timedelta64_dtype) if box_dtype == "td_dtype" else box_dtype # filling anything but timedelta with timedelta casts to object if is_timedelta64_dtype(dtype): expected_dtype = dtype # for timedelta dtypes, scalar values get cast to pd.Timedelta.value exp_val_for_scalar = pd.Timedelta(fill_value).to_timedelta64() - exp_val_for_array = np.timedelta64("NaT", "ns") else: expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced, box): +def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced): dtype = np.dtype(string_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -886,61 +525,26 @@ def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced, bo # filling string with anything casts to object expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -# override parametrization of box to add special case for str -@pytest.mark.parametrize( - "box", - [ - # disabled due to too many xfails; see GH 23982 / 25425 - # (True, None), # fill_value wrapped in array with default dtype - # (True, 'str'), # fill_value wrapped in array with generic string-dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None), # fill_value passed on as scalar - ], -) -def 
test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype, box): +def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype): dtype = np.dtype(any_numpy_dtype_reduced) - fill_dtype = np.dtype(string_dtype) - boxed, box_dtype = box # read from parametrized fixture # create array of given dtype fill_value = "abc" - # special case for box_dtype (cannot use fixture in parametrization) - box_dtype = fill_dtype if box_dtype == "str" else box_dtype - # filling anything with a string casts to object expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -def test_maybe_promote_object_with_any(object_dtype, any_numpy_dtype_reduced, box): +def test_maybe_promote_object_with_any(object_dtype, any_numpy_dtype_reduced): dtype = np.dtype(object_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -948,22 +552,12 @@ def test_maybe_promote_object_with_any(object_dtype, any_numpy_dtype_reduced, bo # filling object with anything stays object expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype, box): +def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype): dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture # create array of object dtype from a scalar value (i.e. passing # dtypes.common.is_scalar), which can however not be cast to int/float etc. 
@@ -972,27 +566,13 @@ def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype, bo # filling object with anything stays object expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value - exp_val_for_array = np.nan - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) @pytest.mark.parametrize("fill_value", [None, np.nan, NaT]) -# override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize("box", [(False, None)]) -def test_maybe_promote_any_numpy_dtype_with_na( - any_numpy_dtype_reduced, fill_value, box -): +def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, fill_value): dtype = np.dtype(any_numpy_dtype_reduced) - boxed, box_dtype = box # read from parametrized fixture if is_integer_dtype(dtype) and fill_value is not NaT: # integer + other missing value (np.nan / None) casts to float @@ -1020,24 +600,7 @@ def test_maybe_promote_any_numpy_dtype_with_na( expected_dtype = np.dtype(object) exp_val_for_scalar = np.nan - # array case has same expected_dtype; but returns corresponding na-marker - if is_integer_dtype(expected_dtype): - # integers cannot hold NaNs; maybe_promote_with_array returns None - exp_val_for_array = None - elif is_datetime_or_timedelta_dtype(expected_dtype): - exp_val_for_array = expected_dtype.type("NaT", "ns") - else: # expected_dtype = float / complex / object - exp_val_for_array = np.nan - - _check_promote( - dtype, - fill_value, - boxed, - box_dtype, - expected_dtype, - exp_val_for_scalar, - exp_val_for_array, - ) + _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) @pytest.mark.parametrize("dim", [0, 2, 3]) @@ -1051,12 +614,18 @@ def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): for _ in range(dim): fill_array = np.expand_dims(fill_array, 0) - # test against 1-dimensional case - expected_dtype, expected_missing_value = maybe_promote( - dtype, np.array([1], dtype=dtype) - ) + if dtype != object: + # test against 1-dimensional case + with pytest.raises(ValueError, match="fill_value must be a scalar"): + maybe_promote(dtype, np.array([1], dtype=dtype)) - result_dtype, result_missing_value = maybe_promote(dtype, fill_array) + with pytest.raises(ValueError, match="fill_value must be a scalar"): + maybe_promote(dtype, fill_array) - assert result_dtype == expected_dtype - _assert_match(result_missing_value, expected_missing_value) + else: + expected_dtype, expected_missing_value = maybe_promote( + dtype, np.array([1], dtype=dtype) + ) + result_dtype, result_missing_value = maybe_promote(dtype, fill_array) + assert result_dtype == expected_dtype + _assert_match(result_missing_value, expected_missing_value) diff --git a/pandas/tests/dtypes/cast/test_upcast.py b/pandas/tests/dtypes/cast/test_upcast.py index b22ed0bcd0a11..bb7a7d059c7ee 100644 --- a/pandas/tests/dtypes/cast/test_upcast.py +++ b/pandas/tests/dtypes/cast/test_upcast.py @@ -4,12 +4,12 @@ from pandas.core.dtypes.cast import maybe_upcast_putmask from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("result", [Series([10, 11, 12]), [10, 11, 12], (10, 11, 12)]) def test_upcast_error(result): - # GH23823 + # GH23823 require result arg to be ndarray mask = np.array([False, True, False]) other = np.array([61, 62, 63]) with pytest.raises(ValueError): @@ -17,76 +17,55 @@ def test_upcast_error(result): 
@pytest.mark.parametrize( - "arr, other, exp_changed, expected", + "arr, other", [ - (np.arange(1, 6), np.array([61, 62, 63]), False, np.array([1, 61, 3, 62, 63])), + (np.arange(1, 6), np.array([61, 62, 63])), + (np.arange(1, 6), np.array([61.1, 62.2, 63.3])), + (np.arange(10, 15), np.array([61, 62])), + (np.arange(10, 15), np.array([61, np.nan])), ( - np.arange(1, 6), - np.array([61.1, 62.2, 63.3]), - True, - np.array([1, 61.1, 3, 62.2, 63.3]), + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.arange("2018-01-01", "2018-01-04", dtype="datetime64[D]"), ), - (np.arange(1, 6), np.nan, True, np.array([1, np.nan, 3, np.nan, np.nan])), - (np.arange(10, 15), np.array([61, 62]), False, np.array([10, 61, 12, 62, 61])), ( - np.arange(10, 15), - np.array([61, np.nan]), - True, - np.array([10, 61, 12, np.nan, 61]), + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.arange("2018-01-01", "2018-01-03", dtype="datetime64[D]"), ), ], ) -def test_upcast(arr, other, exp_changed, expected): +def test_upcast_scalar_other(arr, other): + # for now we do not support non-scalar `other` + mask = np.array([False, True, False, True, True]) + with pytest.raises(ValueError, match="other must be a scalar"): + maybe_upcast_putmask(arr, mask, other) + + +def test_upcast(): # GH23823 + arr = np.arange(1, 6) mask = np.array([False, True, False, True, True]) - result, changed = maybe_upcast_putmask(arr, mask, other) + result, changed = maybe_upcast_putmask(arr, mask, other=np.nan) - assert changed == exp_changed + expected = np.array([1, np.nan, 3, np.nan, np.nan]) + assert changed tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize( - "arr, other, exp_changed, expected", - [ - ( - np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), - np.arange("2018-01-01", "2018-01-04", dtype="datetime64[D]"), - False, - np.array( - ["2019-01-01", "2018-01-01", "2019-01-03", "2018-01-02", "2018-01-03"], - dtype="datetime64[D]", - ), - ), - ( - np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), - np.nan, - False, - np.array( - [ - "2019-01-01", - np.datetime64("NaT"), - "2019-01-03", - np.datetime64("NaT"), - np.datetime64("NaT"), - ], - dtype="datetime64[D]", - ), - ), - ( - np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), - np.arange("2018-01-01", "2018-01-03", dtype="datetime64[D]"), - False, - np.array( - ["2019-01-01", "2018-01-01", "2019-01-03", "2018-01-02", "2018-01-01"], - dtype="datetime64[D]", - ), - ), - ], -) -def test_upcast_datetime(arr, other, exp_changed, expected): +def test_upcast_datetime(): # GH23823 + arr = np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]") mask = np.array([False, True, False, True, True]) - result, changed = maybe_upcast_putmask(arr, mask, other) + result, changed = maybe_upcast_putmask(arr, mask, other=np.nan) - assert changed == exp_changed + expected = np.array( + [ + "2019-01-01", + np.datetime64("NaT"), + "2019-01-03", + np.datetime64("NaT"), + np.datetime64("NaT"), + ], + dtype="datetime64[D]", + ) + assert not changed tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 466b724f98770..ce925891f62c0 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,8 +1,12 @@ +from datetime import datetime +from typing import List + import numpy as np import pytest import pandas.util._test_decorators as td +from pandas.core.dtypes.cast import astype_nansafe import 
pandas.core.dtypes.common as com from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -11,8 +15,11 @@ IntervalDtype, PeriodDtype, ) +from pandas.core.dtypes.missing import isna import pandas as pd +import pandas._testing as tm +from pandas.arrays import SparseArray from pandas.conftest import ( ALL_EA_INT_DTYPES, ALL_INT_DTYPES, @@ -21,7 +28,6 @@ UNSIGNED_EA_INT_DTYPES, UNSIGNED_INT_DTYPES, ) -import pandas.util.testing as tm # EA & Actual Dtypes @@ -177,7 +183,7 @@ def test_is_object(): "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] ) def test_is_sparse(check_scipy): - assert com.is_sparse(pd.SparseArray([1, 2, 3])) + assert com.is_sparse(SparseArray([1, 2, 3])) assert not com.is_sparse(np.array([1, 2, 3])) @@ -193,7 +199,7 @@ def test_is_scipy_sparse(): assert com.is_scipy_sparse(bsr_matrix([1, 2, 3])) - assert not com.is_scipy_sparse(pd.SparseArray([1, 2, 3])) + assert not com.is_scipy_sparse(SparseArray([1, 2, 3])) def test_is_categorical(): @@ -205,25 +211,6 @@ def test_is_categorical(): assert not com.is_categorical([1, 2, 3]) -def test_is_datetimetz(): - with tm.assert_produces_warning(FutureWarning): - assert not com.is_datetimetz([1, 2, 3]) - assert not com.is_datetimetz(pd.DatetimeIndex([1, 2, 3])) - - assert com.is_datetimetz(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern")) - - dtype = DatetimeTZDtype("ns", tz="US/Eastern") - s = pd.Series([], dtype=dtype) - assert com.is_datetimetz(s) - - -def test_is_period_deprecated(): - with tm.assert_produces_warning(FutureWarning): - assert not com.is_period([1, 2, 3]) - assert not com.is_period(pd.Index([1, 2, 3])) - assert com.is_period(pd.PeriodIndex(["2017-01-01"], freq="D")) - - def test_is_datetime64_dtype(): assert not com.is_datetime64_dtype(object) assert not com.is_datetime64_dtype([1, 2, 3]) @@ -307,24 +294,13 @@ def test_is_datetime_arraylike(): assert com.is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3])) -def test_is_datetimelike(): - assert not com.is_datetimelike([1, 2, 3]) - assert not com.is_datetimelike(pd.Index([1, 2, 3])) - - assert com.is_datetimelike(pd.DatetimeIndex([1, 2, 3])) - assert com.is_datetimelike(pd.PeriodIndex([], freq="A")) - assert com.is_datetimelike(np.array([], dtype=np.datetime64)) - assert com.is_datetimelike(pd.Series([], dtype="timedelta64[ns]")) - assert com.is_datetimelike(pd.DatetimeIndex(["2000"], tz="US/Eastern")) - - dtype = DatetimeTZDtype("ns", tz="US/Eastern") - s = pd.Series([], dtype=dtype) - assert com.is_datetimelike(s) +integer_dtypes: List = [] @pytest.mark.parametrize( "dtype", - [pd.Series([1, 2])] + integer_dtypes + + [pd.Series([1, 2])] + ALL_INT_DTYPES + to_numpy_dtypes(ALL_INT_DTYPES) + ALL_EA_INT_DTYPES @@ -350,9 +326,13 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) +signed_integer_dtypes: List = [] + + @pytest.mark.parametrize( "dtype", - [pd.Series([1, 2])] + signed_integer_dtypes + + [pd.Series([1, 2])] + SIGNED_INT_DTYPES + to_numpy_dtypes(SIGNED_INT_DTYPES) + SIGNED_EA_INT_DTYPES @@ -382,9 +362,13 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) +unsigned_integer_dtypes: List = [] + + @pytest.mark.parametrize( "dtype", - [pd.Series([1, 2], dtype=np.uint32)] + unsigned_integer_dtypes + + [pd.Series([1, 2], dtype=np.uint32)] + UNSIGNED_INT_DTYPES + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + UNSIGNED_EA_INT_DTYPES @@ -508,7 +492,7 @@ def test_is_numeric_v_string_like(): def test_is_datetimelike_v_numeric(): - dt = np.datetime64(pd.datetime(2017, 1, 1)) + dt = 
np.datetime64(datetime(2017, 1, 1)) assert not com.is_datetimelike_v_numeric(1, 1) assert not com.is_datetimelike_v_numeric(dt, dt) @@ -577,7 +561,11 @@ def test_is_bool_dtype(): assert com.is_bool_dtype(np.array([True, False])) assert com.is_bool_dtype(pd.Index([True, False])) + assert com.is_bool_dtype(pd.BooleanDtype()) + assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean")) + +@pytest.mark.filterwarnings("ignore:'is_extension_type' is deprecated:FutureWarning") @pytest.mark.parametrize( "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] ) @@ -589,7 +577,7 @@ def test_is_extension_type(check_scipy): cat = pd.Categorical([1, 2, 3]) assert com.is_extension_type(cat) assert com.is_extension_type(pd.Series(cat)) - assert com.is_extension_type(pd.SparseArray([1, 2, 3])) + assert com.is_extension_type(SparseArray([1, 2, 3])) assert com.is_extension_type(pd.DatetimeIndex(["2000"], tz="US/Eastern")) dtype = DatetimeTZDtype("ns", tz="US/Eastern") @@ -602,6 +590,35 @@ def test_is_extension_type(check_scipy): assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3])) +def test_is_extension_type_deprecation(): + with tm.assert_produces_warning(FutureWarning): + com.is_extension_type([1, 2, 3]) + + +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] +) +def test_is_extension_array_dtype(check_scipy): + assert not com.is_extension_array_dtype([1, 2, 3]) + assert not com.is_extension_array_dtype(np.array([1, 2, 3])) + assert not com.is_extension_array_dtype(pd.DatetimeIndex([1, 2, 3])) + + cat = pd.Categorical([1, 2, 3]) + assert com.is_extension_array_dtype(cat) + assert com.is_extension_array_dtype(pd.Series(cat)) + assert com.is_extension_array_dtype(SparseArray([1, 2, 3])) + assert com.is_extension_array_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) + + dtype = DatetimeTZDtype("ns", tz="US/Eastern") + s = pd.Series([], dtype=dtype) + assert com.is_extension_array_dtype(s) + + if check_scipy: + import scipy.sparse + + assert not com.is_extension_array_dtype(scipy.sparse.bsr_matrix([1, 2, 3])) + + def test_is_complex_dtype(): assert not com.is_complex_dtype(int) assert not com.is_complex_dtype(str) @@ -612,18 +629,6 @@ def test_is_complex_dtype(): assert com.is_complex_dtype(np.array([1 + 1j, 5])) -def test_is_offsetlike(): - assert com.is_offsetlike(np.array([pd.DateOffset(month=3), pd.offsets.Nano()])) - assert com.is_offsetlike(pd.offsets.MonthEnd()) - assert com.is_offsetlike(pd.Index([pd.DateOffset(second=1)])) - - assert not com.is_offsetlike(pd.Timedelta(1)) - assert not com.is_offsetlike(np.array([1 + 1j, 5])) - - # mixed case - assert not com.is_offsetlike(np.array([pd.DateOffset(), pd.Timestamp(0)])) - - @pytest.mark.parametrize( "input_param,result", [ @@ -708,3 +713,42 @@ def test__get_dtype_fails(input_param, expected_error_message): ) def test__is_dtype_type(input_param, result): assert com._is_dtype_type(input_param, lambda tipo: tipo == result) + + +@pytest.mark.parametrize("val", [np.datetime64("NaT"), np.timedelta64("NaT")]) +@pytest.mark.parametrize("typ", [np.int64]) +def test_astype_nansafe(val, typ): + arr = np.array([val]) + + msg = "Cannot convert NaT values to integer" + with pytest.raises(ValueError, match=msg): + astype_nansafe(arr, dtype=typ) + + +@pytest.mark.parametrize("from_type", [np.datetime64, np.timedelta64]) +@pytest.mark.parametrize( + "to_type", + [ + np.uint8, + np.uint16, + np.uint32, + np.int8, + np.int16, + np.int32, + np.float16, + np.float32, + ], +) +def 
test_astype_datetime64_bad_dtype_raises(from_type, to_type): + arr = np.array([from_type("2018")]) + + with pytest.raises(TypeError, match="cannot astype"): + astype_nansafe(arr, dtype=to_type) + + +@pytest.mark.parametrize("from_type", [np.datetime64, np.timedelta64]) +def test_astype_object_preserves_datetime_na(from_type): + arr = np.array([from_type("NaT")]) + result = astype_nansafe(arr, dtype="object") + + assert isna(result)[0] diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 0ca2f7c976535..02daa185b1cdb 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -2,7 +2,7 @@ import pandas.core.dtypes.concat as _concat -from pandas import DatetimeIndex, Index, Period, PeriodIndex, Series, TimedeltaIndex +from pandas import DatetimeIndex, Period, PeriodIndex, Series, TimedeltaIndex @pytest.mark.parametrize( @@ -40,9 +40,8 @@ ), ], ) -@pytest.mark.parametrize("klass", [Index, Series]) -def test_get_dtype_kinds(klass, to_concat, expected): - to_concat_klass = [klass(c) for c in to_concat] +def test_get_dtype_kinds(index_or_series, to_concat, expected): + to_concat_klass = [index_or_series(c) for c in to_concat] result = _concat.get_dtype_kinds(to_concat_klass) assert result == set(expected) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index f4bf4c1fc83d9..fddd6239df309 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -12,10 +12,8 @@ is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetimetz, is_dtype_equal, is_interval_dtype, - is_period, is_period_dtype, is_string_dtype, ) @@ -24,14 +22,13 @@ DatetimeTZDtype, IntervalDtype, PeriodDtype, - ordered_sentinel, registry, ) import pandas as pd from pandas import Categorical, CategoricalIndex, IntervalIndex, Series, date_range -from pandas.core.arrays.sparse import SparseDtype -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype class Base: @@ -67,8 +64,7 @@ def test_pickle(self): class TestCategoricalDtype(Base): def create(self): - # TODO(GH 26403): Remove when default ordered becomes False - return CategoricalDtype(ordered=None) + return CategoricalDtype() def test_pickle(self): # make sure our cache is NOT pickled @@ -189,7 +185,7 @@ def create(self): def test_alias_to_unit_raises(self): # 23990 - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(ValueError, match="Passing a dtype alias"): DatetimeTZDtype("datetime64[ns, US/Central]") def test_alias_to_unit_bad_alias_raises(self): @@ -240,7 +236,7 @@ def test_compat(self): def test_construction_from_string(self): result = DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]") assert is_dtype_equal(self.dtype, result) - msg = "Could not construct DatetimeTZDtype from 'foo'" + msg = "Cannot construct a 'DatetimeTZDtype' from 'foo'" with pytest.raises(TypeError, match=msg): DatetimeTZDtype.construct_from_string("foo") @@ -248,7 +244,7 @@ def test_construct_from_string_raises(self): with pytest.raises(TypeError, match="notatz"): DatetimeTZDtype.construct_from_string("datetime64[ns, notatz]") - msg = "^Could not construct DatetimeTZDtype" + msg = "^Cannot construct a 'DatetimeTZDtype'" with pytest.raises(TypeError, match=msg): # list instead of string DatetimeTZDtype.construct_from_string(["datetime64[ns, notatz]"]) @@ -294,32 +290,22 @@ def test_basic(self): assert not 
is_datetime64tz_dtype(np.dtype("float64")) assert not is_datetime64tz_dtype(1.0) - with tm.assert_produces_warning(FutureWarning): - assert is_datetimetz(s) - assert is_datetimetz(s.dtype) - assert not is_datetimetz(np.dtype("float64")) - assert not is_datetimetz(1.0) - def test_dst(self): dr1 = date_range("2013-01-01", periods=3, tz="US/Eastern") s1 = Series(dr1, name="A") assert is_datetime64tz_dtype(s1) - with tm.assert_produces_warning(FutureWarning): - assert is_datetimetz(s1) dr2 = date_range("2013-08-01", periods=3, tz="US/Eastern") s2 = Series(dr2, name="A") assert is_datetime64tz_dtype(s2) - with tm.assert_produces_warning(FutureWarning): - assert is_datetimetz(s2) assert s1.dtype == s2.dtype @pytest.mark.parametrize("tz", ["UTC", "US/Eastern"]) @pytest.mark.parametrize("constructor", ["M8", "datetime64"]) def test_parser(self, tz, constructor): # pr #11245 - dtz_str = "{con}[ns, {tz}]".format(con=constructor, tz=tz) + dtz_str = f"{constructor}[ns, {tz}]" result = DatetimeTZDtype.construct_from_string(dtz_str) expected = DatetimeTZDtype("ns", tz) assert result == expected @@ -422,6 +408,9 @@ def test_construction_from_string(self): with pytest.raises(TypeError): PeriodDtype.construct_from_string("datetime64[ns, US/Eastern]") + with pytest.raises(TypeError, match="list"): + PeriodDtype.construct_from_string([1, 2, 3]) + def test_is_dtype(self): assert PeriodDtype.is_dtype(self.dtype) assert PeriodDtype.is_dtype("period[D]") @@ -457,22 +446,14 @@ def test_basic(self): assert is_period_dtype(pidx.dtype) assert is_period_dtype(pidx) - with tm.assert_produces_warning(FutureWarning): - assert is_period(pidx) s = Series(pidx, name="A") assert is_period_dtype(s.dtype) assert is_period_dtype(s) - with tm.assert_produces_warning(FutureWarning): - assert is_period(s) assert not is_period_dtype(np.dtype("float64")) assert not is_period_dtype(1.0) - with tm.assert_produces_warning(FutureWarning): - assert not is_period(np.dtype("float64")) - with tm.assert_produces_warning(FutureWarning): - assert not is_period(1.0) def test_empty(self): dt = PeriodDtype() @@ -657,7 +638,7 @@ def test_equality_generic(self, subtype): def test_name_repr(self, subtype): # GH 18980 dtype = IntervalDtype(subtype) - expected = "interval[{subtype}]".format(subtype=subtype) + expected = f"interval[{subtype}]" assert str(dtype) == expected assert dtype.name == "interval" @@ -707,6 +688,10 @@ def test_caching(self): tm.round_trip_pickle(dtype) assert len(IntervalDtype._cache) == 0 + def test_not_string(self): + # GH30568: though IntervalDtype has object kind, it cannot be string + assert not is_string_dtype(IntervalDtype()) + class TestCategoricalDtypeParametrized: @pytest.mark.parametrize( @@ -741,8 +726,7 @@ def test_unordered_same(self, ordered): def test_categories(self): result = CategoricalDtype(["a", "b", "c"]) tm.assert_index_equal(result.categories, pd.Index(["a", "b", "c"])) - with tm.assert_produces_warning(FutureWarning): - assert result.ordered is None + assert result.ordered is False def test_equal_but_different(self, ordered_fixture): c1 = CategoricalDtype([1, 2, 3]) @@ -867,25 +851,15 @@ def test_categorical_categories(self): @pytest.mark.parametrize( "new_categories", [list("abc"), list("cba"), list("wxyz"), None] ) - @pytest.mark.parametrize("new_ordered", [True, False, None, ordered_sentinel]) + @pytest.mark.parametrize("new_ordered", [True, False, None]) def test_update_dtype(self, ordered_fixture, new_categories, new_ordered): - dtype = CategoricalDtype(list("abc"), ordered_fixture) + 
original_categories = list("abc") + dtype = CategoricalDtype(original_categories, ordered_fixture) new_dtype = CategoricalDtype(new_categories, new_ordered) - expected_categories = new_dtype.categories - if expected_categories is None: - expected_categories = dtype.categories - - expected_ordered = new_ordered - if new_ordered is ordered_sentinel or new_ordered is None: - expected_ordered = dtype.ordered - - # GH 26336 - if new_ordered is ordered_sentinel and ordered_fixture is True: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = dtype.update_dtype(new_dtype) - else: - result = dtype.update_dtype(new_dtype) + result = dtype.update_dtype(new_dtype) + expected_categories = pd.Index(new_categories or original_categories) + expected_ordered = new_ordered if new_ordered is not None else dtype.ordered tm.assert_index_equal(result.categories, expected_categories) assert result.ordered is expected_ordered @@ -905,27 +879,6 @@ def test_update_dtype_errors(self, bad_dtype): with pytest.raises(ValueError, match=msg): dtype.update_dtype(bad_dtype) - @pytest.mark.parametrize("ordered", [ordered_sentinel, None, True, False]) - def test_ordered_none_default_deprecated(self, ordered): - # GH 26403: CDT.ordered only warns if ordered is not explicitly passed - dtype = CategoricalDtype(list("abc"), ordered=ordered) - warning = FutureWarning if ordered is ordered_sentinel else None - with tm.assert_produces_warning(warning): - dtype.ordered - - @pytest.mark.parametrize("ordered", [True, False, None, ordered_sentinel]) - def test_pickle_ordered_from_sentinel(self, ordered): - # GH 27295: can remove test when _ordered_from_sentinel is removed (GH 26403) - dtype = CategoricalDtype(categories=list("abc"), ordered=ordered) - - warning = FutureWarning if ordered is ordered_sentinel else None - with tm.assert_produces_warning(warning, check_stacklevel=False): - dtype_from_pickle = tm.round_trip_pickle(dtype) - - result = dtype_from_pickle._ordered_from_sentinel - expected = ordered is ordered_sentinel - assert result is expected - @pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype] @@ -961,7 +914,7 @@ def test_registry_find(dtype, expected): (pd.Series([1, 2]), False), (np.array([True, False]), True), (pd.Series([True, False]), True), - (pd.SparseArray([True, False]), True), + (SparseArray([True, False]), True), (SparseDtype(bool), True), ], ) @@ -971,7 +924,7 @@ def test_is_bool_dtype(dtype, expected): def test_is_bool_dtype_sparse(): - result = is_bool_dtype(pd.Series(pd.SparseArray([True, False]))) + result = is_bool_dtype(pd.Series(SparseArray([True, False]))) assert result is True diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index c17a8997a9b8f..2c8631ac2d71d 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -5,7 +5,7 @@ from pandas.core.dtypes import generic as gt import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestABCClasses: @@ -17,7 +17,7 @@ class TestABCClasses: categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1]) categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical) df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index) - sparse_array = pd.SparseArray(np.random.randn(10)) + sparse_array = pd.arrays.SparseArray(np.random.randn(10)) datetime_array = pd.core.arrays.DatetimeArray(datetime_index) timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index) diff 
--git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 62fb118f719e3..d022b0e97877a 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -4,6 +4,7 @@ """ import collections +from collections import namedtuple from datetime import date, datetime, time, timedelta from decimal import Decimal from fractions import Fraction @@ -51,7 +52,8 @@ Timestamp, isna, ) -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays import IntegerArray @pytest.fixture(params=[True, False], ids=str) @@ -77,7 +79,7 @@ def coerce(request): ((x for x in [1, 2]), True, "generator"), ((_ for _ in []), True, "generator-empty"), (Series([1]), True, "Series"), - (Series([]), True, "Series-empty"), + (Series([], dtype=object), True, "Series-empty"), (Series(["a"]).str, True, "StringMethods"), (Series([], dtype="O").str, True, "StringMethods-empty"), (Index([1]), True, "Index"), @@ -138,7 +140,7 @@ def __getitem__(self): def test_is_array_like(): - assert inference.is_array_like(Series([])) + assert inference.is_array_like(Series([], dtype=object)) assert inference.is_array_like(Series([1, 2])) assert inference.is_array_like(np.array(["a", "b"])) assert inference.is_array_like(Index(["2016-01-01"])) @@ -164,7 +166,7 @@ class DtypeList(list): {"a": 1}, {1, "a"}, Series([1]), - Series([]), + Series([], dtype=object), Series(["a"]).str, (x for x in range(5)), ], @@ -238,7 +240,7 @@ def __getitem__(self, key): if has_contains: - def __contains__(self, key): + def __contains__(self, key) -> bool: return self.d.__contains__(key) d = DictLike({1: 2}) @@ -448,7 +450,7 @@ def test_scientific_no_exponent(self): def test_convert_non_hashable(self): # GH13324 # make sure that we are handing non-hashables - arr = np.array([[10.0, 2], 1.0, "apple"]) + arr = np.array([[10.0, 2], 1.0, "apple"], dtype=object) result = lib.maybe_convert_numeric(arr, set(), False, True) tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) @@ -505,7 +507,7 @@ def test_convert_numeric_int64_uint64(self, case, coerce): result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("value", [-2 ** 63 - 1, 2 ** 64]) + @pytest.mark.parametrize("value", [-(2 ** 63) - 1, 2 ** 64]) def test_convert_int_overflow(self, value): # see gh-18584 arr = np.array([value], dtype=object) @@ -552,6 +554,20 @@ def test_maybe_convert_objects_datetime(self): out = lib.maybe_convert_objects(arr, convert_datetime=1, convert_timedelta=1) tm.assert_numpy_array_equal(out, exp) + @pytest.mark.parametrize( + "exp", + [ + IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])), + IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])), + ], + ) + def test_maybe_convert_objects_nullable_integer(self, exp): + # GH27335 + arr = np.array([2, np.NaN], dtype=object) + result = lib.maybe_convert_objects(arr, convert_to_nullable_integer=1) + + tm.assert_extension_array_equal(result, exp) + def test_mixed_dtypes_remain_object_array(self): # GH14956 array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) @@ -613,13 +629,13 @@ def test_integer_na(self, arr, skipna): expected = "integer" if skipna else "integer-na" assert result == expected - def test_deprecation(self): - # GH 24050 - arr = np.array([1, 2, 3], dtype=object) + def test_infer_dtype_skipna_default(self): + # infer_dtype `skipna` default deprecated in GH#24050, + # changed to True in 
GH#29876 + arr = np.array([1, 2, 3, np.nan], dtype=object) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = lib.infer_dtype(arr) # default: skipna=None -> warn - assert result == "integer" + result = lib.infer_dtype(arr) + assert result == "integer" def test_bools(self): arr = np.array([True, False, True, True, True], dtype="O") @@ -717,12 +733,17 @@ def test_string(self): def test_unicode(self): arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=False) + # This currently returns "mixed", but it's not clear that's optimal. + # This could also return "string" or "mixed-string" assert result == "mixed" arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=True) - expected = "string" - assert result == expected + assert result == "string" + + arr = ["a", "c"] + result = lib.infer_dtype(arr, skipna=False) + assert result == "string" @pytest.mark.parametrize( "dtype, missing, skipna, expected", @@ -1103,18 +1124,13 @@ def test_is_string_array(self): def test_to_object_array_tuples(self): r = (5, 6) values = [r] - result = lib.to_object_array_tuples(values) + lib.to_object_array_tuples(values) - try: - # make sure record array works - from collections import namedtuple - - record = namedtuple("record", "x y") - r = record(5, 6) - values = [r] - result = lib.to_object_array_tuples(values) # noqa - except ImportError: - pass + # make sure record array works + record = namedtuple("record", "x y") + r = record(5, 6) + values = [r] + lib.to_object_array_tuples(values) def test_object(self): @@ -1154,8 +1170,6 @@ def test_is_period(self): def test_categorical(self): # GH 8974 - from pandas import Categorical, Series - arr = Categorical(list("abc")) result = lib.infer_dtype(arr, skipna=True) assert result == "categorical" @@ -1300,7 +1314,7 @@ def test_is_datetime_dtypes(self): assert is_datetime64tz_dtype(tsa) for tz in ["US/Eastern", "UTC"]: - dtype = "datetime64[ns, {}]".format(tz) + dtype = f"datetime64[ns, {tz}]" assert not is_datetime64_dtype(dtype) assert is_datetime64tz_dtype(dtype) assert is_datetime64_ns_dtype(dtype) @@ -1384,7 +1398,7 @@ def test_is_scalar_pandas_scalars(self): assert is_scalar(DateOffset(days=1)) def test_is_scalar_pandas_containers(self): - assert not is_scalar(Series()) + assert not is_scalar(Series(dtype=object)) assert not is_scalar(Series([1])) assert not is_scalar(DataFrame()) assert not is_scalar(DataFrame([[1]])) @@ -1394,7 +1408,7 @@ def test_is_scalar_pandas_containers(self): def test_datetimeindex_from_empty_datetime64_array(): for unit in ["ms", "us", "ns"]: - idx = DatetimeIndex(np.array([], dtype="datetime64[{unit}]".format(unit=unit))) + idx = DatetimeIndex(np.array([], dtype=f"datetime64[{unit}]")) assert len(idx) == 0 diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 89474cf8fa953..7ba59786bb0fa 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -1,6 +1,5 @@ from datetime import datetime from decimal import Decimal -from warnings import catch_warnings, filterwarnings import numpy as np import pytest @@ -23,7 +22,7 @@ import pandas as pd from pandas import DatetimeIndex, Float64Index, NaT, Series, TimedeltaIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm now = pd.Timestamp.now() utcnow = pd.Timestamp.now("UTC") @@ -90,7 +89,8 @@ def test_isna_isnull(self, isna_f): assert not isna_f(-np.inf) # type - assert not isna_f(type(pd.Series())) + assert not isna_f(type(pd.Series(dtype=object))) + 
assert not isna_f(type(pd.Series(dtype=np.float64))) assert not isna_f(type(pd.DataFrame())) # series @@ -294,6 +294,11 @@ def test_array_equivalent(): np.array([np.nan, None], dtype="object"), np.array([np.nan, None], dtype="object"), ) + # Check the handling of nested arrays in array_equivalent_object + assert array_equivalent( + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + ) assert array_equivalent( np.array([np.nan, 1 + 1j], dtype="complex"), np.array([np.nan, 1 + 1j], dtype="complex"), @@ -314,23 +319,21 @@ def test_array_equivalent(): assert not array_equivalent( TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan]) ) - with catch_warnings(): - filterwarnings("ignore", "Converting timezone", FutureWarning) - assert array_equivalent( - DatetimeIndex([0, np.nan], tz="US/Eastern"), - DatetimeIndex([0, np.nan], tz="US/Eastern"), - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz="US/Eastern"), - DatetimeIndex([1, np.nan], tz="US/Eastern"), - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz="CET"), - DatetimeIndex([0, np.nan], tz="US/Eastern"), - ) + assert array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([1, np.nan], tz="US/Eastern"), + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz="CET"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index 6a28f76e474cc..b0e5a6f85feeb 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -33,10 +33,17 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @classmethod def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ return ArrowBoolArray def _is_boolean(self): @@ -56,10 +63,17 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) + raise TypeError(f"Cannot construct a '{cls}' from '{string}'") @classmethod def construct_array_type(cls): + """ + Return the array type associated with this dtype. 
+ + Returns + ------- + type + """ return ArrowStringArray @@ -79,7 +93,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): return cls.from_scalars(scalars) def __repr__(self): - return "{cls}({data})".format(cls=type(self).__name__, data=repr(self._data)) + return f"{type(self).__name__}({repr(self._data)})" def __getitem__(self, item): if pd.api.types.is_scalar(item): diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 9c53210b75d6b..94dd09d3eb053 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -2,10 +2,10 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.extension import base -import pandas.util.testing as tm -pytest.importorskip("pyarrow", minversion="0.10.0") +pytest.importorskip("pyarrow", minversion="0.13.0") from .arrays import ArrowBoolArray, ArrowBoolDtype # isort:skip diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index 06f149aa4b75f..abd5c1f386dc5 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -2,7 +2,7 @@ import pandas as pd -pytest.importorskip("pyarrow", minversion="0.10.0") +pytest.importorskip("pyarrow", minversion="0.13.0") from .arrays import ArrowStringDtype # isort:skip diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py index 2f808d20acd31..144b0825b39a2 100644 --- a/pandas/tests/extension/base/base.py +++ b/pandas/tests/extension/base/base.py @@ -1,4 +1,4 @@ -import pandas.util.testing as tm +import pandas._testing as tm class BaseExtensionTests: diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 7146443bf8de5..58859fc6ac54c 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas as pd from pandas.core.internals import ObjectBlock @@ -21,3 +23,12 @@ def test_astype_str(self, data): result = pd.Series(data[:5]).astype(str) expected = pd.Series(data[:5].astype(str)) self.assert_series_equal(result, expected) + + def test_to_numpy(self, data): + expected = np.asarray(data) + + result = data.to_numpy() + self.assert_equal(result, expected) + + result = pd.Series(data).to_numpy() + self.assert_equal(result, expected) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 7262a85b1fe00..c40646ca2415e 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -64,6 +64,15 @@ def test_from_dtype(self, data): result = pd.Series(list(data), dtype=str(dtype)) self.assert_series_equal(result, expected) + # gh-30280 + + expected = pd.DataFrame(data).astype(dtype) + result = pd.DataFrame(list(data), dtype=dtype) + self.assert_frame_equal(result, expected) + + result = pd.DataFrame(list(data), dtype=str(dtype)) + self.assert_frame_equal(result, expected) + def test_pandas_array(self, data): # pd.array(extension_array) should be idempotent... 
result = pd.array(data) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index a5040c8cfc2fc..b6c12b5844086 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -16,8 +16,7 @@ def test_name(self, dtype): def test_kind(self, dtype): valid = set("biufcmMOSUV") - if dtype.kind is not None: - assert dtype.kind in valid + assert dtype.kind in valid def test_construct_from_string_own_name(self, dtype): result = dtype.construct_from_string(dtype.name) @@ -38,6 +37,9 @@ def test_is_dtype_from_self(self, dtype): result = type(dtype).is_dtype(dtype) assert result is True + def test_is_dtype_other_input(self, dtype): + assert dtype.is_dtype([1, 2, 3]) is False + def test_is_not_string_type(self, dtype): return not pd.api.types.is_string_dtype(dtype) @@ -96,7 +98,10 @@ def test_eq(self, dtype): assert dtype != "anonther_type" def test_construct_from_string(self, dtype): - dtype_instance = dtype.__class__.construct_from_string(dtype.name) - assert isinstance(dtype_instance, dtype.__class__) - with pytest.raises(TypeError): - dtype.__class__.construct_from_string("another_type") + dtype_instance = type(dtype).construct_from_string(dtype.name) + assert isinstance(dtype_instance, type(dtype)) + + def test_construct_from_string_another_type_raises(self, dtype): + msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'" + with pytest.raises(TypeError, match=msg): + type(dtype).construct_from_string("another_type") diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index d56cc50f4739c..dc1f62c4c97c5 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -121,6 +121,45 @@ def test_getitem_mask(self, data): assert len(result) == 1 assert result.dtype == data.dtype + def test_getitem_mask_raises(self, data): + mask = np.array([True, False]) + with pytest.raises(IndexError): + data[mask] + + mask = pd.array(mask, dtype="boolean") + with pytest.raises(IndexError): + data[mask] + + def test_getitem_boolean_array_mask(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + mask[:5] = True + expected = data.take([0, 1, 2, 3, 4]) + result = data[mask] + self.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[mask] + self.assert_series_equal(result, expected) + + def test_getitem_boolean_array_mask_raises(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + mask[:2] = pd.NA + with pytest.raises(ValueError): + data[mask] + + s = pd.Series(data) + + with pytest.raises(ValueError): + s[mask] + def test_getitem_slice(self, data): # getitem[slice] should return an array result = data[slice(0)] # empty @@ -266,3 +305,16 @@ def test_loc_len1(self, data): df = pd.DataFrame({"A": data}) res = df.loc[[0], "A"] assert res._data._block.ndim == 1 + + def test_item(self, data): + # https://github.com/pandas-dev/pandas/pull/30175 + s = pd.Series(data) + result = s[:1].item() + assert result == data[0] + + msg = "can only convert an array of size 1 to a Python scalar" + with pytest.raises(ValueError, match=msg): + s[:0].item() + + with pytest.raises(ValueError, match=msg): + s.item() diff --git a/pandas/tests/extension/base/groupby.py 
b/pandas/tests/extension/base/groupby.py index dc926d2ff6ab4..94d0ef7bbea84 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index a29f6deeffae6..cdea96334be2a 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -4,7 +4,7 @@ from pandas.core.dtypes.dtypes import ExtensionDtype import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 6d47b0c1d1f77..1e427c6319cab 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -2,8 +2,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.sorting import nargsort -import pandas.util.testing as tm from .base import BaseExtensionTests @@ -113,29 +113,29 @@ def test_unique(self, data, box, method): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): - labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) - expected_labels = np.array( + codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + expected_codes = np.array( [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp ) expected_uniques = data_for_grouping.take([0, 4, 7]) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) self.assert_extension_array_equal(uniques, expected_uniques) @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize_equivalence(self, data_for_grouping, na_sentinel): - l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) - l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel) + codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) - tm.assert_numpy_array_equal(l1, l2) - self.assert_extension_array_equal(u1, u2) + tm.assert_numpy_array_equal(codes_1, codes_2) + self.assert_extension_array_equal(uniques_1, uniques_2) def test_factorize_empty(self, data): - labels, uniques = pd.factorize(data[:0]) - expected_labels = np.array([], dtype=np.intp) + codes, uniques = pd.factorize(data[:0]) + expected_codes = np.array([], dtype=np.intp) expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) self.assert_extension_array_equal(uniques, expected_uniques) def test_fillna_copy_frame(self, data_missing): diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 21bbb365ab0f3..2393d2edcd2c6 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index e968962caf0b7..20d06ef2e5647 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -62,10 +62,10 @@ class 
BaseArithmeticOpsTests(BaseOpsUtil): * divmod_exc = TypeError """ - series_scalar_exc = TypeError # type: Optional[Type[TypeError]] - frame_scalar_exc = TypeError # type: Optional[Type[TypeError]] - series_array_exc = TypeError # type: Optional[Type[TypeError]] - divmod_exc = TypeError # type: Optional[Type[TypeError]] + series_scalar_exc: Optional[Type[TypeError]] = TypeError + frame_scalar_exc: Optional[Type[TypeError]] = TypeError + series_array_exc: Optional[Type[TypeError]] = TypeError + divmod_exc: Optional[Type[TypeError]] = TypeError def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar @@ -123,9 +123,7 @@ def test_direct_arith_with_series_returns_not_implemented(self, data): result = data.__add__(other) assert result is NotImplemented else: - raise pytest.skip( - "{} does not implement add".format(data.__class__.__name__) - ) + raise pytest.skip(f"{type(data).__name__} does not implement add") class BaseComparisonOpsTests(BaseOpsUtil): @@ -169,6 +167,4 @@ def test_direct_arith_with_series_returns_not_implemented(self, data): result = data.__eq__(other) assert result is NotImplemented else: - raise pytest.skip( - "{} does not implement __eq__".format(data.__class__.__name__) - ) + raise pytest.skip(f"{type(data).__name__} does not implement __eq__") diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index 0f10efbf32a49..ad34a83c7cf71 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -18,8 +18,8 @@ def test_array_repr(self, data, size): data = type(data)._concat_same_type([data] * 5) result = repr(data) - assert data.__class__.__name__ in result - assert "Length: {}".format(len(data)) in result + assert type(data).__name__ in result + assert f"Length: {len(data)}" in result assert str(data.dtype) in result if size == "big": assert "..." 
in result diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 8766bb771f8a2..6f433d659575a 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 90e607343297d..ec21898852888 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -94,6 +94,19 @@ def test_concat_columns(self, data, na_value): result = pd.concat([df1["A"], df2["B"]], axis=1) self.assert_frame_equal(result, expected) + def test_concat_extension_arrays_copy_false(self, data, na_value): + # GH 20756 + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"B": data[3:7]}) + expected = pd.DataFrame( + { + "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype), + "B": data[3:7], + } + ) + result = pd.concat([df1, df2], axis=1, copy=False) + self.assert_frame_equal(result, expected) + def test_align(self, data, na_value): a = data[:3] b = data[2:5] @@ -295,3 +308,19 @@ def test_ravel(self, data): # Check that we have a view, not a copy result[0] = result[1] assert data[0] == data[1] + + def test_transpose(self, data): + df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"]) + result = df.T + expected = pd.DataFrame( + { + "a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype), + "b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype), + "c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype), + "d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype), + }, + index=["A", "B"], + ) + self.assert_frame_equal(result, expected) + self.assert_frame_equal(np.transpose(np.transpose(df)), df) + self.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]]) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index bb6bb02b462e2..0bb8aede6298c 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -186,3 +186,12 @@ def test_setitem_scalar_key_sequence_raise(self, data): arr = data[:5].copy() with pytest.raises(ValueError): arr[0] = arr[[0, 1]] + + def test_setitem_preserves_views(self, data): + # GH#28150 setitem shouldn't swap the underlying data + view1 = data.view() + view2 = data[:] + + data[0] = data[1] + assert view1[0] == data[1] + assert view2[0] == data[1] diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index a1988744d76a1..85bd5f7a33fe1 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -8,7 +8,7 @@ from pandas.core.dtypes.base import ExtensionDtype import pandas as pd -from pandas.api.extensions import register_extension_dtype +from pandas.api.extensions import no_default, register_extension_dtype from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin @@ -22,12 +22,13 @@ class DecimalDtype(ExtensionDtype): def __init__(self, context=None): self.context = context or decimal.getcontext() - def __repr__(self): - return "DecimalDtype(context={})".format(self.context) + def __repr__(self) -> str: + return f"DecimalDtype(context={self.context})" @classmethod def construct_array_type(cls): - """Return the array type associated with this dtype + """ + Return the array type 
associated with this dtype. Returns ------- @@ -40,7 +41,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @property def _is_numeric(self): @@ -83,6 +84,12 @@ def _from_factorized(cls, values, original): _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) + def to_numpy(self, dtype=None, copy=False, na_value=no_default, decimals=None): + result = np.asarray(self, dtype=dtype) + if decimals is not None: + result = np.asarray([round(x, decimals) for x in result]) + return result + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # if not all( @@ -108,6 +115,15 @@ def __getitem__(self, item): if isinstance(item, numbers.Integral): return self._data[item] else: + # array, slice. + if pd.api.types.is_list_like(item): + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = pd.api.indexers.check_bool_array_indexer(self, item) + elif pd.api.types.is_integer_dtype(dtype): + item = np.asarray(item, dtype="int") return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): @@ -166,14 +182,19 @@ def _concat_same_type(cls, to_concat): def _reduce(self, name, skipna=True, **kwargs): if skipna: - raise NotImplementedError("decimal does not support skipna=True") + # If we don't have any NAs, we can ignore skipna + if self.isna().any(): + other = self[~self.isna()] + return other._reduce(name, **kwargs) + + if name == "sum" and len(self) == 0: + # GH#29630 avoid returning int 0 or np.bool_(False) on old numpy + return decimal.Decimal(0) try: op = getattr(self.data, name) except AttributeError: - raise NotImplementedError( - "decimal does not support the {} operation".format(name) - ) + raise NotImplementedError(f"decimal does not support the {name} operation") return op(axis=0) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 86724d4d09819..de7c98ab96571 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -6,8 +6,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.extension import base -import pandas.util.testing as tm from .array import DecimalArray, DecimalDtype, make_data, to_decimal @@ -145,7 +145,7 @@ class TestMissing(BaseDecimal, base.BaseMissingTests): class Reduce: def check_reduce(self, s, op_name, skipna): - if skipna or op_name in ["median", "skew", "kurt"]: + if op_name in ["median", "skew", "kurt"]: with pytest.raises(NotImplementedError): getattr(s, op_name)(skipna=skipna) @@ -478,3 +478,38 @@ def DecimalArray__my_sum(self): s = pd.Series(DecimalArray(data)) result = s.groupby(np.array([0, 0, 0, 1, 1])).agg(lambda x: x.values.my_sum()) tm.assert_series_equal(result, expected, check_names=False) + + +def test_indexing_no_materialize(monkeypatch): + # See https://github.com/pandas-dev/pandas/issues/29708 + # Ensure that indexing operations do not materialize (convert to a numpy + # array) the ExtensionArray unnecessary + + def DecimalArray__array__(self, dtype=None): + raise Exception("tried to convert a DecimalArray to a numpy array") + + monkeypatch.setattr(DecimalArray, "__array__", DecimalArray__array__, raising=False) + + data = make_data() + s = pd.Series(DecimalArray(data)) + df = pd.DataFrame({"a": s, "b": 
range(len(s))}) + + # ensure the following operations do not raise an error + s[s > 0.5] + df[s > 0.5] + s.at[0] + df.at[0, "a"] + + +def test_to_numpy_keyword(): + # test the extra keyword + values = [decimal.Decimal("1.1111"), decimal.Decimal("2.2222")] + expected = np.array( + [decimal.Decimal("1.11"), decimal.Decimal("2.22")], dtype="object" + ) + a = pd.array(values, dtype="decimal") + result = a.to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(a).to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index b64ddbd6ac84d..17bc2773aad19 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -19,9 +19,8 @@ import numpy as np -from pandas.core.dtypes.base import ExtensionDtype - -from pandas.core.arrays import ExtensionArray +import pandas as pd +from pandas.api.extensions import ExtensionArray, ExtensionDtype class JSONDtype(ExtensionDtype): @@ -31,7 +30,8 @@ class JSONDtype(ExtensionDtype): @classmethod def construct_array_type(cls): - """Return the array type associated with this dtype + """ + Return the array type associated with this dtype. Returns ------- @@ -44,7 +44,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") class JSONArray(ExtensionArray): @@ -75,17 +75,21 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] - elif isinstance(item, np.ndarray) and item.dtype == "bool": - return self._from_sequence([x for x, m in zip(self, item) if m]) - elif isinstance(item, abc.Iterable): - # fancy indexing - return type(self)([self.data[i] for i in item]) elif isinstance(item, slice) and item == slice(None): # Make sure we get a view return type(self)(self.data) - else: + elif isinstance(item, slice): # slice return type(self)(self.data[item]) + else: + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = pd.api.indexers.check_bool_array_indexer(self, item) + return self._from_sequence([x for x, m in zip(self, item) if m]) + # integer + return type(self)([self.data[i] for i in item]) def __setitem__(self, key, value): if isinstance(key, numbers.Integral): @@ -182,7 +186,7 @@ def _values_for_factorize(self): def _values_for_argsort(self): # Disable NumPy's shape inference by including an empty tuple... - # If all the elemnts of self are the same size P, NumPy will + # If all the elements of self are the same size P, NumPy will # cast them to an (N, P) array, instead of an (N,) array of tuples. 
frozen = [()] + [tuple(x.items()) for x in self] return np.array(frozen, dtype=object)[1:] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index bc75ec6aeb2df..4d3145109e3c2 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -3,11 +3,9 @@ import pytest -from pandas.compat import PY36 - import pandas as pd +import pandas._testing as tm from pandas.tests.extension import base -import pandas.util.testing as tm from .array import JSONArray, JSONDtype, make_data @@ -95,6 +93,7 @@ def assert_series_equal(self, left, right, **kwargs): tm.assert_series_equal(left, right, **kwargs) def assert_frame_equal(self, left, right, *args, **kwargs): + obj_type = kwargs.get("obj", "DataFrame") tm.assert_index_equal( left.columns, right.columns, @@ -102,7 +101,7 @@ def assert_frame_equal(self, left, right, *args, **kwargs): check_names=kwargs.get("check_names", True), check_exact=kwargs.get("check_exact", False), check_categorical=kwargs.get("check_categorical", True), - obj="{obj}.columns".format(obj=kwargs.get("obj", "DataFrame")), + obj=f"{obj_type}.columns", ) jsons = (left.dtypes == "json").index @@ -164,6 +163,10 @@ def test_unstack(self, data, index): # this matches otherwise return super().test_unstack(data, index) + @pytest.mark.xfail(reason="Inconsistent sizes.") + def test_transpose(self, data): + super().test_transpose(data) + class TestGetitem(BaseJSON, base.BaseGetitemTests): pass @@ -180,9 +183,6 @@ def test_fillna_frame(self): unhashable = pytest.mark.skip(reason="Unhashable") -unstable = pytest.mark.skipif( - not PY36, reason="Dictionary order unstable" # 3.6 or higher -) class TestReduce(base.BaseNoReduceTests): @@ -199,20 +199,16 @@ def test_sort_values_frame(self): # TODO (EA.factorize): see if _values_for_factorize allows this. pass - @unstable def test_argsort(self, data_for_sorting): super().test_argsort(data_for_sorting) - @unstable def test_argsort_missing(self, data_missing_for_sorting): super().test_argsort_missing(data_missing_for_sorting) - @unstable @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending): super().test_sort_values(data_for_sorting, ascending) - @unstable @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_missing(self, data_missing_for_sorting, ascending): super().test_sort_values_missing(data_missing_for_sorting, ascending) @@ -280,7 +276,6 @@ def test_groupby_extension_apply(self): we'll be able to dispatch unique. 
""" - @unstable @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) diff --git a/pandas/tests/extension/list/array.py b/pandas/tests/extension/list/array.py index 0ca9fadb68829..6dd00ad3b06ba 100644 --- a/pandas/tests/extension/list/array.py +++ b/pandas/tests/extension/list/array.py @@ -36,7 +36,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from '{}'".format(cls, string)) + raise TypeError(f"Cannot construct a '{cls}' from '{string}'") class ListArray(ExtensionArray): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py new file mode 100644 index 0000000000000..a7ce0fb097599 --- /dev/null +++ b/pandas/tests/extension/test_boolean.py @@ -0,0 +1,345 @@ +""" +This file contains a minimal set of tests for compliance with the extension +array interface test suite, and should contain no other tests. +The test suite for the full functionality of the array is located in +`pandas/tests/arrays/`. + +The tests in this file are inherited from the BaseExtensionTests, and only +minimal tweaks should be applied to get the tests passing (by overwriting a +parent method). + +Additional tests should either be added to one of the BaseExtensionTests +classes (if they are relevant for the extension interface for all dtypes), or +be added to the array-specific tests in `pandas/tests/arrays/`. + +""" +import numpy as np +import pytest + +from pandas.compat.numpy import _np_version_under1p14 + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays.boolean import BooleanDtype +from pandas.tests.extension import base + + +def make_data(): + return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False] + + +@pytest.fixture +def dtype(): + return BooleanDtype() + + +@pytest.fixture +def data(dtype): + return pd.array(make_data(), dtype=dtype) + + +@pytest.fixture +def data_for_twos(dtype): + return pd.array(np.ones(100), dtype=dtype) + + +@pytest.fixture +def data_missing(dtype): + return pd.array([np.nan, True], dtype=dtype) + + +@pytest.fixture +def data_for_sorting(dtype): + return pd.array([True, True, False], dtype=dtype) + + +@pytest.fixture +def data_missing_for_sorting(dtype): + return pd.array([True, np.nan, False], dtype=dtype) + + +@pytest.fixture +def na_cmp(): + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA + + +@pytest.fixture +def na_value(): + return pd.NA + + +@pytest.fixture +def data_for_grouping(dtype): + b = True + a = False + na = np.nan + return pd.array([b, b, na, na, a, a, b], dtype=dtype) + + +class TestDtype(base.BaseDtypeTests): + pass + + +class TestInterface(base.BaseInterfaceTests): + pass + + +class TestConstructors(base.BaseConstructorsTests): + pass + + +class TestGetitem(base.BaseGetitemTests): + pass + + +class TestSetitem(base.BaseSetitemTests): + pass + + +class TestMissing(base.BaseMissingTests): + pass + + +class TestArithmeticOps(base.BaseArithmeticOpsTests): + def check_opname(self, s, op_name, other, exc=None): + # overwriting to indicate ops don't raise an error + super().check_opname(s, op_name, other, exc=None) + + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + if op_name in ("__sub__", "__rsub__"): + # subtraction for bools raises TypeError (but not yet in 1.13) + if _np_version_under1p14: + pytest.skip("__sub__ does 
not yet raise in numpy 1.13") + with pytest.raises(TypeError): + op(s, other) + + return + + result = op(s, other) + expected = s.combine(other, op) + + if op_name in ( + "__floordiv__", + "__rfloordiv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", + ): + # combine keeps boolean type + expected = expected.astype("Int8") + elif op_name in ("__truediv__", "__rtruediv__"): + # combine with bools does not generate the correct result + # (numpy behaviour for div is to regard the bools as numeric) + expected = s.astype(float).combine(other, op) + if op_name == "__rpow__": + # for rpow, combine does not propagate NaN + expected[result.isna()] = np.nan + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + + def _check_divmod_op(self, s, op, other, exc=None): + # override to not raise an error + super()._check_divmod_op(s, op, other, None) + + @pytest.mark.skip(reason="BooleanArray does not error on ops") + def test_error(self, data, all_arithmetic_operators): + # other specific errors tested in the boolean array specific tests + pass + + +class TestComparisonOps(base.BaseComparisonOpsTests): + def check_opname(self, s, op_name, other, exc=None): + # overwriting to indicate ops don't raise an error + super().check_opname(s, op_name, other, exc=None) + + def _compare_other(self, s, data, op_name, other): + self.check_opname(s, op_name, other) + + @pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py") + def test_compare_scalar(self, data, all_compare_operators): + pass + + @pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py") + def test_compare_array(self, data, all_compare_operators): + pass + + +class TestReshaping(base.BaseReshapingTests): + pass + + +class TestMethods(base.BaseMethodsTests): + @pytest.mark.parametrize("na_sentinel", [-1, -2]) + def test_factorize(self, data_for_grouping, na_sentinel): + # override because we only have 2 unique values + labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + expected_labels = np.array( + [0, 0, na_sentinel, na_sentinel, 1, 1, 0], dtype=np.intp + ) + expected_uniques = data_for_grouping.take([0, 4]) + + tm.assert_numpy_array_equal(labels, expected_labels) + self.assert_extension_array_equal(uniques, expected_uniques) + + def test_combine_le(self, data_repeated): + # override because expected needs to be boolean instead of bool dtype + orig_data1, orig_data2 = data_repeated(2) + s1 = pd.Series(orig_data1) + s2 = pd.Series(orig_data2) + result = s1.combine(s2, lambda x1, x2: x1 <= x2) + expected = pd.Series( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + dtype="boolean", + ) + self.assert_series_equal(result, expected) + + val = s1.iloc[0] + result = s1.combine(val, lambda x1, x2: x1 <= x2) + expected = pd.Series([a <= val for a in list(orig_data1)], dtype="boolean") + self.assert_series_equal(result, expected) + + def test_searchsorted(self, data_for_sorting, as_series): + # override because we only have 2 unique values + data_for_sorting = pd.array([True, False], dtype="boolean") + b, a = data_for_sorting + arr = type(data_for_sorting)._from_sequence([a, b]) + + if as_series: + arr = pd.Series(arr) + assert arr.searchsorted(a) == 0 + assert arr.searchsorted(a, side="right") == 1 + + assert arr.searchsorted(b) == 1 + assert arr.searchsorted(b, side="right") == 2 + + result = arr.searchsorted(arr.take([0, 1])) + expected = np.array([0, 1], dtype=np.intp) + + tm.assert_numpy_array_equal(result, expected) + + # sorter + sorter = 
np.array([1, 0]) + assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) + + +class TestCasting(base.BaseCastingTests): + pass + + +class TestGroupby(base.BaseGroupbyTests): + """ + Groupby-specific tests are overridden because boolean only has 2 + unique values, base tests uses 3 groups. + """ + + def test_grouping_grouper(self, data_for_grouping): + df = pd.DataFrame( + {"A": ["B", "B", None, None, "A", "A", "B"], "B": data_for_grouping} + ) + gr1 = df.groupby("A").grouper.groupings[0] + gr2 = df.groupby("B").grouper.groupings[0] + + tm.assert_numpy_array_equal(gr1.grouper, df.A.values) + tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) + + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("B", as_index=as_index).A.mean() + _, index = pd.factorize(data_for_grouping, sort=True) + + index = pd.Index(index, name="B") + expected = pd.Series([3, 1], index=index, name="A") + if as_index: + self.assert_series_equal(result, expected) + else: + expected = expected.reset_index() + self.assert_frame_equal(result, expected) + + def test_groupby_extension_no_sort(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("B", sort=False).A.mean() + _, index = pd.factorize(data_for_grouping, sort=False) + + index = pd.Index(index, name="B") + expected = pd.Series([1, 3], index=index, name="A") + self.assert_series_equal(result, expected) + + def test_groupby_extension_transform(self, data_for_grouping): + valid = data_for_grouping[~data_for_grouping.isna()] + df = pd.DataFrame({"A": [1, 1, 3, 3, 1], "B": valid}) + + result = df.groupby("B").A.transform(len) + expected = pd.Series([3, 3, 2, 2, 3], name="A") + + self.assert_series_equal(result, expected) + + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + df.groupby("B").apply(groupby_apply_op) + df.groupby("B").A.apply(groupby_apply_op) + df.groupby("A").apply(groupby_apply_op) + df.groupby("A").B.apply(groupby_apply_op) + + def test_groupby_apply_identity(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("A").B.apply(lambda x: x.array) + expected = pd.Series( + [ + df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + ], + index=pd.Index([1, 2, 3], name="A"), + name="B", + ) + self.assert_series_equal(result, expected) + + def test_in_numeric_groupby(self, data_for_grouping): + df = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 1], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1], + } + ) + result = df.groupby("A").sum().columns + + if data_for_grouping.dtype._is_numeric: + expected = pd.Index(["B", "C"]) + else: + expected = pd.Index(["C"]) + + tm.assert_index_equal(result, expected) + + +class TestNumericReduce(base.BaseNumericReduceTests): + def check_reduce(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + # override parent function to cast to bool for min/max + if op_name in ("min", "max") and not pd.isna(expected): + expected = bool(expected) + 
tm.assert_almost_equal(result, expected) + + +class TestBooleanReduce(base.BaseBooleanReduceTests): + pass + + +class TestPrinting(base.BasePrintingTests): + pass + + +# TODO parsing not yet supported +# class TestParsing(base.BaseParsingTests): +# pass diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 528053aa8c7f1..336b23e54d74c 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -20,9 +20,9 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Timestamp +import pandas._testing as tm from pandas.api.types import CategoricalDtype from pandas.tests.extension import base -import pandas.util.testing as tm def make_data(): @@ -93,10 +93,7 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - def test_ravel(self, data): - # GH#27199 Categorical.ravel returns self until after deprecation cycle - with tm.assert_produces_warning(FutureWarning): - data.ravel() + pass class TestGetitem(base.BaseGetitemTests): diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 9b5f9d64f6b67..e43650c291200 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -5,8 +5,8 @@ from pandas.core.dtypes.common import is_extension_array_dtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import ExtensionArray -import pandas.util.testing as tm class DummyDtype(dtypes.ExtensionDtype): diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index d051345fdd12d..afb8412f12ea9 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -34,7 +34,7 @@ def make_data(): - return list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] @pytest.fixture( @@ -65,7 +65,7 @@ def data_for_twos(dtype): @pytest.fixture def data_missing(dtype): - return integer_array([np.nan, 1], dtype=dtype) + return integer_array([pd.NA, 1], dtype=dtype) @pytest.fixture @@ -75,18 +75,18 @@ def data_for_sorting(dtype): @pytest.fixture def data_missing_for_sorting(dtype): - return integer_array([1, np.nan, 0], dtype=dtype) + return integer_array([1, pd.NA, 0], dtype=dtype) @pytest.fixture def na_cmp(): - # we are np.nan - return lambda x, y: np.isnan(x) and np.isnan(y) + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture @@ -94,7 +94,7 @@ def data_for_grouping(dtype): b = 1 a = 0 c = 2 - na = np.nan + na = pd.NA return integer_array([b, b, na, na, a, a, b, c], dtype=dtype) @@ -129,7 +129,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): expected = s.combine(other, op) if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.astype(float) + expected = expected.fillna(np.nan).astype(float) if op_name == "__rtruediv__": # TODO reverse operators result in object dtype result = result.astype(float) @@ -142,6 +142,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass + if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA # see https://github.com/pandas-dev/pandas/issues/22022 @@ -162,6 +163,16 
@@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + result = op(s, other) + # Override to do the astype to boolean + expected = s.combine(other, op).astype("boolean") + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) @@ -198,7 +209,7 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - @pytest.mark.parametrize("dropna", [True, False]) + @pytest.mark.skip(reason="uses nullable integer") def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 4fdcf930d224f..2411f6cfbd936 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -147,7 +147,9 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): class TestSetitem(BaseInterval, base.BaseSetitemTests): - pass + @pytest.mark.xfail(reason="GH#27147 setitem changes underlying index") + def test_setitem_preserves_views(self, data): + super().test_setitem_preserves_views(data) class TestPrinting(BaseInterval, base.BasePrintingTests): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 221cf0787d839..7db38f41d4573 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -4,8 +4,8 @@ from pandas.compat.numpy import _np_version_under1p16 import pandas as pd +import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray, PandasDtype -import pandas.util.testing as tm from . import base @@ -51,7 +51,7 @@ def data_missing(allow_in_pandas, dtype): if dtype.numpy_dtype == "object": if _np_version_under1p16: raise pytest.skip("Skipping for NumPy <1.16") - return PandasArray(np.array([np.nan, (1,)])) + return PandasArray(np.array([np.nan, (1,)], dtype=object)) return PandasArray(np.array([np.nan, 1.0])) @@ -78,7 +78,7 @@ def data_for_sorting(allow_in_pandas, dtype): if dtype.numpy_dtype == "object": # Use an empty tuple for first element, then remove, # to disable np.array's shape inference. - return PandasArray(np.array([(), (2,), (3,), (1,)])[1:]) + return PandasArray(np.array([(), (2,), (3,), (1,)], dtype=object)[1:]) return PandasArray(np.array([1, 2, 0])) @@ -90,7 +90,7 @@ def data_missing_for_sorting(allow_in_pandas, dtype): A < B and NA missing. 
""" if dtype.numpy_dtype == "object": - return PandasArray(np.array([(1,), np.nan, (0,)])) + return PandasArray(np.array([(1,), np.nan, (0,)], dtype=object)) return PandasArray(np.array([1, np.nan, 0])) @@ -106,7 +106,9 @@ def data_for_grouping(allow_in_pandas, dtype): a, b, c = (1,), (2,), (3,) else: a, b, c = np.arange(3) - return PandasArray(np.array([b, b, np.nan, np.nan, a, a, b, c])) + return PandasArray( + np.array([b, b, np.nan, np.nan, a, a, b, c], dtype=dtype.numpy_dtype) + ) @pytest.fixture @@ -330,6 +332,10 @@ def test_merge_on_extension_array_duplicates(self, data): # Fails creating expected super().test_merge_on_extension_array_duplicates(data) + @skip_nested + def test_transpose(self, data): + super().test_transpose(data) + class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): @skip_nested diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 6ebe71e173ec2..198a228b621b4 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -4,9 +4,10 @@ from pandas.errors import PerformanceWarning import pandas as pd -from pandas import SparseArray, SparseDtype +from pandas import SparseDtype +import pandas._testing as tm +from pandas.arrays import SparseArray from pandas.tests.extension import base -import pandas.util.testing as tm def make_data(fill_value): @@ -132,6 +133,10 @@ def test_concat_columns(self, data, na_value): self._check_unsupported(data) super().test_concat_columns(data, na_value) + def test_concat_extension_arrays_copy_false(self, data, na_value): + self._check_unsupported(data) + super().test_concat_extension_arrays_copy_false(data, na_value) + def test_align(self, data, na_value): self._check_unsupported(data) super().test_align(data, na_value) @@ -231,7 +236,7 @@ def test_combine_le(self, data_repeated): s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) expected = pd.Series( - pd.SparseArray( + SparseArray( [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], fill_value=False, ) @@ -241,7 +246,7 @@ def test_combine_le(self, data_repeated): val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 <= x2) expected = pd.Series( - pd.SparseArray([a <= val for a in list(orig_data1)], fill_value=False) + SparseArray([a <= val for a in list(orig_data1)], fill_value=False) ) self.assert_series_equal(result, expected) @@ -346,7 +351,7 @@ def _compare_other(self, s, data, op_name, other): with np.errstate(all="ignore"): expected = pd.Series( - pd.SparseArray( + SparseArray( op(np.asarray(data), np.asarray(other)), fill_value=result.values.fill_value, ) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 5b872d5b72227..86aed671f1b88 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -25,7 +25,7 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return StringArray._from_sequence([np.nan, "A"]) + return StringArray._from_sequence([pd.NA, "A"]) @pytest.fixture @@ -35,17 +35,17 @@ def data_for_sorting(): @pytest.fixture def data_missing_for_sorting(): - return StringArray._from_sequence(["B", np.nan, "A"]) + return StringArray._from_sequence(["B", pd.NA, "A"]) @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture def data_for_grouping(): - return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"]) + return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]) 
class TestDtype(base.BaseDtypeTests): @@ -81,7 +81,9 @@ class TestNoReduce(base.BaseNoReduceTests): class TestMethods(base.BaseMethodsTests): - pass + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) class TestCasting(base.BaseCastingTests): @@ -91,7 +93,7 @@ class TestCasting(base.BaseCastingTests): class TestComparisonOps(base.BaseComparisonOpsTests): def _compare_other(self, s, data, op_name, other): result = getattr(s, op_name)(other) - expected = getattr(s.astype(object), op_name)(other) + expected = getattr(s.astype(object), op_name)(other).astype("boolean") self.assert_series_equal(result, expected) def test_compare_scalar(self, data, all_compare_operators): diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 915d6edcd8367..774eb443c45fe 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, NaT, date_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py new file mode 100644 index 0000000000000..5de38915f04c1 --- /dev/null +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -0,0 +1,388 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import Categorical, DataFrame, Index, Series +import pandas._testing as tm + + +class TestDataFrameIndexingCategorical: + def test_assignment(self): + # assignment + df = DataFrame( + {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} + ) + labels = Categorical( + ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + ) + + df = df.sort_values(by=["value"], ascending=True) + s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) + d = s.values + df["D"] = d + str(df) + + result = df.dtypes + expected = Series( + [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], + index=["value", "D"], + ) + tm.assert_series_equal(result, expected) + + df["E"] = s + str(df) + + result = df.dtypes + expected = Series( + [ + np.dtype("int32"), + CategoricalDtype(categories=labels, ordered=False), + CategoricalDtype(categories=labels, ordered=False), + ], + index=["value", "D", "E"], + ) + tm.assert_series_equal(result, expected) + + result1 = df["D"] + result2 = df["E"] + tm.assert_categorical_equal(result1._data._block.values, d) + + # sorting + s.name = "E" + tm.assert_series_equal(result2.sort_index(), s.sort_index()) + + cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) + df = DataFrame(Series(cat)) + + def test_assigning_ops(self): + # systematically test the assigning operations: + # for all slicing ops: + # for value in categories and value not in categories: + + # - assign a single value -> exp_single_cats_value + + # - assign a complete row (mixed values) -> exp_single_row + + # assign multiple rows (mixed values) (-> array) -> exp_multi_row + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + + cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) + values = [1, 1, 1, 1, 1, 1, 1] + orig = DataFrame({"cats": cats, "values": values}, index=idx) + + # the 
expected values + # changed single row + cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values1 = [1, 1, 2, 1, 1, 1, 1] + exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) + + # changed multiple rows + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values2 = [1, 1, 2, 2, 1, 1, 1] + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) + + # changed part of the cats column + cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values3 = [1, 1, 1, 1, 1, 1, 1] + exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) + + # changed single value in cats col + cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values4 = [1, 1, 1, 1, 1, 1, 1] + exp_single_cats_value = DataFrame( + {"cats": cats4, "values": values4}, index=idx4 + ) + + # iloc + # ############### + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.iloc[2, 0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + df = orig.copy() + df.iloc[df.index == "j", 0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.iloc[2, 0] = "c" + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.iloc[2, :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in categories set + with pytest.raises(ValueError): + df = orig.copy() + df.iloc[2, :] = ["c", 2] + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.iloc[2:4, :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + with pytest.raises(ValueError): + df = orig.copy() + df.iloc[2:4, :] = [["c", 2], ["c", 2]] + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + # different categories -> not sure if this should fail or pass + df = orig.copy() + df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) + + with pytest.raises(ValueError): + # different values + df = orig.copy() + df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + df = orig.copy() + df.iloc[2:4, 0] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + df.iloc[2:4, 0] = ["c", "c"] + + # loc + # ############## + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.loc["j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + df = orig.copy() + df.loc[df.index == "j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", "cats"] = "c" + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.loc["j", :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed 
values) not in categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", :] = ["c", 2] + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.loc["j":"k", :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j":"k", :] = [["c", 2], ["c", 2]] + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + # different categories -> not sure if this should fail or pass + df = orig.copy() + df.loc["j":"k", "cats"] = Categorical( + ["b", "b"], categories=["a", "b", "c"] + ) + + with pytest.raises(ValueError): + # different values + df = orig.copy() + df.loc["j":"k", "cats"] = Categorical( + ["c", "c"], categories=["a", "b", "c"] + ) + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", "cats"] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + df.loc["j":"k", "cats"] = ["c", "c"] + + # loc + # ############## + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.loc["j", df.columns[0]] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + df = orig.copy() + df.loc[df.index == "j", df.columns[0]] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", df.columns[0]] = "c" + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.loc["j", :] = ["b", 2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in categories set + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j", :] = ["c", 2] + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.loc["j":"k", :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + with pytest.raises(ValueError): + df = orig.copy() + df.loc["j":"k", :] = [["c", 2], ["c", 2]] + + # assign a part of a column with dtype == categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + # different categories -> not sure if this should fail or pass + df = orig.copy() + df.loc["j":"k", df.columns[0]] = Categorical( + ["b", "b"], categories=["a", "b", "c"] + ) + + with pytest.raises(ValueError): + # different values + df = orig.copy() + df.loc["j":"k", df.columns[0]] = Categorical( + ["c", "c"], categories=["a", "b", "c"] + ) + + # assign a part of a column with dtype != categorical -> + # exp_parts_cats_col + df = orig.copy() + df.loc["j":"k", df.columns[0]] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with pytest.raises(ValueError): + df.loc["j":"k", df.columns[0]] = ["c", "c"] + + # iat + df = orig.copy() + df.iat[2, 0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.iat[2, 0] = "c" + + # at + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.at["j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) 
+ + # - assign a single value not in the current categories set + with pytest.raises(ValueError): + df = orig.copy() + df.at["j", "cats"] = "c" + + # fancy indexing + catsf = Categorical( + ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] + ) + idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) + valuesf = [1, 1, 3, 3, 1, 1, 1] + df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) + + exp_fancy = exp_multi_row.copy() + exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) + + df[df["cats"] == "c"] = ["b", 2] + # category c is kept in .categories + tm.assert_frame_equal(df, exp_fancy) + + # set_value + df = orig.copy() + df.at["j", "cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + with pytest.raises(ValueError): + df = orig.copy() + df.at["j", "cats"] = "c" + + # Assigning a Category to parts of a int/... column uses the values of + # the Categorical + df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) + exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) + df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) + df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) + tm.assert_frame_equal(df, exp) + + def test_functions_no_warnings(self): + df = DataFrame({"value": np.random.randint(0, 100, 20)}) + labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] + with tm.assert_produces_warning(False): + df["group"] = pd.cut( + df.value, range(0, 105, 10), right=False, labels=labels + ) + + def test_loc_indexing_preserves_index_category_dtype(self): + # GH 15166 + df = DataFrame( + data=np.arange(2, 22, 2), + index=pd.MultiIndex( + levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + codes=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"], + ), + ) + + expected = pd.CategoricalIndex( + ["a", "b"], + categories=["a", "b"], + ordered=False, + name="Index1", + dtype="category", + ) + + result = df.index.levels[0] + tm.assert_index_equal(result, expected) + + result = df.loc[["a"]].index.levels[0] + tm.assert_index_equal(result, expected) + + def test_wrong_length_cat_dtype_raises(self): + # GH29523 + cat = pd.Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) + df = pd.DataFrame({"bar": range(10)}) + err = "Length of values does not match length of index" + with pytest.raises(ValueError, match=err): + df["foo"] = cat diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py new file mode 100644 index 0000000000000..a1c12be2b0180 --- /dev/null +++ b/pandas/tests/frame/indexing/test_datetime.py @@ -0,0 +1,62 @@ +import pandas as pd +from pandas import DataFrame, Index, Series, date_range, notna +import pandas._testing as tm + + +class TestDataFrameIndexingDatetimeWithTZ: + def test_setitem(self, timezone_frame): + + df = timezone_frame + idx = df["B"].rename("foo") + + # setitem + df["C"] = idx + tm.assert_series_equal(df["C"], Series(idx, name="C")) + + df["D"] = "foo" + df["D"] = idx + tm.assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] + + # assert that A & C are not sharing the same base (e.g. 
they + # are copies) + b1 = df._data.blocks[1] + b2 = df._data.blocks[2] + tm.assert_extension_array_equal(b1.values, b2.values) + assert id(b1.values._data.base) != id(b2.values._data.base) + + # with nan + df2 = df.copy() + df2.iloc[1, 1] = pd.NaT + df2.iloc[1, 2] = pd.NaT + result = df2["B"] + tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) + tm.assert_series_equal(df2.dtypes, df.dtypes) + + def test_set_reset(self): + + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype, "M8[ns, US/Eastern" + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + + def test_transpose(self, timezone_frame): + + result = timezone_frame.T + expected = DataFrame(timezone_frame.values.T) + expected.index = ["A", "B", "C"] + tm.assert_frame_equal(result, expected) + + def test_scalar_assignment(self): + # issue #19843 + df = pd.DataFrame(index=(0, 1, 2)) + df["now"] = pd.Timestamp("20130101", tz="UTC") + expected = pd.DataFrame( + {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2] + ) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py similarity index 57% rename from pandas/tests/frame/test_indexing.py rename to pandas/tests/frame/indexing/test_indexing.py index e215c90d2eb04..33c0e92845484 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1,18 +1,15 @@ from datetime import date, datetime, time, timedelta import re -from warnings import catch_warnings, simplefilter import numpy as np import pytest from pandas._libs.tslib import iNaT -from pandas.core.dtypes.common import is_float_dtype, is_integer, is_scalar -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.common import is_float_dtype, is_integer import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, @@ -23,9 +20,10 @@ isna, notna, ) +import pandas._testing as tm +from pandas.arrays import SparseArray import pandas.core.common as com from pandas.core.indexing import IndexingError -import pandas.util.testing as tm from pandas.tseries.offsets import BDay @@ -398,10 +396,8 @@ def test_getitem_ix_mixed_integer(self): expected = df.loc[df.index[:-1]] tm.assert_frame_equal(result, expected) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = df.ix[[1, 10]] - expected = df.ix[Index([1, 10], dtype=object)] + result = df.loc[[1, 10]] + expected = df.loc[Index([1, 10])] tm.assert_frame_equal(result, expected) # 11320 @@ -421,53 +417,6 @@ def test_getitem_ix_mixed_integer(self): expected = df.iloc[:, [1]] tm.assert_frame_equal(result, expected) - def test_getitem_setitem_ix_negative_integers(self, float_frame): - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = float_frame.ix[:, -1] - tm.assert_series_equal(result, float_frame["D"]) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = float_frame.ix[:, [-1]] - tm.assert_frame_equal(result, float_frame[["D"]]) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = float_frame.ix[:, [-1, -2]] - tm.assert_frame_equal(result, float_frame[["D", "C"]]) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - float_frame.ix[:, [-1]] = 0 - assert (float_frame["D"] == 
0).all() - - df = DataFrame(np.random.randn(8, 4)) - # ix does label-based indexing when having an integer index - msg = "\"None of [Int64Index([-1], dtype='int64')] are in the [index]\"" - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - with pytest.raises(KeyError, match=re.escape(msg)): - df.ix[[-1]] - - msg = "\"None of [Int64Index([-1], dtype='int64')] are in the [columns]\"" - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - with pytest.raises(KeyError, match=re.escape(msg)): - df.ix[:, [-1]] - - # #1942 - a = DataFrame(np.random.randn(20, 2), index=[chr(x + 65) for x in range(20)]) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - a.ix[-1] = a.ix[-2] - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_series_equal(a.ix[-1], a.ix[-2], check_names=False) - assert a.ix[-1].name == "T" - assert a.ix[-2].name == "S" - def test_getattr(self, float_frame): tm.assert_series_equal(float_frame.A, float_frame["A"]) msg = "'DataFrame' object has no attribute 'NONEXISTENT_NAME'" @@ -850,55 +799,6 @@ def test_delitem_corner(self, float_frame): del f["B"] assert len(f.columns) == 2 - def test_getitem_fancy_2d(self, float_frame): - f = float_frame - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_frame_equal(f.ix[:, ["B", "A"]], f.reindex(columns=["B", "A"])) - - subidx = float_frame.index[[5, 4, 1]] - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_frame_equal( - f.ix[subidx, ["B", "A"]], f.reindex(index=subidx, columns=["B", "A"]) - ) - - # slicing rows, etc. - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_frame_equal(f.ix[5:10], f[5:10]) - tm.assert_frame_equal(f.ix[5:10, :], f[5:10]) - tm.assert_frame_equal( - f.ix[:5, ["A", "B"]], f.reindex(index=f.index[:5], columns=["A", "B"]) - ) - - # slice rows with labels, inclusive! 
- with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - expected = f.ix[5:11] - result = f.ix[f.index[5] : f.index[10]] - tm.assert_frame_equal(expected, result) - - # slice columns - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_frame_equal(f.ix[:, :2], f.reindex(columns=["A", "B"])) - - # get view - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - exp = f.copy() - f.ix[5:10].values[:] = 5 - exp.values[5:10] = 5 - tm.assert_frame_equal(f, exp) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - msg = "Cannot index with multidimensional key" - with pytest.raises(ValueError, match=msg): - f.ix[f > 0.5] - def test_slice_floats(self): index = [52195.504153, 52196.303147, 52198.369883] df = DataFrame(np.random.rand(3, 2), index=index) @@ -947,119 +847,6 @@ def test_getitem_setitem_integer_slice_keyerrors(self): with pytest.raises(KeyError, match=r"^3$"): df2.loc[3:11] = 0 - def test_setitem_fancy_2d(self, float_frame): - - # case 1 - frame = float_frame.copy() - expected = frame.copy() - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame.ix[:, ["B", "A"]] = 1 - expected["B"] = 1.0 - expected["A"] = 1.0 - tm.assert_frame_equal(frame, expected) - - # case 2 - frame = float_frame.copy() - frame2 = float_frame.copy() - - expected = frame.copy() - - subidx = float_frame.index[[5, 4, 1]] - values = np.random.randn(3, 2) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame.ix[subidx, ["B", "A"]] = values - frame2.ix[[5, 4, 1], ["B", "A"]] = values - - expected["B"].ix[subidx] = values[:, 0] - expected["A"].ix[subidx] = values[:, 1] - - tm.assert_frame_equal(frame, expected) - tm.assert_frame_equal(frame2, expected) - - # case 3: slicing rows, etc. - frame = float_frame.copy() - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - expected1 = float_frame.copy() - frame.ix[5:10] = 1.0 - expected1.values[5:10] = 1.0 - tm.assert_frame_equal(frame, expected1) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - expected2 = float_frame.copy() - arr = np.random.randn(5, len(frame.columns)) - frame.ix[5:10] = arr - expected2.values[5:10] = arr - tm.assert_frame_equal(frame, expected2) - - # case 4 - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame = float_frame.copy() - frame.ix[5:10, :] = 1.0 - tm.assert_frame_equal(frame, expected1) - frame.ix[5:10, :] = arr - tm.assert_frame_equal(frame, expected2) - - # case 5 - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame = float_frame.copy() - frame2 = float_frame.copy() - - expected = float_frame.copy() - values = np.random.randn(5, 2) - - frame.ix[:5, ["A", "B"]] = values - expected["A"][:5] = values[:, 0] - expected["B"][:5] = values[:, 1] - tm.assert_frame_equal(frame, expected) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame2.ix[:5, [0, 1]] = values - tm.assert_frame_equal(frame2, expected) - - # case 6: slice rows with labels, inclusive! 
- with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame = float_frame.copy() - expected = float_frame.copy() - - frame.ix[frame.index[5] : frame.index[10]] = 5.0 - expected.values[5:11] = 5 - tm.assert_frame_equal(frame, expected) - - # case 7: slice columns - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame = float_frame.copy() - frame2 = float_frame.copy() - expected = float_frame.copy() - - # slice indices - frame.ix[:, 1:3] = 4.0 - expected.values[:, 1:3] = 4.0 - tm.assert_frame_equal(frame, expected) - - # slice with labels - frame.ix[:, "B":"C"] = 4.0 - tm.assert_frame_equal(frame, expected) - - # new corner case of boolean slicing / setting - frame = DataFrame(zip([2, 3, 9, 6, 7], [np.nan] * 5), columns=["a", "b"]) - lst = [100] - lst.extend([np.nan] * 4) - expected = DataFrame(zip([100, 3, 9, 6, 7], lst), columns=["a", "b"]) - frame[frame["a"] == 2] = 100 - tm.assert_frame_equal(frame, expected) - def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): sliced = float_string_frame.iloc[:, -3:] assert sliced["D"].dtype == np.float64 @@ -1073,194 +860,6 @@ def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): assert (float_frame["C"] == 4).all() - def test_fancy_setitem_int_labels(self): - # integer index defers to label-based indexing - - df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tmp = df.copy() - exp = df.copy() - tmp.ix[[0, 2, 4]] = 5 - exp.values[:3] = 5 - tm.assert_frame_equal(tmp, exp) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tmp = df.copy() - exp = df.copy() - tmp.ix[6] = 5 - exp.values[3] = 5 - tm.assert_frame_equal(tmp, exp) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tmp = df.copy() - exp = df.copy() - tmp.ix[:, 2] = 5 - - # tmp correctly sets the dtype - # so match the exp way - exp[2] = 5 - tm.assert_frame_equal(tmp, exp) - - def test_fancy_getitem_int_labels(self): - df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = df.ix[[4, 2, 0], [2, 0]] - expected = df.reindex(index=[4, 2, 0], columns=[2, 0]) - tm.assert_frame_equal(result, expected) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = df.ix[[4, 2, 0]] - expected = df.reindex(index=[4, 2, 0]) - tm.assert_frame_equal(result, expected) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = df.ix[4] - expected = df.xs(4) - tm.assert_series_equal(result, expected) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = df.ix[:, 3] - expected = df[3] - tm.assert_series_equal(result, expected) - - def test_fancy_index_int_labels_exceptions(self, float_frame): - df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - - # labels that aren't contained - with pytest.raises(KeyError, match=r"\[1\] not in index"): - df.ix[[0, 1, 2], [2, 3, 4]] = 5 - - # try to set indices not contained in frame - msg = ( - r"None of \[Index\(\['foo', 'bar', 'baz'\]," - r" dtype='object'\)\] are in the \[index\]" - ) - with pytest.raises(KeyError, match=msg): - float_frame.ix[["foo", "bar", "baz"]] = 1 - msg = ( - r"None of \[Index\(\['E'\], dtype='object'\)\] are in the" - r" 
\[columns\]" - ) - with pytest.raises(KeyError, match=msg): - float_frame.ix[:, ["E"]] = 1 - - # FIXME: don't leave commented-out - # partial setting now allows this GH2578 - # pytest.raises(KeyError, float_frame.ix.__setitem__, - # (slice(None, None), 'E'), 1) - - def test_setitem_fancy_mixed_2d(self, float_string_frame): - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - float_string_frame.ix[:5, ["C", "B", "A"]] = 5 - result = float_string_frame.ix[:5, ["C", "B", "A"]] - assert (result.values == 5).all() - - float_string_frame.ix[5] = np.nan - assert isna(float_string_frame.ix[5]).all() - - float_string_frame.ix[5] = float_string_frame.ix[6] - tm.assert_series_equal( - float_string_frame.ix[5], float_string_frame.ix[6], check_names=False - ) - - # #1432 - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - df = DataFrame({1: [1.0, 2.0, 3.0], 2: [3, 4, 5]}) - assert df._is_mixed_type - - df.ix[1] = [5, 10] - - expected = DataFrame({1: [1.0, 5.0, 3.0], 2: [3, 10, 5]}) - - tm.assert_frame_equal(df, expected) - - def test_ix_align(self): - b = Series(np.random.randn(10), name=0).sort_values() - df_orig = DataFrame(np.random.randn(10, 4)) - df = df_orig.copy() - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - df.ix[:, 0] = b - tm.assert_series_equal(df.ix[:, 0].reindex(b.index), b) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - dft = df_orig.T - dft.ix[0, :] = b - tm.assert_series_equal(dft.ix[0, :].reindex(b.index), b) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - df = df_orig.copy() - df.ix[:5, 0] = b - s = df.ix[:5, 0] - tm.assert_series_equal(s, b.reindex(s.index)) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - dft = df_orig.T - dft.ix[0, :5] = b - s = dft.ix[0, :5] - tm.assert_series_equal(s, b.reindex(s.index)) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - df = df_orig.copy() - idx = [0, 1, 3, 5] - df.ix[idx, 0] = b - s = df.ix[idx, 0] - tm.assert_series_equal(s, b.reindex(s.index)) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - dft = df_orig.T - dft.ix[0, idx] = b - s = dft.ix[0, idx] - tm.assert_series_equal(s, b.reindex(s.index)) - - def test_ix_frame_align(self): - b = DataFrame(np.random.randn(3, 4)) - df_orig = DataFrame(np.random.randn(10, 4)) - df = df_orig.copy() - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - df.ix[:3] = b - out = b.ix[:3] - tm.assert_frame_equal(out, b) - - b.sort_index(inplace=True) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - df = df_orig.copy() - df.ix[[0, 1, 2]] = b - out = df.ix[[0, 1, 2]].reindex(b.index) - tm.assert_frame_equal(out, b) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - df = df_orig.copy() - df.ix[:3] = b - out = df.ix[:3] - tm.assert_frame_equal(out, b.reindex(out.index)) - def test_getitem_setitem_non_ix_labels(self): df = tm.makeTimeDataFrame() @@ -1287,6 +886,7 @@ def test_ix_multi_take(self): xp = df.reindex([0]) tm.assert_frame_equal(rs, xp) + # FIXME: dont leave commented-out """ #1321 df = DataFrame(np.random.randn(3, 2)) rs = df.loc[df.index==0, df.columns==1] @@ -1294,168 +894,6 @@ def test_ix_multi_take(self): tm.assert_frame_equal(rs, xp) """ - def test_ix_multi_take_nonint_index(self): - df = DataFrame(np.random.randn(3, 2), index=["x", "y", "z"], columns=["a", "b"]) - with 
catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - rs = df.ix[[0], [0]] - xp = df.reindex(["x"], columns=["a"]) - tm.assert_frame_equal(rs, xp) - - def test_ix_multi_take_multiindex(self): - df = DataFrame( - np.random.randn(3, 2), - index=["x", "y", "z"], - columns=[["a", "b"], ["1", "2"]], - ) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - rs = df.ix[[0], [0]] - xp = df.reindex(["x"], columns=[("a", "1")]) - tm.assert_frame_equal(rs, xp) - - def test_ix_dup(self): - idx = Index(["a", "a", "b", "c", "d", "d"]) - df = DataFrame(np.random.randn(len(idx), 3), idx) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - sub = df.ix[:"d"] - tm.assert_frame_equal(sub, df) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - sub = df.ix["a":"c"] - tm.assert_frame_equal(sub, df.ix[0:4]) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - sub = df.ix["b":"d"] - tm.assert_frame_equal(sub, df.ix[2:]) - - def test_getitem_fancy_1d(self, float_frame, float_string_frame): - f = float_frame - - # return self if no slicing...for now - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - assert f.ix[:, :] is f - - # low dimensional slice - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - xs1 = f.ix[2, ["C", "B", "A"]] - xs2 = f.xs(f.index[2]).reindex(["C", "B", "A"]) - tm.assert_series_equal(xs1, xs2) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - ts1 = f.ix[5:10, 2] - ts2 = f[f.columns[2]][5:10] - tm.assert_series_equal(ts1, ts2) - - # positional xs - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - xs1 = f.ix[0] - xs2 = f.xs(f.index[0]) - tm.assert_series_equal(xs1, xs2) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - xs1 = f.ix[f.index[5]] - xs2 = f.xs(f.index[5]) - tm.assert_series_equal(xs1, xs2) - - # single column - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_series_equal(f.ix[:, "A"], f["A"]) - - # return view - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - exp = f.copy() - exp.values[5] = 4 - f.ix[5][:] = 4 - tm.assert_frame_equal(exp, f) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - exp.values[:, 1] = 6 - f.ix[:, 1][:] = 6 - tm.assert_frame_equal(exp, f) - - # slice of mixed-frame - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - xs = float_string_frame.ix[5] - exp = float_string_frame.xs(float_string_frame.index[5]) - tm.assert_series_equal(xs, exp) - - def test_setitem_fancy_1d(self, float_frame): - - # case 1: set cross-section for indices - frame = float_frame.copy() - expected = float_frame.copy() - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame.ix[2, ["C", "B", "A"]] = [1.0, 2.0, 3.0] - expected["C"][2] = 1.0 - expected["B"][2] = 2.0 - expected["A"][2] = 3.0 - tm.assert_frame_equal(frame, expected) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame2 = float_frame.copy() - frame2.ix[2, [3, 2, 1]] = [1.0, 2.0, 3.0] - tm.assert_frame_equal(frame, expected) - - # case 2, set a section of a column - frame = float_frame.copy() - expected = float_frame.copy() - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - vals = np.random.randn(5) - expected.values[5:10, 2] = vals - frame.ix[5:10, 2] = vals - 
tm.assert_frame_equal(frame, expected) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame2 = float_frame.copy() - frame2.ix[5:10, "B"] = vals - tm.assert_frame_equal(frame, expected) - - # case 3: full xs - frame = float_frame.copy() - expected = float_frame.copy() - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame.ix[4] = 5.0 - expected.values[4] = 5.0 - tm.assert_frame_equal(frame, expected) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame.ix[frame.index[4]] = 6.0 - expected.values[4] = 6.0 - tm.assert_frame_equal(frame, expected) - - # single column - frame = float_frame.copy() - expected = float_frame.copy() - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - frame.ix[:, "A"] = 7.0 - expected["A"] = 7.0 - tm.assert_frame_equal(frame, expected) - def test_getitem_fancy_scalar(self, float_frame): f = float_frame ix = f.loc @@ -1709,18 +1147,18 @@ def test_setitem_mixed_datetime(self): { "a": [0, 0, 0, 0, 13, 14], "b": [ - pd.datetime(2012, 1, 1), + datetime(2012, 1, 1), 1, "x", "y", - pd.datetime(2013, 1, 1), - pd.datetime(2014, 1, 1), + datetime(2013, 1, 1), + datetime(2014, 1, 1), ], } ) df = pd.DataFrame(0, columns=list("ab"), index=range(6)) df["b"] = pd.NaT - df.loc[0, "b"] = pd.datetime(2012, 1, 1) + df.loc[0, "b"] = datetime(2012, 1, 1) df.loc[1, "b"] = 1 df.loc[[2, 3], "b"] = "x", "y" A = np.array( @@ -1977,15 +1415,11 @@ def test_get_set_value_no_partial_indexing(self): with pytest.raises(KeyError, match=r"^0$"): df._get_value(0, 1) + # TODO: rename? remove? def test_single_element_ix_dont_upcast(self, float_frame): float_frame["E"] = 1 assert issubclass(float_frame["E"].dtype.type, (int, np.integer)) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = float_frame.ix[float_frame.index[5], "E"] - assert is_integer(result) - result = float_frame.loc[float_frame.index[5], "E"] assert is_integer(result) @@ -1993,18 +1427,10 @@ def test_single_element_ix_dont_upcast(self, float_frame): df = pd.DataFrame(dict(a=[1.23])) df["b"] = 666 - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = df.ix[0, "b"] - assert is_integer(result) result = df.loc[0, "b"] assert is_integer(result) expected = Series([666], [0], name="b") - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = df.ix[[0], "b"] - tm.assert_series_equal(result, expected) result = df.loc[[0], "b"] tm.assert_series_equal(result, expected) @@ -2072,45 +1498,12 @@ def test_iloc_duplicates(self): df = DataFrame(np.random.rand(3, 3), columns=list("ABC"), index=list("aab")) result = df.iloc[0] - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result2 = df.ix[0] assert isinstance(result, Series) tm.assert_almost_equal(result.values, df.values[0]) - tm.assert_series_equal(result, result2) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - result = df.T.iloc[:, 0] - result2 = df.T.ix[:, 0] + result = df.T.iloc[:, 0] assert isinstance(result, Series) tm.assert_almost_equal(result.values, df.values[0]) - tm.assert_series_equal(result, result2) - - # multiindex - df = DataFrame( - np.random.randn(3, 3), - columns=[["i", "i", "j"], ["A", "A", "B"]], - index=[["i", "i", "j"], ["X", "X", "Y"]], - ) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - rs = df.iloc[0] - xp = df.ix[0] - tm.assert_series_equal(rs, xp) - - with 
catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - rs = df.iloc[:, 0] - xp = df.T.ix[0] - tm.assert_series_equal(rs, xp) - - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - rs = df.iloc[:, [0]] - xp = df.ix[:, [0]] - tm.assert_frame_equal(rs, xp) # #2259 df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1, 1, 2]) @@ -2355,9 +1748,6 @@ def test_getitem_ix_float_duplicates(self): ) expect = df.iloc[1:] tm.assert_frame_equal(df.loc[0.2], expect) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:, 0] tm.assert_series_equal(df.loc[0.2, "a"], expect) @@ -2365,9 +1755,6 @@ def test_getitem_ix_float_duplicates(self): df.index = [1, 0.2, 0.2] expect = df.iloc[1:] tm.assert_frame_equal(df.loc[0.2], expect) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:, 0] tm.assert_series_equal(df.loc[0.2, "a"], expect) @@ -2377,9 +1764,6 @@ def test_getitem_ix_float_duplicates(self): ) expect = df.iloc[1:-1] tm.assert_frame_equal(df.loc[0.2], expect) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:-1, 0] tm.assert_series_equal(df.loc[0.2, "a"], expect) @@ -2387,16 +1771,13 @@ def test_getitem_ix_float_duplicates(self): df.index = [0.1, 0.2, 2, 0.2] expect = df.iloc[[1, -1]] tm.assert_frame_equal(df.loc[0.2], expect) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - tm.assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[[1, -1], 0] tm.assert_series_equal(df.loc[0.2, "a"], expect) def test_getitem_sparse_column(self): # https://github.com/pandas-dev/pandas/issues/23559 - data = pd.SparseArray([0, 1]) + data = SparseArray([0, 1]) df = pd.DataFrame({"A": data}) expected = pd.Series(data, name="A") result = df["A"] @@ -2411,7 +1792,7 @@ def test_getitem_sparse_column(self): def test_setitem_with_sparse_value(self): # GH8131 df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) - sp_array = pd.SparseArray([0, 0, 1]) + sp_array = SparseArray([0, 0, 1]) df["new_column"] = sp_array tm.assert_series_equal( df["new_column"], pd.Series(sp_array, name="new_column"), check_names=False @@ -2419,9 +1800,9 @@ def test_setitem_with_sparse_value(self): def test_setitem_with_unaligned_sparse_value(self): df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) - sp_series = pd.Series(pd.SparseArray([0, 0, 1]), index=[2, 1, 0]) + sp_series = pd.Series(SparseArray([0, 0, 1]), index=[2, 1, 0]) df["new_column"] = sp_series - exp = pd.Series(pd.SparseArray([1, 0, 0]), name="new_column") + exp = pd.Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], exp) def test_setitem_with_unaligned_tz_aware_datetime_column(self): @@ -2574,7 +1955,7 @@ def test_xs_corner(self): # no columns but Index(dtype=object) df = DataFrame(index=["a", "b", "c"]) result = df.xs("a") - expected = Series([], name="a", index=pd.Index([], dtype=object)) + expected = Series([], name="a", index=pd.Index([]), dtype=np.float64) tm.assert_series_equal(result, expected) def test_xs_duplicates(self): @@ -2618,14 +1999,20 @@ def test_index_namedtuple(self): index = Index([idx1, idx2], name="composite_index", tupleize_cols=False) df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"]) - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - 
result = df.ix[IndexType("foo", "bar")]["A"] - assert result == 1 - result = df.loc[IndexType("foo", "bar")]["A"] assert result == 1 + @pytest.mark.parametrize("tpl", [tuple([1]), tuple([1, 2])]) + def test_index_single_double_tuples(self, tpl): + # GH 20991 + idx = pd.Index([tuple([1]), tuple([1, 2])], name="A", tupleize_cols=False) + df = DataFrame(index=idx) + + result = df.loc[[tpl]] + idx = pd.Index([tpl], name="A", tupleize_cols=False) + expected = DataFrame(index=idx) + tm.assert_frame_equal(result, expected) + def test_boolean_indexing(self): idx = list(range(3)) cols = ["A", "B", "C"] @@ -2695,576 +2082,6 @@ def test_boolean_indexing_mixed(self): with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 - def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): - default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) - - def _safe_add(df): - # only add to the numeric items - def is_ok(s): - return ( - issubclass(s.dtype.type, (np.integer, np.floating)) - and s.dtype != "uint8" - ) - - return DataFrame( - dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) - ) - - def _check_get(df, cond, check_dtypes=True): - other1 = _safe_add(df) - rs = df.where(cond, other1) - rs2 = df.where(cond.values, other1) - for k, v in rs.items(): - exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) - tm.assert_series_equal(v, exp, check_names=False) - tm.assert_frame_equal(rs, rs2) - - # dtypes - if check_dtypes: - assert (rs.dtypes == df.dtypes).all() - - # check getting - for df in [ - default_frame, - float_string_frame, - mixed_float_frame, - mixed_int_frame, - ]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - cond = df > 0 - _check_get(df, cond) - - # upcasting case (GH # 2794) - df = DataFrame( - { - c: Series([1] * 3, dtype=c) - for c in ["float32", "float64", "int32", "int64"] - } - ) - df.iloc[1, :] = 0 - result = df.dtypes - expected = Series( - [ - np.dtype("float32"), - np.dtype("float64"), - np.dtype("int32"), - np.dtype("int64"), - ], - index=["float32", "float64", "int32", "int64"], - ) - - # when we don't preserve boolean casts - # - # expected = Series({ 'float32' : 1, 'float64' : 3 }) - - tm.assert_series_equal(result, expected) - - # aligning - def _check_align(df, cond, other, check_dtypes=True): - rs = df.where(cond, other) - for i, k in enumerate(rs.columns): - result = rs[k] - d = df[k].values - c = cond[k].reindex(df[k].index).fillna(False).values - - if is_scalar(other): - o = other - else: - if isinstance(other, np.ndarray): - o = Series(other[:, i], index=result.index).values - else: - o = other[k].values - - new_values = d if c.all() else np.where(c, d, o) - expected = Series(new_values, index=result.index, name=k) - - # since we can't always have the correct numpy dtype - # as numpy doesn't know how to downcast, don't check - tm.assert_series_equal(result, expected, check_dtype=False) - - # dtypes - # can't check dtype when other is an ndarray - - if check_dtypes and not isinstance(other, np.ndarray): - assert (rs.dtypes == df.dtypes).all() - - for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - - # other is a frame - cond = (df > 0)[1:] - _check_align(df, cond, _safe_add(df)) - - # check other is ndarray - cond = df > 0 - _check_align(df, cond, (_safe_add(df).values)) - - # integers are upcast, so don't check the dtypes - cond = df > 0 - check_dtypes = all(not 
issubclass(s.type, np.integer) for s in df.dtypes) - _check_align(df, cond, np.nan, check_dtypes=check_dtypes) - - # invalid conditions - df = default_frame - err1 = (df + 1).values[0:2, :] - msg = "other must be the same shape as self when an ndarray" - with pytest.raises(ValueError, match=msg): - df.where(cond, err1) - - err2 = cond.iloc[:2, :].values - other1 = _safe_add(df) - msg = "Array conditional must be same shape as self" - with pytest.raises(ValueError, match=msg): - df.where(err2, other1) - - with pytest.raises(ValueError, match=msg): - df.mask(True) - with pytest.raises(ValueError, match=msg): - df.mask(0) - - # where inplace - def _check_set(df, cond, check_dtypes=True): - dfi = df.copy() - econd = cond.reindex_like(df).fillna(True) - expected = dfi.mask(~econd) - - dfi.where(cond, np.nan, inplace=True) - tm.assert_frame_equal(dfi, expected) - - # dtypes (and confirm upcasts)x - if check_dtypes: - for k, v in df.dtypes.items(): - if issubclass(v.type, np.integer) and not cond[k].all(): - v = np.dtype("float64") - assert dfi[k].dtype == v - - for df in [ - default_frame, - float_string_frame, - mixed_float_frame, - mixed_int_frame, - ]: - if df is float_string_frame: - with pytest.raises(TypeError): - df > 0 - continue - - cond = df > 0 - _check_set(df, cond) - - cond = df >= 0 - _check_set(df, cond) - - # aligning - cond = (df >= 0)[1:] - _check_set(df, cond) - - # GH 10218 - # test DataFrame.where with Series slicing - df = DataFrame({"a": range(3), "b": range(4, 7)}) - result = df.where(df["a"] == 1) - expected = df[df["a"] == 1].reindex(df.index) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("klass", [list, tuple, np.array]) - def test_where_array_like(self, klass): - # see gh-15414 - df = DataFrame({"a": [1, 2, 3]}) - cond = [[False], [True], [True]] - expected = DataFrame({"a": [np.nan, 2, 3]}) - - result = df.where(klass(cond)) - tm.assert_frame_equal(result, expected) - - df["b"] = 2 - expected["b"] = [2, np.nan, 2] - cond = [[False, True], [True, False], [True, True]] - - result = df.where(klass(cond)) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "cond", - [ - [[1], [0], [1]], - Series([[2], [5], [7]]), - DataFrame({"a": [2, 5, 7]}), - [["True"], ["False"], ["True"]], - [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], - ], - ) - def test_where_invalid_input_single(self, cond): - # see gh-15414: only boolean arrays accepted - df = DataFrame({"a": [1, 2, 3]}) - msg = "Boolean array expected for the condition" - - with pytest.raises(ValueError, match=msg): - df.where(cond) - - @pytest.mark.parametrize( - "cond", - [ - [[0, 1], [1, 0], [1, 1]], - Series([[0, 2], [5, 0], [4, 7]]), - [["False", "True"], ["True", "False"], ["True", "True"]], - DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), - [ - [pd.NaT, Timestamp("2017-01-01")], - [Timestamp("2017-01-02"), pd.NaT], - [Timestamp("2017-01-03"), Timestamp("2017-01-03")], - ], - ], - ) - def test_where_invalid_input_multiple(self, cond): - # see gh-15414: only boolean arrays accepted - df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) - msg = "Boolean array expected for the condition" - - with pytest.raises(ValueError, match=msg): - df.where(cond) - - def test_where_dataframe_col_match(self): - df = DataFrame([[1, 2, 3], [4, 5, 6]]) - cond = DataFrame([[True, False, True], [False, False, True]]) - - result = df.where(cond) - expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) - tm.assert_frame_equal(result, expected) - - # this *does* align, though 
has no matching columns - cond.columns = ["a", "b", "c"] - result = df.where(cond) - expected = DataFrame(np.nan, index=df.index, columns=df.columns) - tm.assert_frame_equal(result, expected) - - def test_where_ndframe_align(self): - msg = "Array conditional must be same shape as self" - df = DataFrame([[1, 2, 3], [4, 5, 6]]) - - cond = [True] - with pytest.raises(ValueError, match=msg): - df.where(cond) - - expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) - - out = df.where(Series(cond)) - tm.assert_frame_equal(out, expected) - - cond = np.array([False, True, False, True]) - with pytest.raises(ValueError, match=msg): - df.where(cond) - - expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) - - out = df.where(Series(cond)) - tm.assert_frame_equal(out, expected) - - def test_where_bug(self): - # see gh-2793 - df = DataFrame( - {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" - ) - expected = DataFrame( - {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64", - ) - result = df.where(df > 2, np.nan) - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(result > 2, np.nan, inplace=True) - tm.assert_frame_equal(result, expected) - - def test_where_bug_mixed(self, sint_dtype): - # see gh-2793 - df = DataFrame( - { - "a": np.array([1, 2, 3, 4], dtype=sint_dtype), - "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), - } - ) - - expected = DataFrame( - {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64", - ) - - result = df.where(df > 2, np.nan) - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(result > 2, np.nan, inplace=True) - tm.assert_frame_equal(result, expected) - - def test_where_bug_transposition(self): - # see gh-7506 - a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]}) - b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]}) - do_not_replace = b.isna() | (a > b) - - expected = a.copy() - expected[~do_not_replace] = b - - result = a.where(do_not_replace, b) - tm.assert_frame_equal(result, expected) - - a = DataFrame({0: [4, 6], 1: [1, 0]}) - b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]}) - do_not_replace = b.isna() | (a > b) - - expected = a.copy() - expected[~do_not_replace] = b - - result = a.where(do_not_replace, b) - tm.assert_frame_equal(result, expected) - - def test_where_datetime(self): - - # GH 3311 - df = DataFrame( - dict( - A=date_range("20130102", periods=5), - B=date_range("20130104", periods=5), - C=np.random.randn(5), - ) - ) - - stamp = datetime(2013, 1, 3) - with pytest.raises(TypeError): - df > stamp - - result = df[df.iloc[:, :-1] > stamp] - - expected = df.copy() - expected.loc[[0, 1], "A"] = np.nan - expected.loc[:, "C"] = np.nan - tm.assert_frame_equal(result, expected) - - def test_where_none(self): - # GH 4667 - # setting with None changes dtype - df = DataFrame({"series": Series(range(10))}).astype(float) - df[df > 7] = None - expected = DataFrame( - {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} - ) - tm.assert_frame_equal(df, expected) - - # GH 7656 - df = DataFrame( - [ - {"A": 1, "B": np.nan, "C": "Test"}, - {"A": np.nan, "B": "Test", "C": np.nan}, - ] - ) - msg = "boolean setting on mixed-type" - - with pytest.raises(TypeError, match=msg): - df.where(~isna(df), None, inplace=True) - - def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): - # see gh-21947 - df = pd.DataFrame(columns=["a"]) - cond = df.applymap(lambda x: x > 0) - - result = df.where(cond) 
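# --- Small self-contained sketch (assumes only pandas/NumPy; names are illustrative) of
# the core DataFrame.where contract exercised by the tests being moved out of this file:
# values are kept where the boolean condition holds and replaced elsewhere (NaN by
# default, or `other` when given); DataFrame.mask is the complement.
import numpy as np
from pandas import DataFrame

df = DataFrame({"a": [1.0, -2.0, 3.0], "b": [-4.0, 5.0, -6.0]})
cond = df > 0

kept = df.where(cond)           # negatives become NaN
filled = df.where(cond, 0.0)    # negatives become 0.0
masked = df.mask(cond)          # complement: positives become NaN

assert kept.isna().sum().sum() == 3
assert (filled >= 0).all().all()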
- tm.assert_frame_equal(result, df) - - def test_where_align(self): - def create(): - df = DataFrame(np.random.randn(10, 3)) - df.iloc[3:5, 0] = np.nan - df.iloc[4:6, 1] = np.nan - df.iloc[5:8, 2] = np.nan - return df - - # series - df = create() - expected = df.fillna(df.mean()) - result = df.where(pd.notna(df), df.mean(), axis="columns") - tm.assert_frame_equal(result, expected) - - df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") - tm.assert_frame_equal(df, expected) - - df = create().fillna(0) - expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) - result = df.where(df > 0, df[0], axis="index") - tm.assert_frame_equal(result, expected) - result = df.where(df > 0, df[0], axis="rows") - tm.assert_frame_equal(result, expected) - - # frame - df = create() - expected = df.fillna(1) - result = df.where( - pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) - ) - tm.assert_frame_equal(result, expected) - - def test_where_complex(self): - # GH 6345 - expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) - df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) - df[df.abs() >= 5] = np.nan - tm.assert_frame_equal(df, expected) - - def test_where_axis(self): - # GH 9736 - df = DataFrame(np.random.randn(2, 2)) - mask = DataFrame([[False, False], [False, False]]) - s = Series([0, 1]) - - expected = DataFrame([[0, 0], [1, 1]], dtype="float64") - result = df.where(mask, s, axis="index") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[0, 1], [0, 1]], dtype="float64") - result = df.where(mask, s, axis="columns") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - # Upcast needed - df = DataFrame([[1, 2], [3, 4]], dtype="int64") - mask = DataFrame([[False, False], [False, False]]) - s = Series([0, np.nan]) - - expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") - result = df.where(mask, s, axis="index") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[0, np.nan], [0, np.nan]]) - result = df.where(mask, s, axis="columns") - tm.assert_frame_equal(result, expected) - - expected = DataFrame( - { - 0: np.array([0, 0], dtype="int64"), - 1: np.array([np.nan, np.nan], dtype="float64"), - } - ) - result = df.copy() - result.where(mask, s, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - # Multiple dtypes (=> multiple Blocks) - df = pd.concat( - [ - DataFrame(np.random.randn(10, 2)), - DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), - ], - ignore_index=True, - axis=1, - ) - mask = DataFrame(False, columns=df.columns, index=df.index) - s1 = Series(1, index=df.columns) - s2 = Series(2, index=df.index) - - result = df.where(mask, s1, axis="columns") - expected = DataFrame(1.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype("int64") - expected[3] = expected[3].astype("int64") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s1, axis="columns", inplace=True) - tm.assert_frame_equal(result, expected) - - result = df.where(mask, s2, axis="index") - expected = DataFrame(2.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype("int64") - 
expected[3] = expected[3].astype("int64") - tm.assert_frame_equal(result, expected) - - result = df.copy() - result.where(mask, s2, axis="index", inplace=True) - tm.assert_frame_equal(result, expected) - - # DataFrame vs DataFrame - d1 = df.copy().drop(1, axis=0) - expected = df.copy() - expected.loc[1, :] = np.nan - - result = df.where(mask, d1) - tm.assert_frame_equal(result, expected) - result = df.where(mask, d1, axis="index") - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d1, inplace=True) - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d1, inplace=True, axis="index") - tm.assert_frame_equal(result, expected) - - d2 = df.copy().drop(1, axis=1) - expected = df.copy() - expected.loc[:, 1] = np.nan - - result = df.where(mask, d2) - tm.assert_frame_equal(result, expected) - result = df.where(mask, d2, axis="columns") - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d2, inplace=True) - tm.assert_frame_equal(result, expected) - result = df.copy() - result.where(mask, d2, inplace=True, axis="columns") - tm.assert_frame_equal(result, expected) - - def test_where_callable(self): - # GH 12533 - df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - result = df.where(lambda x: x > 4, lambda x: x + 1) - exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, df.where(df > 4, df + 1)) - - # return ndarray and scalar - result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99) - exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, df.where(df % 2 == 0, 99)) - - # chain - result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) - exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) - tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) - - def test_where_tz_values(self, tz_naive_fixture): - df1 = DataFrame( - DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), - columns=["date"], - ) - df2 = DataFrame( - DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), - columns=["date"], - ) - mask = DataFrame([True, True, False], columns=["date"]) - exp = DataFrame( - DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), - columns=["date"], - ) - result = df1.where(mask, df2) - tm.assert_frame_equal(exp, result) - def test_mask(self): df = DataFrame(np.random.randn(5, 3)) cond = df > 0 @@ -3402,65 +2219,6 @@ def test_interval_index(self): tm.assert_series_equal(result, expected) -class TestDataFrameIndexingDatetimeWithTZ: - def test_setitem(self, timezone_frame): - - df = timezone_frame - idx = df["B"].rename("foo") - - # setitem - df["C"] = idx - tm.assert_series_equal(df["C"], Series(idx, name="C")) - - df["D"] = "foo" - df["D"] = idx - tm.assert_series_equal(df["D"], Series(idx, name="D")) - del df["D"] - - # assert that A & C are not sharing the same base (e.g. 
they - # are copies) - b1 = df._data.blocks[1] - b2 = df._data.blocks[2] - tm.assert_extension_array_equal(b1.values, b2.values) - assert id(b1.values._data.base) != id(b2.values._data.base) - - # with nan - df2 = df.copy() - df2.iloc[1, 1] = pd.NaT - df2.iloc[1, 2] = pd.NaT - result = df2["B"] - tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) - tm.assert_series_equal(df2.dtypes, df.dtypes) - - def test_set_reset(self): - - idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype, "M8[ns, US/Eastern" - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) - - def test_transpose(self, timezone_frame): - - result = timezone_frame.T - expected = DataFrame(timezone_frame.values.T) - expected.index = ["A", "B", "C"] - tm.assert_frame_equal(result, expected) - - def test_scalar_assignment(self): - # issue #19843 - df = pd.DataFrame(index=(0, 1, 2)) - df["now"] = pd.Timestamp("20130101", tz="UTC") - expected = pd.DataFrame( - {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2] - ) - tm.assert_frame_equal(df, expected) - - class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): @@ -3509,350 +2267,3 @@ def test_transpose(self, uint64_frame): expected = DataFrame(uint64_frame.values.T) expected.index = ["A", "B"] tm.assert_frame_equal(result, expected) - - -class TestDataFrameIndexingCategorical: - def test_assignment(self): - # assignment - df = DataFrame( - {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} - ) - labels = Categorical( - ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - ) - - df = df.sort_values(by=["value"], ascending=True) - s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) - d = s.values - df["D"] = d - str(df) - - result = df.dtypes - expected = Series( - [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], - index=["value", "D"], - ) - tm.assert_series_equal(result, expected) - - df["E"] = s - str(df) - - result = df.dtypes - expected = Series( - [ - np.dtype("int32"), - CategoricalDtype(categories=labels, ordered=False), - CategoricalDtype(categories=labels, ordered=False), - ], - index=["value", "D", "E"], - ) - tm.assert_series_equal(result, expected) - - result1 = df["D"] - result2 = df["E"] - tm.assert_categorical_equal(result1._data._block.values, d) - - # sorting - s.name = "E" - tm.assert_series_equal(result2.sort_index(), s.sort_index()) - - cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) - df = DataFrame(Series(cat)) - - def test_assigning_ops(self): - # systematically test the assigning operations: - # for all slicing ops: - # for value in categories and value not in categories: - - # - assign a single value -> exp_single_cats_value - - # - assign a complete row (mixed values) -> exp_single_row - - # assign multiple rows (mixed values) (-> array) -> exp_multi_row - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - - cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) - idx = Index(["h", "i", "j", "k", "l", "m", "n"]) - values = [1, 1, 1, 1, 1, 1, 1] - orig = DataFrame({"cats": cats, "values": values}, index=idx) - - # the expected values - # changed single row - cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], 
categories=["a", "b"]) - idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) - - # changed multiple rows - cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) - - # changed part of the cats column - cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values3 = [1, 1, 1, 1, 1, 1, 1] - exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) - - # changed single value in cats col - cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values4 = [1, 1, 1, 1, 1, 1, 1] - exp_single_cats_value = DataFrame( - {"cats": cats4, "values": values4}, index=idx4 - ) - - # iloc - # ############### - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.iloc[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.iloc[df.index == "j", 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2, 0] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.iloc[2, :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2, :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.iloc[2:4, :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.iloc[2:4, :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.iloc[2:4, 0] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.iloc[2:4, 0] = ["c", "c"] - - # loc - # ############## - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.loc["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.loc[df.index == "j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", "cats"] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", :] = 
["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", "cats"] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.loc["j":"k", "cats"] = ["c", "c"] - - # loc - # ############## - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.loc["j", df.columns[0]] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.loc[df.index == "j", df.columns[0]] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", df.columns[0]] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in categories set - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j", :] = ["c", 2] - - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) - - with pytest.raises(ValueError): - # different values - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", df.columns[0]] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError): - df.loc["j":"k", df.columns[0]] = ["c", "c"] - - # iat - df = orig.copy() - df.iat[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df = orig.copy() - df.iat[2, 0] = "c" - - # at - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - with pytest.raises(ValueError): - df 
= orig.copy() - df.at["j", "cats"] = "c" - - # fancy indexing - catsf = Categorical( - ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] - ) - idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) - valuesf = [1, 1, 3, 3, 1, 1, 1] - df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) - - exp_fancy = exp_multi_row.copy() - exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) - - df[df["cats"] == "c"] = ["b", 2] - # category c is kept in .categories - tm.assert_frame_equal(df, exp_fancy) - - # set_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - with pytest.raises(ValueError): - df = orig.copy() - df.at["j", "cats"] = "c" - - # Assigning a Category to parts of a int/... column uses the values of - # the Categorical - df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) - exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) - df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) - df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp) - - def test_functions_no_warnings(self): - df = DataFrame({"value": np.random.randint(0, 100, 20)}) - labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] - with tm.assert_produces_warning(False): - df["group"] = pd.cut( - df.value, range(0, 105, 10), right=False, labels=labels - ) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py new file mode 100644 index 0000000000000..df1b128dcd227 --- /dev/null +++ b/pandas/tests/frame/indexing/test_where.py @@ -0,0 +1,582 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range, isna +import pandas._testing as tm + + +class TestDataFrameIndexingWhere: + def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): + default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) + + def _safe_add(df): + # only add to the numeric items + def is_ok(s): + return ( + issubclass(s.dtype.type, (np.integer, np.floating)) + and s.dtype != "uint8" + ) + + return DataFrame( + dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) + ) + + def _check_get(df, cond, check_dtypes=True): + other1 = _safe_add(df) + rs = df.where(cond, other1) + rs2 = df.where(cond.values, other1) + for k, v in rs.items(): + exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) + tm.assert_series_equal(v, exp, check_names=False) + tm.assert_frame_equal(rs, rs2) + + # dtypes + if check_dtypes: + assert (rs.dtypes == df.dtypes).all() + + # check getting + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + cond = df > 0 + _check_get(df, cond) + + # upcasting case (GH # 2794) + df = DataFrame( + { + c: Series([1] * 3, dtype=c) + for c in ["float32", "float64", "int32", "int64"] + } + ) + df.iloc[1, :] = 0 + result = df.dtypes + expected = Series( + [ + np.dtype("float32"), + np.dtype("float64"), + np.dtype("int32"), + np.dtype("int64"), + ], + index=["float32", "float64", "int32", "int64"], + ) + + # when we don't preserve boolean casts + # + # expected = Series({ 'float32' : 1, 'float64' : 3 }) + + tm.assert_series_equal(result, expected) + + # aligning + def _check_align(df, cond, other, 
check_dtypes=True): + rs = df.where(cond, other) + for i, k in enumerate(rs.columns): + result = rs[k] + d = df[k].values + c = cond[k].reindex(df[k].index).fillna(False).values + + if is_scalar(other): + o = other + else: + if isinstance(other, np.ndarray): + o = Series(other[:, i], index=result.index).values + else: + o = other[k].values + + new_values = d if c.all() else np.where(c, d, o) + expected = Series(new_values, index=result.index, name=k) + + # since we can't always have the correct numpy dtype + # as numpy doesn't know how to downcast, don't check + tm.assert_series_equal(result, expected, check_dtype=False) + + # dtypes + # can't check dtype when other is an ndarray + + if check_dtypes and not isinstance(other, np.ndarray): + assert (rs.dtypes == df.dtypes).all() + + for df in [float_string_frame, mixed_float_frame, mixed_int_frame]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + + # other is a frame + cond = (df > 0)[1:] + _check_align(df, cond, _safe_add(df)) + + # check other is ndarray + cond = df > 0 + _check_align(df, cond, (_safe_add(df).values)) + + # integers are upcast, so don't check the dtypes + cond = df > 0 + check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) + _check_align(df, cond, np.nan, check_dtypes=check_dtypes) + + # invalid conditions + df = default_frame + err1 = (df + 1).values[0:2, :] + msg = "other must be the same shape as self when an ndarray" + with pytest.raises(ValueError, match=msg): + df.where(cond, err1) + + err2 = cond.iloc[:2, :].values + other1 = _safe_add(df) + msg = "Array conditional must be same shape as self" + with pytest.raises(ValueError, match=msg): + df.where(err2, other1) + + with pytest.raises(ValueError, match=msg): + df.mask(True) + with pytest.raises(ValueError, match=msg): + df.mask(0) + + # where inplace + def _check_set(df, cond, check_dtypes=True): + dfi = df.copy() + econd = cond.reindex_like(df).fillna(True) + expected = dfi.mask(~econd) + + dfi.where(cond, np.nan, inplace=True) + tm.assert_frame_equal(dfi, expected) + + # dtypes (and confirm upcasts) + if check_dtypes: + for k, v in df.dtypes.items(): + if issubclass(v.type, np.integer) and not cond[k].all(): + v = np.dtype("float64") + assert dfi[k].dtype == v + + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: + if df is float_string_frame: + with pytest.raises(TypeError): + df > 0 + continue + + cond = df > 0 + _check_set(df, cond) + + cond = df >= 0 + _check_set(df, cond) + + # aligning + cond = (df >= 0)[1:] + _check_set(df, cond) + + # GH 10218 + # test DataFrame.where with Series slicing + df = DataFrame({"a": range(3), "b": range(4, 7)}) + result = df.where(df["a"] == 1) + expected = df[df["a"] == 1].reindex(df.index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("klass", [list, tuple, np.array]) + def test_where_array_like(self, klass): + # see gh-15414 + df = DataFrame({"a": [1, 2, 3]}) + cond = [[False], [True], [True]] + expected = DataFrame({"a": [np.nan, 2, 3]}) + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + df["b"] = 2 + expected["b"] = [2, np.nan, 2] + cond = [[False, True], [True, False], [True, True]] + + result = df.where(klass(cond)) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "cond", + [ + [[1], [0], [1]], + Series([[2], [5], [7]]), + DataFrame({"a": [2, 5, 7]}), + [["True"], ["False"], ["True"]], + [[Timestamp("2017-01-01")], [pd.NaT], 
[Timestamp("2017-01-02")]], + ], + ) + def test_where_invalid_input_single(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3]}) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + @pytest.mark.parametrize( + "cond", + [ + [[0, 1], [1, 0], [1, 1]], + Series([[0, 2], [5, 0], [4, 7]]), + [["False", "True"], ["True", "False"], ["True", "True"]], + DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), + [ + [pd.NaT, Timestamp("2017-01-01")], + [Timestamp("2017-01-02"), pd.NaT], + [Timestamp("2017-01-03"), Timestamp("2017-01-03")], + ], + ], + ) + def test_where_invalid_input_multiple(self, cond): + # see gh-15414: only boolean arrays accepted + df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) + msg = "Boolean array expected for the condition" + + with pytest.raises(ValueError, match=msg): + df.where(cond) + + def test_where_dataframe_col_match(self): + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + cond = DataFrame([[True, False, True], [False, False, True]]) + + result = df.where(cond) + expected = DataFrame([[1.0, np.nan, 3], [np.nan, np.nan, 6]]) + tm.assert_frame_equal(result, expected) + + # this *does* align, though has no matching columns + cond.columns = ["a", "b", "c"] + result = df.where(cond) + expected = DataFrame(np.nan, index=df.index, columns=df.columns) + tm.assert_frame_equal(result, expected) + + def test_where_ndframe_align(self): + msg = "Array conditional must be same shape as self" + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + + cond = [True] + with pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[1, 2, 3], [np.nan, np.nan, np.nan]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + cond = np.array([False, True, False, True]) + with pytest.raises(ValueError, match=msg): + df.where(cond) + + expected = DataFrame([[np.nan, np.nan, np.nan], [4, 5, 6]]) + + out = df.where(Series(cond)) + tm.assert_frame_equal(out, expected) + + def test_where_bug(self): + # see gh-2793 + df = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" + ) + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) + result = df.where(df > 2, np.nan) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_where_bug_mixed(self, sint_dtype): + # see gh-2793 + df = DataFrame( + { + "a": np.array([1, 2, 3, 4], dtype=sint_dtype), + "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), + } + ) + + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) + + result = df.where(df > 2, np.nan) + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(result > 2, np.nan, inplace=True) + tm.assert_frame_equal(result, expected) + + def test_where_bug_transposition(self): + # see gh-7506 + a = DataFrame({0: [1, 2], 1: [3, 4], 2: [5, 6]}) + b = DataFrame({0: [np.nan, 8], 1: [9, np.nan], 2: [np.nan, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + a = DataFrame({0: [4, 6], 1: [1, 0]}) + b = DataFrame({0: [np.nan, 3], 1: [3, np.nan]}) + do_not_replace = b.isna() | (a > b) + + expected = a.copy() + expected[~do_not_replace] = b + + result = 
a.where(do_not_replace, b) + tm.assert_frame_equal(result, expected) + + def test_where_datetime(self): + + # GH 3311 + df = DataFrame( + dict( + A=date_range("20130102", periods=5), + B=date_range("20130104", periods=5), + C=np.random.randn(5), + ) + ) + + stamp = datetime(2013, 1, 3) + with pytest.raises(TypeError): + df > stamp + + result = df[df.iloc[:, :-1] > stamp] + + expected = df.copy() + expected.loc[[0, 1], "A"] = np.nan + expected.loc[:, "C"] = np.nan + tm.assert_frame_equal(result, expected) + + def test_where_none(self): + # GH 4667 + # setting with None changes dtype + df = DataFrame({"series": Series(range(10))}).astype(float) + df[df > 7] = None + expected = DataFrame( + {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} + ) + tm.assert_frame_equal(df, expected) + + # GH 7656 + df = DataFrame( + [ + {"A": 1, "B": np.nan, "C": "Test"}, + {"A": np.nan, "B": "Test", "C": np.nan}, + ] + ) + msg = "boolean setting on mixed-type" + + with pytest.raises(TypeError, match=msg): + df.where(~isna(df), None, inplace=True) + + def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): + # see gh-21947 + df = pd.DataFrame(columns=["a"]) + cond = df.applymap(lambda x: x > 0) + + result = df.where(cond) + tm.assert_frame_equal(result, df) + + def test_where_align(self): + def create(): + df = DataFrame(np.random.randn(10, 3)) + df.iloc[3:5, 0] = np.nan + df.iloc[4:6, 1] = np.nan + df.iloc[5:8, 2] = np.nan + return df + + # series + df = create() + expected = df.fillna(df.mean()) + result = df.where(pd.notna(df), df.mean(), axis="columns") + tm.assert_frame_equal(result, expected) + + df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") + tm.assert_frame_equal(df, expected) + + df = create().fillna(0) + expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) + result = df.where(df > 0, df[0], axis="index") + tm.assert_frame_equal(result, expected) + result = df.where(df > 0, df[0], axis="rows") + tm.assert_frame_equal(result, expected) + + # frame + df = create() + expected = df.fillna(1) + result = df.where( + pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) + ) + tm.assert_frame_equal(result, expected) + + def test_where_complex(self): + # GH 6345 + expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) + df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) + df[df.abs() >= 5] = np.nan + tm.assert_frame_equal(df, expected) + + def test_where_axis(self): + # GH 9736 + df = DataFrame(np.random.randn(2, 2)) + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, 1]) + + expected = DataFrame([[0, 0], [1, 1]], dtype="float64") + result = df.where(mask, s, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[0, 1], [0, 1]], dtype="float64") + result = df.where(mask, s, axis="columns") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + # Upcast needed + df = DataFrame([[1, 2], [3, 4]], dtype="int64") + mask = DataFrame([[False, False], [False, False]]) + s = Series([0, np.nan]) + + expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") + result = df.where(mask, s, axis="index") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s, axis="index", inplace=True) + tm.assert_frame_equal(result, 
expected) + + expected = DataFrame([[0, np.nan], [0, np.nan]]) + result = df.where(mask, s, axis="columns") + tm.assert_frame_equal(result, expected) + + expected = DataFrame( + { + 0: np.array([0, 0], dtype="int64"), + 1: np.array([np.nan, np.nan], dtype="float64"), + } + ) + result = df.copy() + result.where(mask, s, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + # Multiple dtypes (=> multiple Blocks) + df = pd.concat( + [ + DataFrame(np.random.randn(10, 2)), + DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), + ], + ignore_index=True, + axis=1, + ) + mask = DataFrame(False, columns=df.columns, index=df.index) + s1 = Series(1, index=df.columns) + s2 = Series(2, index=df.index) + + result = df.where(mask, s1, axis="columns") + expected = DataFrame(1.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s1, axis="columns", inplace=True) + tm.assert_frame_equal(result, expected) + + result = df.where(mask, s2, axis="index") + expected = DataFrame(2.0, columns=df.columns, index=df.index) + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") + tm.assert_frame_equal(result, expected) + + result = df.copy() + result.where(mask, s2, axis="index", inplace=True) + tm.assert_frame_equal(result, expected) + + # DataFrame vs DataFrame + d1 = df.copy().drop(1, axis=0) + expected = df.copy() + expected.loc[1, :] = np.nan + + result = df.where(mask, d1) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d1, axis="index") + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True) + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d1, inplace=True, axis="index") + tm.assert_frame_equal(result, expected) + + d2 = df.copy().drop(1, axis=1) + expected = df.copy() + expected.loc[:, 1] = np.nan + + result = df.where(mask, d2) + tm.assert_frame_equal(result, expected) + result = df.where(mask, d2, axis="columns") + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True) + tm.assert_frame_equal(result, expected) + result = df.copy() + result.where(mask, d2, inplace=True, axis="columns") + tm.assert_frame_equal(result, expected) + + def test_where_callable(self): + # GH 12533 + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + result = df.where(lambda x: x > 4, lambda x: x + 1) + exp = DataFrame([[2, 3, 4], [5, 5, 6], [7, 8, 9]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df > 4, df + 1)) + + # return ndarray and scalar + result = df.where(lambda x: (x % 2 == 0).values, lambda x: 99) + exp = DataFrame([[99, 2, 99], [4, 99, 6], [99, 8, 99]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, df.where(df % 2 == 0, 99)) + + # chain + result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) + exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) + tm.assert_frame_equal(result, exp) + tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) + + def test_where_tz_values(self, tz_naive_fixture): + df1 = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + columns=["date"], + ) + df2 = DataFrame( + DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + mask = DataFrame([True, True, False], 
columns=["date"]) + exp = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + result = df1.where(mask, df2) + tm.assert_frame_equal(exp, result) diff --git a/pandas/tests/frame/methods/__init__.py b/pandas/tests/frame/methods/__init__.py new file mode 100644 index 0000000000000..245594bfdc9e7 --- /dev/null +++ b/pandas/tests/frame/methods/__init__.py @@ -0,0 +1,7 @@ +""" +Test files dedicated to individual (stand-alone) DataFrame methods + +Ideally these files/tests should correspond 1-to-1 with tests.series.methods + +These may also present opportunities for sharing/de-duplicating test code. +""" diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py new file mode 100644 index 0000000000000..d128a51f4b390 --- /dev/null +++ b/pandas/tests/frame/methods/test_append.py @@ -0,0 +1,195 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm + + +class TestDataFrameAppend: + def test_append_empty_list(self): + # GH 28769 + df = DataFrame() + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df + + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df # .append() should return a new object + + def test_append_series_dict(self): + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + + series = df.loc[4] + msg = "Indexes have overlapping values" + with pytest.raises(ValueError, match=msg): + df.append(series, verify_integrity=True) + + series.name = None + msg = "Can only append a Series if ignore_index=True" + with pytest.raises(TypeError, match=msg): + df.append(series, verify_integrity=True) + + result = df.append(series[::-1], ignore_index=True) + expected = df.append( + DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True + ) + tm.assert_frame_equal(result, expected) + + # dict + result = df.append(series.to_dict(), ignore_index=True) + tm.assert_frame_equal(result, expected) + + result = df.append(series[::-1][:3], ignore_index=True) + expected = df.append( + DataFrame({0: series[::-1][:3]}).T, ignore_index=True, sort=True + ) + tm.assert_frame_equal(result, expected.loc[:, result.columns]) + + # can append when name set + row = df.loc[4] + row.name = 5 + result = df.append(row) + expected = df.append(df[-1:], ignore_index=True) + tm.assert_frame_equal(result, expected) + + def test_append_list_of_series_dicts(self): + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + + dicts = [x.to_dict() for idx, x in df.iterrows()] + + result = df.append(dicts, ignore_index=True) + expected = df.append(df, ignore_index=True) + tm.assert_frame_equal(result, expected) + + # different columns + dicts = [ + {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4}, + {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8}, + ] + result = df.append(dicts, ignore_index=True, sort=True) + expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) + + def test_append_missing_cols(self): + # GH22252 + # exercise the conditional branch in append method where the data + # to be appended is a list and does not contain all columns that are in + # the target DataFrame + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + + dicts 
= [{"foo": 9}, {"bar": 10}] + with tm.assert_produces_warning(None): + result = df.append(dicts, ignore_index=True, sort=True) + + expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) + + def test_append_empty_dataframe(self): + + # Empty df append empty df + df1 = DataFrame() + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + # Non-empty df append empty df + df1 = DataFrame(np.random.randn(5, 2)) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + # Empty df with columns append empty df + df1 = DataFrame(columns=["bar", "foo"]) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + # Non-Empty df with columns append empty df + df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + def test_append_dtypes(self): + + # GH 5754 + # row appends of different dtypes (so need to do by-item) + # can sometimes infer the correct type + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5)) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": "foo"}, index=range(1, 2)) + result = df1.append(df2) + expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]}) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) + result = df1.append(df2) + expected = DataFrame( + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) + result = df1.append(df2) + expected = DataFrame( + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": np.nan}, index=range(1)) + df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) + result = df1.append(df2) + expected = DataFrame( + {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} + ) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object) + result = df1.append(df2) + expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"] + ) + def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): + # GH 30238 + tz = tz_naive_fixture + df = pd.DataFrame([pd.Timestamp(timestamp, tz=tz)]) + result = df.append(df.iloc[0]).iloc[-1] + expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data, dtype", + [ + ([1], pd.Int64Dtype()), + ([1], pd.CategoricalDtype()), + ([pd.Interval(left=0, right=5)], pd.IntervalDtype()), + ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")), + ([1], pd.SparseDtype()), + ], + ) + def test_other_dtypes(self, data, dtype): + df = pd.DataFrame(data, dtype=dtype) + 
result = df.append(df.iloc[0]).iloc[-1] + expected = pd.Series(data, name=0, dtype=dtype) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/methods/test_asof.py similarity index 90% rename from pandas/tests/frame/test_asof.py rename to pandas/tests/frame/methods/test_asof.py index 9a7d806c79dc3..0291be0a4083e 100644 --- a/pandas/tests/frame/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from pandas import DataFrame, Series, Timestamp, date_range, to_datetime -import pandas.util.testing as tm +from pandas import DataFrame, Period, Series, Timestamp, date_range, to_datetime +import pandas._testing as tm @pytest.fixture @@ -30,6 +30,7 @@ def test_basic(self, date_range_frame): ub = df.index[30] dates = list(dates) + result = df.asof(dates) assert result.notna().all(1).all() @@ -65,9 +66,12 @@ def test_missing(self, date_range_frame): # no match found - `where` value before earliest date in index N = 10 df = date_range_frame.iloc[:N].copy() + result = df.asof("1989-12-31") - expected = Series(index=["A", "B"], name=Timestamp("1989-12-31")) + expected = Series( + index=["A", "B"], name=Timestamp("1989-12-31"), dtype=np.float64 + ) tm.assert_series_equal(result, expected) result = df.asof(to_datetime(["1989-12-31"])) @@ -76,6 +80,12 @@ def test_missing(self, date_range_frame): ) tm.assert_frame_equal(result, expected) + # Check that we handle PeriodIndex correctly, dont end up with + # period.ordinal for series name + df = df.to_period("D") + result = df.asof("1989-12-31") + assert isinstance(result.name, Period) + def test_all_nans(self, date_range_frame): # GH 15713 # DataFrame is all nans @@ -130,5 +140,6 @@ def test_time_zone_aware_index(self, stamp, expected): Timestamp("2018-01-01 22:35:10.550+00:00"), ], ) + result = df.asof(stamp) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py new file mode 100644 index 0000000000000..34727da3b95ae --- /dev/null +++ b/pandas/tests/frame/methods/test_clip.py @@ -0,0 +1,157 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFrameClip: + def test_clip(self, float_frame): + median = float_frame.median().median() + original = float_frame.copy() + + double = float_frame.clip(upper=median, lower=median) + assert not (double.values != median).any() + + # Verify that float_frame was not changed inplace + assert (float_frame.values == original.values).all() + + def test_inplace_clip(self, float_frame): + # GH#15388 + median = float_frame.median().median() + frame_copy = float_frame.copy() + + frame_copy.clip(upper=median, lower=median, inplace=True) + assert not (frame_copy.values != median).any() + + def test_dataframe_clip(self): + # GH#2747 + df = DataFrame(np.random.randn(1000, 2)) + + for lb, ub in [(-1, 1), (1, -1)]: + clipped_df = df.clip(lb, ub) + + lb, ub = min(lb, ub), max(ub, lb) + lb_mask = df.values <= lb + ub_mask = df.values >= ub + mask = ~lb_mask & ~ub_mask + assert (clipped_df.values[lb_mask] == lb).all() + assert (clipped_df.values[ub_mask] == ub).all() + assert (clipped_df.values[mask] == df.values[mask]).all() + + def test_clip_mixed_numeric(self): + # TODO(jreback) + # clip on mixed integer or floats + # with integer clippers coerces to float + df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]}) + result = df.clip(1, 2) + expected = 
DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]}) + tm.assert_frame_equal(result, expected, check_like=True) + + # GH#24162, clipping now preserves numeric types per column + df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"]) + expected = df.dtypes + result = df.clip(upper=3).dtypes + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + def test_clip_against_series(self, inplace): + # GH#6966 + + df = DataFrame(np.random.randn(1000, 2)) + lb = Series(np.random.randn(1000)) + ub = lb + 1 + + original = df.copy() + clipped_df = df.clip(lb, ub, axis=0, inplace=inplace) + + if inplace: + clipped_df = df + + for i in range(2): + lb_mask = original.iloc[:, i] <= lb + ub_mask = original.iloc[:, i] >= ub + mask = ~lb_mask & ~ub_mask + + result = clipped_df.loc[lb_mask, i] + tm.assert_series_equal(result, lb[lb_mask], check_names=False) + assert result.name == i + + result = clipped_df.loc[ub_mask, i] + tm.assert_series_equal(result, ub[ub_mask], check_names=False) + assert result.name == i + + tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])]) + @pytest.mark.parametrize( + "axis,res", + [ + (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]), + (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), + ], + ) + def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): + # GH#15390 + original = simple_frame.copy(deep=True) + + result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) + + expected = pd.DataFrame(res, columns=original.columns, index=original.index) + if inplace: + result = original + tm.assert_frame_equal(result, expected, check_exact=True) + + @pytest.mark.parametrize("axis", [0, 1, None]) + def test_clip_against_frame(self, axis): + df = DataFrame(np.random.randn(1000, 2)) + lb = DataFrame(np.random.randn(1000, 2)) + ub = lb + 1 + + clipped_df = df.clip(lb, ub, axis=axis) + + lb_mask = df <= lb + ub_mask = df >= ub + mask = ~lb_mask & ~ub_mask + + tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) + tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) + tm.assert_frame_equal(clipped_df[mask], df[mask]) + + def test_clip_against_unordered_columns(self): + # GH#20911 + df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"]) + df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"]) + df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"]) + result_upper = df1.clip(lower=0, upper=df2) + expected_upper = df1.clip(lower=0, upper=df2[df1.columns]) + result_lower = df1.clip(lower=df3, upper=3) + expected_lower = df1.clip(lower=df3[df1.columns], upper=3) + result_lower_upper = df1.clip(lower=df3, upper=df2) + expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns]) + tm.assert_frame_equal(result_upper, expected_upper) + tm.assert_frame_equal(result_lower, expected_lower) + tm.assert_frame_equal(result_lower_upper, expected_lower_upper) + + def test_clip_with_na_args(self, float_frame): + """Should process np.nan argument as None """ + # GH#17276 + tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) + tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) + + # GH#19992 + df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) + + result = df.clip(lower=[4, 5, np.nan], axis=0) + expected = DataFrame( + 
{"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} + ) + tm.assert_frame_equal(result, expected) + + result = df.clip(lower=[4, 5, np.nan], axis=1) + expected = DataFrame( + {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_count.py b/pandas/tests/frame/methods/test_count.py new file mode 100644 index 0000000000000..13a93e3efc48c --- /dev/null +++ b/pandas/tests/frame/methods/test_count.py @@ -0,0 +1,36 @@ +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFrameCount: + def test_count(self): + # corner case + frame = DataFrame() + ct1 = frame.count(1) + assert isinstance(ct1, Series) + + ct2 = frame.count(0) + assert isinstance(ct2, Series) + + # GH#423 + df = DataFrame(index=range(10)) + result = df.count(1) + expected = Series(0, index=df.index) + tm.assert_series_equal(result, expected) + + df = DataFrame(columns=range(10)) + result = df.count(0) + expected = Series(0, index=df.columns) + tm.assert_series_equal(result, expected) + + df = DataFrame() + result = df.count() + expected = Series(0, index=[]) + tm.assert_series_equal(result, expected) + + def test_count_objects(self, float_string_frame): + dm = DataFrame(float_string_frame._series) + df = DataFrame(float_string_frame._series) + + tm.assert_series_equal(dm.count(), df.count()) + tm.assert_series_equal(dm.count(1), df.count(1)) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py new file mode 100644 index 0000000000000..5c13b60aae0d0 --- /dev/null +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -0,0 +1,272 @@ +import warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Series, isna +import pandas._testing as tm + + +class TestDataFrameCov: + def test_cov(self, float_frame, float_string_frame): + # min_periods no NAs (corner case) + expected = float_frame.cov() + result = float_frame.cov(min_periods=len(float_frame)) + + tm.assert_frame_equal(expected, result) + + result = float_frame.cov(min_periods=len(float_frame) + 1) + assert isna(result.values).all() + + # with NAs + frame = float_frame.copy() + frame["A"][:5] = np.nan + frame["B"][5:10] = np.nan + result = float_frame.cov(min_periods=len(float_frame) - 8) + expected = float_frame.cov() + expected.loc["A", "B"] = np.nan + expected.loc["B", "A"] = np.nan + + # regular + float_frame["A"][:5] = np.nan + float_frame["B"][:10] = np.nan + cov = float_frame.cov() + + tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"])) + + # exclude non-numeric types + result = float_string_frame.cov() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() + tm.assert_frame_equal(result, expected) + + # Single column frame + df = DataFrame(np.linspace(0.0, 1.0, 10)) + result = df.cov() + expected = DataFrame( + np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns + ) + tm.assert_frame_equal(result, expected) + df.loc[0] = np.nan + result = df.cov() + expected = DataFrame( + np.cov(df.values[1:].T).reshape((1, 1)), + index=df.columns, + columns=df.columns, + ) + tm.assert_frame_equal(result, expected) + + +class TestDataFrameCorr: + # DataFrame.corr(), as opposed to DataFrame.corrwith + + @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"]) + @td.skip_if_no_scipy + def test_corr_scipy_method(self, 
float_frame, method): + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan + + correls = float_frame.corr(method=method) + expected = float_frame["A"].corr(float_frame["C"], method=method) + tm.assert_almost_equal(correls["A"]["C"], expected) + + # --------------------------------------------------------------------- + + @td.skip_if_no_scipy + def test_corr_non_numeric(self, float_frame, float_string_frame): + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan + + # exclude non-numeric types + result = float_string_frame.corr() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() + tm.assert_frame_equal(result, expected) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + def test_corr_nooverlap(self, meth): + # nothing in common + df = DataFrame( + { + "A": [1, 1.5, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1.5, 1], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + rs = df.corr(meth) + assert isna(rs.loc["A", "B"]) + assert isna(rs.loc["B", "A"]) + assert rs.loc["A", "A"] == 1 + assert rs.loc["B", "B"] == 1 + assert isna(rs.loc["C", "C"]) + + @td.skip_if_no_scipy + @pytest.mark.parametrize("meth", ["pearson", "spearman"]) + def test_corr_constant(self, meth): + # constant --> all NA + + df = DataFrame( + { + "A": [1, 1, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1, 1], + } + ) + rs = df.corr(meth) + assert isna(rs.values).all() + + @td.skip_if_no_scipy + def test_corr_int_and_boolean(self): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it needs to be properly handled + df = DataFrame({"a": [True, False], "b": [1, 0]}) + + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + for meth in ["pearson", "kendall", "spearman"]: + + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + result = df.corr(meth) + tm.assert_frame_equal(result, expected) + + def test_corr_cov_independent_index_column(self): + # GH#14617 + df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) + for method in ["cov", "corr"]: + result = getattr(df, method)() + assert result.index is not result.columns + assert result.index.equals(result.columns) + + def test_corr_invalid_method(self): + # GH#22298 + df = pd.DataFrame(np.random.normal(size=(10, 2))) + msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " + with pytest.raises(ValueError, match=msg): + df.corr(method="____") + + def test_corr_int(self): + # dtypes other than float64 GH#1761 + df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) + + df3.cov() + df3.corr() + + +class TestDataFrameCorrWith: + def test_corrwith(self, datetime_frame): + a = datetime_frame + noise = Series(np.random.randn(len(a)), index=a.index) + + b = datetime_frame.add(noise, axis=0) + + # make sure order does not matter + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + del b["B"] + + colcorr = a.corrwith(b, axis=0) + tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) + + rowcorr = a.corrwith(b, axis=1) + tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) + + dropped = a.corrwith(b, axis=0, drop=True) + tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) + assert "B" not in dropped + + dropped = a.corrwith(b, axis=1, drop=True) + assert a.index[-1] not in dropped.index + + # non time-series data + index = ["a", "b", "c", "d", "e"] + columns = 
["one", "two", "three", "four"] + df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns) + df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) + correls = df1.corrwith(df2, axis=1) + for row in index[:4]: + tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) + + def test_corrwith_with_objects(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + cols = ["A", "B", "C", "D"] + + df1["obj"] = "foo" + df2["obj"] = "bar" + + result = df1.corrwith(df2) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) + tm.assert_series_equal(result, expected) + + result = df1.corrwith(df2, axis=1) + expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) + tm.assert_series_equal(result, expected) + + def test_corrwith_series(self, datetime_frame): + result = datetime_frame.corrwith(datetime_frame["A"]) + expected = datetime_frame.apply(datetime_frame["A"].corr) + + tm.assert_series_equal(result, expected) + + def test_corrwith_matches_corrcoef(self): + df1 = DataFrame(np.arange(10000), columns=["a"]) + df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) + c1 = df1.corrwith(df2)["a"] + c2 = np.corrcoef(df1["a"], df2["a"])[0][1] + + tm.assert_almost_equal(c1, c2) + assert c1 < 1 + + def test_corrwith_mixed_dtypes(self): + # GH#18570 + df = pd.DataFrame( + {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} + ) + s = pd.Series([0, 6, 7, 3]) + result = df.corrwith(s) + corrs = [df["a"].corr(s), df["b"].corr(s)] + expected = pd.Series(data=corrs, index=["a", "b"]) + tm.assert_series_equal(result, expected) + + def test_corrwith_index_intersection(self): + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) + + result = df1.corrwith(df2, drop=True).index.sort_values() + expected = df1.columns.intersection(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def test_corrwith_index_union(self): + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) + + result = df1.corrwith(df2, drop=False).index.sort_values() + expected = df1.columns.union(df2.columns).sort_values() + tm.assert_index_equal(result, expected) + + def test_corrwith_dup_cols(self): + # GH#21925 + df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) + df2 = df1.copy() + df2 = pd.concat((df2, df2[0]), axis=1) + + result = df1.corrwith(df2) + expected = pd.Series(np.ones(4), index=[0, 0, 1, 2]) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_spearman(self): + # GH#21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df ** 2, method="spearman") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_kendall(self): + # GH#21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df ** 2, method="kendall") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py new file mode 100644 index 0000000000000..251563e51e15a --- /dev/null +++ b/pandas/tests/frame/methods/test_describe.py @@ -0,0 +1,333 @@ +import numpy as np + +import pandas as pd +from pandas import Categorical, DataFrame, Series, Timestamp, date_range +import pandas._testing as tm + + +class 
TestDataFrameDescribe: + def test_describe_bool_in_mixed_frame(self): + df = DataFrame( + { + "string_data": ["a", "b", "c", "d", "e"], + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + } + ) + + # Integer data are included in .describe() output, + # Boolean and string data are not. + result = df.describe() + expected = DataFrame( + {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + # Top value is a boolean value that is False + result = df.describe(include=["bool"]) + + expected = DataFrame( + {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] + ) + tm.assert_frame_equal(result, expected) + + def test_describe_empty_object(self): + # GH#27183 + df = pd.DataFrame({"A": [None, None]}, dtype=object) + result = df.describe() + expected = pd.DataFrame( + {"A": [0, 0, np.nan, np.nan]}, + dtype=object, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + result = df.iloc[:0].describe() + tm.assert_frame_equal(result, expected) + + def test_describe_bool_frame(self): + # GH#13891 + df = pd.DataFrame( + { + "bool_data_1": [False, False, True, True], + "bool_data_2": [False, True, True, True], + } + ) + result = df.describe() + expected = DataFrame( + {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame( + { + "bool_data": [False, False, True, True, False], + "int_data": [0, 1, 2, 3, 4], + } + ) + result = df.describe() + expected = DataFrame( + {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame( + {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} + ) + result = df.describe() + expected = DataFrame( + {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, + index=["count", "unique", "top", "freq"], + ) + tm.assert_frame_equal(result, expected) + + def test_describe_categorical(self): + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) + cat = df + + # Categoricals should not show up together with numerical columns + result = cat.describe() + assert len(result.columns) == 1 + + # In a frame, describe() for the cat should be the same as for string + # arrays (count, unique, top, freq) + + cat = Categorical( + ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True + ) + s = Series(cat) + result = s.describe() + expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) + tm.assert_series_equal(result, expected) + + cat = Series(Categorical(["a", "b", "c", "c"])) + df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) + result = df3.describe() + tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) + + def test_describe_empty_categorical_column(self): + # GH#26397 + # Ensure the index of an empty categorical DataFrame column + # also contains (count, unique, top, freq) + df = pd.DataFrame({"empty_col": Categorical([])}) + result = df.describe() + expected = 
DataFrame( + {"empty_col": [0, 0, np.nan, np.nan]}, + index=["count", "unique", "top", "freq"], + dtype="object", + ) + tm.assert_frame_equal(result, expected) + # ensure NaN, not None + assert np.isnan(result.iloc[2, 0]) + assert np.isnan(result.iloc[3, 0]) + + def test_describe_categorical_columns(self): + # GH#11558 + columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") + df = DataFrame( + { + "int1": [10, 20, 30, 40, 50], + "int2": [10, 20, 30, 40, 50], + "obj": ["A", 0, None, "X", 1], + }, + columns=columns, + ) + result = df.describe() + + exp_columns = pd.CategoricalIndex( + ["int1", "int2"], + categories=["int1", "int2", "obj"], + ordered=True, + name="XXX", + ) + expected = DataFrame( + { + "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], + "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + columns=exp_columns, + ) + + tm.assert_frame_equal(result, expected) + tm.assert_categorical_equal(result.columns.values, expected.columns.values) + + def test_describe_datetime_columns(self): + columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], + freq="MS", + tz="US/Eastern", + name="XXX", + ) + df = DataFrame( + { + 0: [10, 20, 30, 40, 50], + 1: [10, 20, 30, 40, 50], + 2: ["A", 0, None, "X", 1], + } + ) + df.columns = columns + result = df.describe() + + exp_columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" + ) + expected = DataFrame( + { + 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], + 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + expected.columns = exp_columns + tm.assert_frame_equal(result, expected) + assert result.columns.freq == "MS" + assert result.columns.tz == expected.columns.tz + + def test_describe_timedelta_values(self): + # GH#6145 + t1 = pd.timedelta_range("1 days", freq="D", periods=5) + t2 = pd.timedelta_range("1 hours", freq="H", periods=5) + df = pd.DataFrame({"t1": t1, "t2": t2}) + + expected = DataFrame( + { + "t1": [ + 5, + pd.Timedelta("3 days"), + df.iloc[:, 0].std(), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.Timedelta("4 days"), + pd.Timedelta("5 days"), + ], + "t2": [ + 5, + pd.Timedelta("3 hours"), + df.iloc[:, 1].std(), + pd.Timedelta("1 hours"), + pd.Timedelta("2 hours"), + pd.Timedelta("3 hours"), + pd.Timedelta("4 hours"), + pd.Timedelta("5 hours"), + ], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + + result = df.describe() + tm.assert_frame_equal(result, expected) + + exp_repr = ( + " t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00" + ) + assert repr(result) == exp_repr + + def test_describe_tz_values(self, tz_naive_fixture): + # GH#21332 + tz = tz_naive_fixture + s1 = Series(range(5)) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s2 = Series(date_range(start, end, tz=tz)) + df = pd.DataFrame({"s1": s1, "s2": s2}) + + expected = DataFrame( + { + "s1": [ + 5, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2, + 1.581139, + 0, + 1, + 2, + 3, + 4, + ], + "s2": [ + 5, + 5, + s2.value_counts().index[0], + 1, + 
start.tz_localize(tz), + end.tz_localize(tz), + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + }, + index=[ + "count", + "unique", + "top", + "freq", + "first", + "last", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ], + ) + result = df.describe(include="all") + tm.assert_frame_equal(result, expected) + + def test_describe_percentiles_integer_idx(self): + # GH#26660 + df = pd.DataFrame({"x": [1]}) + pct = np.linspace(0, 1, 10 + 1) + result = df.describe(percentiles=pct) + + expected = DataFrame( + {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, + index=[ + "count", + "mean", + "std", + "min", + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + "max", + ], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py new file mode 100644 index 0000000000000..43c25f4c05c2d --- /dev/null +++ b/pandas/tests/frame/methods/test_diff.py @@ -0,0 +1,120 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, Timestamp, date_range +import pandas._testing as tm + + +class TestDataFrameDiff: + def test_diff(self, datetime_frame): + the_diff = datetime_frame.diff(1) + + tm.assert_series_equal( + the_diff["A"], datetime_frame["A"] - datetime_frame["A"].shift(1) + ) + + # int dtype + a = 10000000000000000 + b = a + 1 + s = Series([a, b]) + + rs = DataFrame({"s": s}).diff() + assert rs.s[1] == 1 + + # mixed numeric + tf = datetime_frame.astype("float32") + the_diff = tf.diff(1) + tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1)) + + # GH#10907 + df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])}) + df.insert(0, "x", 1) + result = df.diff(axis=1) + expected = pd.DataFrame( + {"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)} + ).astype("float64") + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis0(self, tz): + # GH#18578 + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) + + result = df.diff(axis=0) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "1 days"]), + 1: pd.TimedeltaIndex(["NaT", "1 days"]), + } + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis1(self, tz): + # GH#18578 + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) + if tz is None: + result = df.diff(axis=1) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "NaT"]), + 1: pd.TimedeltaIndex(["0 days", "0 days"]), + } + ) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(NotImplementedError): + result = df.diff(axis=1) + + def test_diff_timedelta(self): + # GH#4533 + df = DataFrame( + dict( + time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], + value=[1.0, 2.0], + ) + ) + + res = df.diff() + exp = DataFrame( + [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"] + ) + tm.assert_frame_equal(res, exp) + + def test_diff_mixed_dtype(self): + df = DataFrame(np.random.randn(5, 3)) + df["A"] = np.array([1, 2, 3, 4, 5], dtype=object) + + result = df.diff() + assert result[0].dtype == np.float64 + + def test_diff_neg_n(self, datetime_frame): + rs = datetime_frame.diff(-1) + xp = datetime_frame - 
datetime_frame.shift(-1) + tm.assert_frame_equal(rs, xp) + + def test_diff_float_n(self, datetime_frame): + rs = datetime_frame.diff(1.0) + xp = datetime_frame.diff(1) + tm.assert_frame_equal(rs, xp) + + def test_diff_axis(self): + # GH#9727 + df = DataFrame([[1.0, 2.0], [3.0, 4.0]]) + tm.assert_frame_equal( + df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]]) + ) + tm.assert_frame_equal( + df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]) + ) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py similarity index 81% rename from pandas/tests/frame/test_duplicates.py rename to pandas/tests/frame/methods/test_drop_duplicates.py index d2a1fc43d2046..fd4bae26ade57 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -3,95 +3,20 @@ import numpy as np import pytest -from pandas import DataFrame, Series -import pandas.util.testing as tm +from pandas import DataFrame +import pandas._testing as tm @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) -def test_duplicated_with_misspelled_column_name(subset): +def test_drop_duplicates_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) msg = re.escape("Index(['a'], dtype='object')") - with pytest.raises(KeyError, match=msg): - df.duplicated(subset) - with pytest.raises(KeyError, match=msg): df.drop_duplicates(subset) -@pytest.mark.slow -def test_duplicated_do_not_fail_on_wide_dataframes(): - # gh-21524 - # Given the wide dataframe with a lot of columns - # with different (important!) values - data = { - "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) - } - df = DataFrame(data).T - result = df.duplicated() - - # Then duplicates produce the bool Series as a result and don't fail during - # calculation. 
Actual values doesn't matter here, though usually it's all - # False in this case - assert isinstance(result, Series) - assert result.dtype == np.bool - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), - ], -) -def test_duplicated_keep(keep, expected): - df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - -@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), - ], -) -def test_duplicated_nan_none(keep, expected): - df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("keep", ["first", "last", False]) -@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) -def test_duplicated_subset(subset, keep): - df = DataFrame( - { - "A": [0, 1, 1, 2, 0], - "B": ["a", "b", "b", "c", "a"], - "C": [np.nan, 3, 3, None, np.nan], - } - ) - - if subset is None: - subset = list(df.columns) - elif isinstance(subset, str): - # need to have a DataFrame, not a Series - # -> select columns with singleton list, not string - subset = [subset] - - expected = df[subset].duplicated(keep=keep) - result = df.duplicated(keep=keep, subset=subset) - tm.assert_series_equal(result, expected) - - def test_drop_duplicates(): df = DataFrame( { @@ -188,17 +113,6 @@ def test_drop_duplicates(): assert df.duplicated(keep=keep).sum() == 0 -def test_duplicated_on_empty_frame(): - # GH 25184 - - df = DataFrame(columns=["a", "b"]) - dupes = df.duplicated("a") - - result = df[dupes] - expected = df.copy() - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_with_duplicate_column_names(): # GH17836 df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) @@ -477,3 +391,30 @@ def test_drop_duplicates_inplace(): expected = orig2.drop_duplicates(["A", "B"], keep=False) result = df2 tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize( + "origin_dict, output_dict, ignore_index, output_index", + [ + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ], +) +def test_drop_duplicates_ignore_index( + inplace, origin_dict, output_dict, ignore_index, output_index +): + # GH 30114 + df = DataFrame(origin_dict) + expected = DataFrame(output_dict, index=output_index) + + if inplace: + result_df = df.copy() + result_df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + else: + result_df = df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(origin_dict)) diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py new file mode 100644 index 0000000000000..72eec8753315c --- /dev/null +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -0,0 +1,100 @@ +import re + +import 
numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) +def test_duplicated_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype='object')") + + with pytest.raises(KeyError, match=msg): + df.duplicated(subset) + + +@pytest.mark.slow +def test_duplicated_do_not_fail_on_wide_dataframes(): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = { + "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) + } + df = DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool Series as a result and don't fail during + # calculation. Actual values doesn't matter here, though usually it's all + # False in this case + assert isinstance(result, Series) + assert result.dtype == np.bool + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_keep(keep, expected): + df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) +def test_duplicated_subset(subset, keep): + df = DataFrame( + { + "A": [0, 1, 1, 2, 0], + "B": ["a", "b", "b", "c", "a"], + "C": [np.nan, 3, 3, None, np.nan], + } + ) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, str): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + + +def test_duplicated_on_empty_frame(): + # GH 25184 + + df = DataFrame(columns=["a", "b"]) + dupes = df.duplicated("a") + + result = df[dupes] + expected = df.copy() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/methods/test_explode.py similarity index 99% rename from pandas/tests/frame/test_explode.py rename to pandas/tests/frame/methods/test_explode.py index 545a4b5f9421e..76c87ed355492 100644 --- a/pandas/tests/frame/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_error(): diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py new file mode 100644 index 0000000000000..0eb94afc99d94 --- /dev/null +++ b/pandas/tests/frame/methods/test_isin.py @@ -0,0 +1,186 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import 
DataFrame, MultiIndex, Series +import pandas._testing as tm + + +class TestDataFrameIsIn: + def test_isin(self): + # GH#4211 + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + other = ["a", "b", "c"] + + result = df.isin(other) + expected = DataFrame([df.loc[s].isin(other) for s in df.index]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) + def test_isin_empty(self, empty): + # GH#16991 + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + expected = DataFrame(False, df.index, df.columns) + + result = df.isin(empty) + tm.assert_frame_equal(result, expected) + + def test_isin_dict(self): + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + d = {"A": ["a"]} + + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, "A"] = True + + result = df.isin(d) + tm.assert_frame_equal(result, expected) + + # non unique columns + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + df.columns = ["A", "A"] + expected = DataFrame(False, df.index, df.columns) + expected.loc[0, "A"] = True + result = df.isin(d) + tm.assert_frame_equal(result, expected) + + def test_isin_with_string_scalar(self): + # GH#4763 + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + with pytest.raises(TypeError): + df.isin("a") + + with pytest.raises(TypeError): + df.isin("aaa") + + def test_isin_df(self): + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) + expected = DataFrame(False, df1.index, df1.columns) + result = df1.isin(df2) + expected["A"].loc[[1, 3]] = True + expected["B"].loc[[0, 2]] = True + tm.assert_frame_equal(result, expected) + + # partial overlapping columns + df2.columns = ["A", "C"] + result = df1.isin(df2) + expected["B"] = False + tm.assert_frame_equal(result, expected) + + def test_isin_tuples(self): + # GH#16394 + df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) + df["C"] = list(zip(df["A"], df["B"])) + result = df["C"].isin([(1, "a")]) + tm.assert_series_equal(result, Series([True, False, False], name="C")) + + def test_isin_df_dupe_values(self): + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + # just cols duped + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) + with pytest.raises(ValueError): + df1.isin(df2) + + # just index duped + df2 = DataFrame( + [[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=["A", "B"], + index=[0, 0, 1, 1], + ) + with pytest.raises(ValueError): + df1.isin(df2) + + # cols and index: + df2.columns = ["B", "B"] + with pytest.raises(ValueError): + df1.isin(df2) + + def test_isin_dupe_self(self): + other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) + df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) + result = df.isin(other) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected.loc[0] = True + expected.iloc[1, 1] = True + tm.assert_frame_equal(result, expected) + + def test_isin_against_series(self): + df = pd.DataFrame( + {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] + ) + s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) + expected = DataFrame(False, index=df.index, columns=df.columns) + expected["A"].loc["a"] = True + expected.loc["d"] = True + result = 
df.isin(s) + tm.assert_frame_equal(result, expected) + + def test_isin_multiIndex(self): + idx = MultiIndex.from_tuples( + [ + (0, "a", "foo"), + (0, "a", "bar"), + (0, "b", "bar"), + (0, "b", "baz"), + (2, "a", "foo"), + (2, "a", "bar"), + (2, "c", "bar"), + (2, "c", "baz"), + (1, "b", "foo"), + (1, "b", "bar"), + (1, "c", "bar"), + (1, "c", "baz"), + ] + ) + df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) + df2 = DataFrame( + { + "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], + } + ) + # against regular index + expected = DataFrame(False, index=df1.index, columns=df1.columns) + result = df1.isin(df2) + tm.assert_frame_equal(result, expected) + + df2.index = idx + expected = df2.values.astype(np.bool) + expected[:, 1] = ~expected[:, 1] + expected = DataFrame(expected, columns=["A", "B"], index=idx) + + result = df1.isin(df2) + tm.assert_frame_equal(result, expected) + + def test_isin_empty_datetimelike(self): + # GH#15473 + df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) + df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) + df2 = DataFrame({"date": []}) + df3 = DataFrame() + + expected = DataFrame({"date": [False, False]}) + + result = df1_ts.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_ts.isin(df3) + tm.assert_frame_equal(result, expected) + + result = df1_td.isin(df2) + tm.assert_frame_equal(result, expected) + result = df1_td.isin(df3) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py new file mode 100644 index 0000000000000..4ce474230b686 --- /dev/null +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -0,0 +1,211 @@ +""" +Note: for naming purposes, most tests are title with as e.g. "test_nlargest_foo" +but are implicitly also testing nsmallest_foo. +""" +from string import ascii_lowercase + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.fixture +def df_duplicates(): + return pd.DataFrame( + {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, + index=[0, 0, 1, 1, 1], + ) + + +@pytest.fixture +def df_strings(): + return pd.DataFrame( + { + "a": np.random.permutation(10), + "b": list(ascii_lowercase[:10]), + "c": np.random.permutation(10).astype("float64"), + } + ) + + +@pytest.fixture +def df_main_dtypes(): + return pd.DataFrame( + { + "group": [1, 1, 2], + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "category_string": pd.Series(list("abc")).astype("category"), + "category_int": [7, 8, 9], + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + }, + columns=[ + "group", + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ], + ) + + +class TestNLargestNSmallest: + + # ---------------------------------------------------------------------- + # Top / bottom + @pytest.mark.parametrize( + "order", + [ + ["a"], + ["c"], + ["a", "b"], + ["a", "c"], + ["b", "a"], + ["b", "c"], + ["a", "b", "c"], + ["c", "a", "b"], + ["c", "b", "a"], + ["b", "c", "a"], + ["b", "a", "c"], + # dups! 
+ ["b", "c", "c"], + ], + ) + @pytest.mark.parametrize("n", range(1, 11)) + def test_nlargest_n(self, df_strings, nselect_method, n, order): + # GH#10393 + df = df_strings + if "b" in order: + + error_msg = ( + f"Column 'b' has dtype object, " + f"cannot use method '{nselect_method}' with this dtype" + ) + with pytest.raises(TypeError, match=error_msg): + getattr(df, nselect_method)(n, order) + else: + ascending = nselect_method == "nsmallest" + result = getattr(df, nselect_method)(n, order) + expected = df.sort_values(order, ascending=ascending).head(n) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "columns", [["group", "category_string"], ["group", "string"]] + ) + def test_nlargest_error(self, df_main_dtypes, nselect_method, columns): + df = df_main_dtypes + col = columns[1] + error_msg = ( + f"Column '{col}' has dtype {df[col].dtype}, " + f"cannot use method '{nselect_method}' with this dtype" + ) + # escape some characters that may be in the repr + error_msg = ( + error_msg.replace("(", "\\(") + .replace(")", "\\)") + .replace("[", "\\[") + .replace("]", "\\]") + ) + with pytest.raises(TypeError, match=error_msg): + getattr(df, nselect_method)(2, columns) + + def test_nlargest_all_dtypes(self, df_main_dtypes): + df = df_main_dtypes + df.nsmallest(2, list(set(df) - {"category_string", "string"})) + df.nlargest(2, list(set(df) - {"category_string", "string"})) + + def test_nlargest_duplicates_on_starter_columns(self): + # regression test for GH#22752 + + df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) + + result = df.nlargest(4, columns=["a", "b"]) + expected = pd.DataFrame( + {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3] + ) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(4, columns=["a", "b"]) + expected = pd.DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0] + ) + tm.assert_frame_equal(result, expected) + + def test_nlargest_n_identical_values(self): + # GH#15297 + df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) + + result = df.nlargest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "order", + [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], + ) + @pytest.mark.parametrize("n", range(1, 6)) + def test_nlargest_n_duplicate_index(self, df_duplicates, n, order): + # GH#13412 + + df = df_duplicates + result = df.nsmallest(n, order) + expected = df.sort_values(order).head(n) + tm.assert_frame_equal(result, expected) + + result = df.nlargest(n, order) + expected = df.sort_values(order, ascending=False).head(n) + tm.assert_frame_equal(result, expected) + + def test_nlargest_duplicate_keep_all_ties(self): + # GH#16818 + df = pd.DataFrame( + {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} + ) + result = df.nlargest(4, "a", keep="all") + expected = pd.DataFrame( + { + "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) + tm.assert_frame_equal(result, expected) + + result = df.nsmallest(2, "a", keep="all") + expected = pd.DataFrame( + { + "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) + tm.assert_frame_equal(result, expected) + + def test_nlargest_multiindex_column_lookup(self): + # Check whether tuples 
are correctly treated as multi-level lookups. + # GH#23033 + df = pd.DataFrame( + columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]), + data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]], + ) + + # nsmallest + result = df.nsmallest(3, ("x", "a")) + expected = df.iloc[[2, 0, 3]] + tm.assert_frame_equal(result, expected) + + # nlargest + result = df.nlargest(3, ("x", "b")) + expected = df.iloc[[3, 2, 1]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py new file mode 100644 index 0000000000000..8f3f37fb9fff7 --- /dev/null +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -0,0 +1,96 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFramePctChange: + def test_pct_change_numeric(self): + # GH#11150 + pnl = DataFrame( + [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] + ).astype(np.float64) + pnl.iat[1, 0] = np.nan + pnl.iat[1, 1] = np.nan + pnl.iat[2, 3] = 60 + + for axis in range(2): + expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 + result = pnl.pct_change(axis=axis, fill_method="pad") + + tm.assert_frame_equal(result, expected) + + def test_pct_change(self, datetime_frame): + rs = datetime_frame.pct_change(fill_method=None) + tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) + + rs = datetime_frame.pct_change(2) + filled = datetime_frame.fillna(method="pad") + tm.assert_frame_equal(rs, filled / filled.shift(2) - 1) + + rs = datetime_frame.pct_change(fill_method="bfill", limit=1) + filled = datetime_frame.fillna(method="bfill", limit=1) + tm.assert_frame_equal(rs, filled / filled.shift(1) - 1) + + rs = datetime_frame.pct_change(freq="5D") + filled = datetime_frame.fillna(method="pad") + tm.assert_frame_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) + + def test_pct_change_shift_over_nas(self): + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) + + df = DataFrame({"a": s, "b": s}) + + chg = df.pct_change() + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + edf = DataFrame({"a": expected, "b": expected}) + tm.assert_frame_equal(chg, edf) + + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) + def test_pct_change_periods_freq( + self, datetime_frame, freq, periods, fill_method, limit + ): + # GH#7292 + rs_freq = datetime_frame.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = datetime_frame.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_frame_equal(rs_freq, rs_periods) + + empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) + tm.assert_frame_equal(rs_freq, rs_periods) + + +@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) +def test_pct_change_with_duplicated_indices(fill_method): + # GH30463 + data = DataFrame( + {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3 + ) + result = data.pct_change(fill_method=fill_method) + if fill_method is None: + second_column = [np.nan, np.inf, np.nan, np.nan, 2.0, 1.0] + else: + second_column = 
[np.nan, np.inf, 0.0, 2.0, 2.0, 1.0] + expected = DataFrame( + {0: [np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], 1: second_column}, + index=["a", "b"] * 3, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py similarity index 96% rename from pandas/tests/frame/test_quantile.py rename to pandas/tests/frame/methods/test_quantile.py index 78953d43677fc..64461c08d34f4 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -3,10 +3,20 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameQuantile: + def test_quantile_sparse(self): + # GH#17198 + s = pd.Series(pd.arrays.SparseArray([1, 2])) + s1 = pd.Series(pd.arrays.SparseArray([3, 4])) + df = pd.DataFrame({0: s, 1: s1}) + result = df.quantile() + + expected = pd.Series([1.5, 3.5], name=0.5) + tm.assert_series_equal(result, expected) + def test_quantile(self, datetime_frame): from numpy import percentile @@ -93,8 +103,8 @@ def test_quantile_axis_parameter(self): with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis=-1) msg = ( - "No axis named column for object type" - " " + "No axis named column for object type " + "" ) with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis="column") @@ -472,7 +482,7 @@ def test_quantile_empty_no_columns(self): df = pd.DataFrame(pd.date_range("1/1/18", periods=5)) df.columns.name = "captain tightpants" result = df.quantile(0.5) - expected = pd.Series([], index=[], name=0.5) + expected = pd.Series([], index=[], name=0.5, dtype=np.float64) expected.index.name = "captain tightpants" tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/methods/test_rank.py similarity index 94% rename from pandas/tests/frame/test_rank.py rename to pandas/tests/frame/methods/test_rank.py index be1a423c22aea..bab2db3192b4a 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -3,8 +3,10 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestRank: @@ -26,8 +28,10 @@ def method(self, request): """ return request.param + @td.skip_if_no_scipy def test_rank(self, float_frame): - rankdata = pytest.importorskip("scipy.stats.rankdata") + import scipy.stats # noqa:F401 + from scipy.stats import rankdata float_frame["A"][::2] = np.nan float_frame["B"][::3] = np.nan @@ -109,6 +113,15 @@ def test_rank2(self): exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]}) tm.assert_frame_equal(df.rank(), exp) + def test_rank_does_not_mutate(self): + # GH#18521 + # Check rank does not mutate DataFrame + df = DataFrame(np.random.randn(10, 3), dtype="float64") + expected = df.copy() + df.rank() + result = df + tm.assert_frame_equal(result, expected) + def test_rank_mixed_frame(self, float_string_frame): float_string_frame["datetime"] = datetime.now() float_string_frame["timedelta"] = timedelta(days=1, seconds=1) @@ -117,8 +130,10 @@ def test_rank_mixed_frame(self, float_string_frame): expected = float_string_frame.rank(1, numeric_only=True) tm.assert_frame_equal(result, expected) + @td.skip_if_no_scipy def test_rank_na_option(self, float_frame): - rankdata = pytest.importorskip("scipy.stats.rankdata") + import scipy.stats # noqa:F401 + from scipy.stats import rankdata 
float_frame["A"][::2] = np.nan float_frame["B"][::3] = np.nan @@ -199,9 +214,10 @@ def test_rank_axis(self): tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index")) tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns")) + @td.skip_if_no_scipy def test_rank_methods_frame(self): - pytest.importorskip("scipy.stats.special") - rankdata = pytest.importorskip("scipy.stats.rankdata") + import scipy.stats # noqa:F401 + from scipy.stats import rankdata xs = np.random.randint(0, 21, (100, 26)) xs = (xs - 10.0) / 10.0 diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/methods/test_replace.py similarity index 95% rename from pandas/tests/frame/test_replace.py rename to pandas/tests/frame/methods/test_replace.py index 5eb2416d0dcd7..aa91e7a489356 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1,23 +1,23 @@ from datetime import datetime from io import StringIO import re -from typing import Dict +from typing import Dict, List, Union import numpy as np import pytest import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture -def mix_ab() -> Dict[str, list]: +def mix_ab() -> Dict[str, List[Union[int, str]]]: return {"a": list(range(4)), "b": list("ab..")} @pytest.fixture -def mix_abc() -> Dict[str, list]: +def mix_abc() -> Dict[str, List[Union[float, str]]]: return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} @@ -1251,7 +1251,7 @@ def test_replace_with_empty_dictlike(self, mix_abc): # GH 15289 df = DataFrame(mix_abc) tm.assert_frame_equal(df, df.replace({})) - tm.assert_frame_equal(df, df.replace(Series([]))) + tm.assert_frame_equal(df, df.replace(Series([], dtype=object))) tm.assert_frame_equal(df, df.replace({"b": {}})) tm.assert_frame_equal(df, df.replace(Series({"b": {}}))) @@ -1295,3 +1295,64 @@ def test_replace_method(self, to_replace, method, expected): result = df.replace(to_replace=to_replace, value=None, method=method) expected = DataFrame(expected) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + expected = DataFrame(final_data, columns=["a", "b"], dtype="category") + expected["a"] = expected["a"].cat.set_categories([1, 2, 3]) + expected["b"] = expected["b"].cat.set_categories([1, 2, 3]) + result = df.replace(replace_dict, 3) + tm.assert_frame_equal(result, expected) + with pytest.raises(AssertionError): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + df.replace(replace_dict, 3, inplace=True) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "df, to_replace, exp", + [ + ( + {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + {4: 5, 5: 6, 6: 7}, + {"col1": [1, 2, 3], "col2": [5, 6, 7]}, + ), + ( + {"col1": [1, 2, 3], "col2": ["4", "5", "6"]}, + {"4": "5", "5": "6", "6": "7"}, + {"col1": [1, 2, 3], "col2": ["5", "6", "7"]}, + ), + ], + ) + def test_replace_commutative(self, df, to_replace, exp): + # GH 16051 + # DataFrame.replace() overwrites when values are non-numeric + # also added to data frame whilst issue was for series + + df = pd.DataFrame(df) + + expected = pd.DataFrame(exp) + result = df.replace(to_replace) + 
tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "replacer", + [ + pd.Timestamp("20170827"), + np.int8(1), + np.int16(1), + np.float32(1), + np.float64(1), + ], + ) + def test_replace_replacer_dtype(self, replacer): + # GH26632 + df = pd.DataFrame(["a"]) + result = df.replace({"a": replacer, "b": replacer}) + expected = pd.DataFrame([replacer]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_round.py b/pandas/tests/frame/methods/test_round.py new file mode 100644 index 0000000000000..0865e03cedc50 --- /dev/null +++ b/pandas/tests/frame/methods/test_round.py @@ -0,0 +1,217 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, date_range +import pandas._testing as tm + + +class TestDataFrameRound: + def test_round(self): + # GH#2665 + + # Test that rounding an empty DataFrame does nothing + df = DataFrame() + tm.assert_frame_equal(df, df.round()) + + # Here's the test frame we'll be working with + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) + + # Default round to integer (i.e. decimals=0) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) + tm.assert_frame_equal(df.round(), expected_rounded) + + # Round with an integer + decimals = 2 + expected_rounded = DataFrame( + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) + tm.assert_frame_equal(df.round(decimals), expected_rounded) + + # This should also work with np.round (since np.round dispatches to + # df.round) + tm.assert_frame_equal(np.round(df, decimals), expected_rounded) + + # Round with a list + round_list = [1, 2] + with pytest.raises(TypeError): + df.round(round_list) + + # Round with a dictionary + expected_rounded = DataFrame( + {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]} + ) + round_dict = {"col1": 1, "col2": 2} + tm.assert_frame_equal(df.round(round_dict), expected_rounded) + + # Incomplete dict + expected_partially_rounded = DataFrame( + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} + ) + partial_round_dict = {"col2": 1} + tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded) + + # Dict with unknown elements + wrong_round_dict = {"col3": 2, "col2": 1} + tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded) + + # float input to `decimals` + non_int_round_dict = {"col1": 1, "col2": 0.5} + with pytest.raises(TypeError): + df.round(non_int_round_dict) + + # String input + non_int_round_dict = {"col1": 1, "col2": "foo"} + with pytest.raises(TypeError): + df.round(non_int_round_dict) + + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError): + df.round(non_int_round_Series) + + # List input + non_int_round_dict = {"col1": 1, "col2": [1, 2]} + with pytest.raises(TypeError): + df.round(non_int_round_dict) + + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError): + df.round(non_int_round_Series) + + # Non integer Series inputs + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError): + df.round(non_int_round_Series) + + non_int_round_Series = Series(non_int_round_dict) + with pytest.raises(TypeError): + df.round(non_int_round_Series) + + # Negative numbers + negative_round_dict = {"col1": -1, "col2": -2} + big_df = df * 100 + expected_neg_rounded = DataFrame( + {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]} + ) + tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded) + + 
# nan in Series round + nan_round_Series = Series({"col1": np.nan, "col2": 1}) + + # TODO(wesm): unused? + expected_nan_round = DataFrame( # noqa + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} + ) + + with pytest.raises(TypeError): + df.round(nan_round_Series) + + # Make sure this doesn't break existing Series.round + tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"]) + + # named columns + # GH#11986 + decimals = 2 + expected_rounded = DataFrame( + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) + df.columns.name = "cols" + expected_rounded.columns.name = "cols" + tm.assert_frame_equal(df.round(decimals), expected_rounded) + + # interaction of named columns & series + tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"]) + tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"]) + + def test_round_numpy(self): + # GH#12600 + df = DataFrame([[1.53, 1.36], [0.06, 7.01]]) + out = np.round(df, decimals=0) + expected = DataFrame([[2.0, 1.0], [0.0, 7.0]]) + tm.assert_frame_equal(out, expected) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.round(df, decimals=0, out=df) + + def test_round_numpy_with_nan(self): + # See GH#14197 + df = Series([1.53, np.nan, 0.06]).to_frame() + with tm.assert_produces_warning(None): + result = df.round() + expected = Series([2.0, np.nan, 0.0]).to_frame() + tm.assert_frame_equal(result, expected) + + def test_round_mixed_type(self): + # GH#11885 + df = DataFrame( + { + "col1": [1.1, 2.2, 3.3, 4.4], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) + round_0 = DataFrame( + { + "col1": [1.0, 2.0, 3.0, 4.0], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) + tm.assert_frame_equal(df.round(), round_0) + tm.assert_frame_equal(df.round(1), df) + tm.assert_frame_equal(df.round({"col1": 1}), df) + tm.assert_frame_equal(df.round({"col1": 0}), round_0) + tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0) + tm.assert_frame_equal(df.round({"col3": 1}), df) + + def test_round_with_duplicate_columns(self): + # GH#11611 + + df = pd.DataFrame( + np.random.random([3, 3]), + columns=["A", "B", "C"], + index=["first", "second", "third"], + ) + + dfs = pd.concat((df, df), axis=1) + rounded = dfs.round() + tm.assert_index_equal(rounded.index, dfs.index) + + decimals = pd.Series([1, 0, 2], index=["A", "B", "A"]) + msg = "Index of decimals must be unique" + with pytest.raises(ValueError, match=msg): + df.round(decimals) + + def test_round_builtin(self): + # GH#11763 + # Here's the test frame we'll be working with + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) + + # Default round to integer (i.e. 
decimals=0) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) + tm.assert_frame_equal(round(df), expected_rounded) + + def test_round_nonunique_categorical(self): + # See GH#21809 + idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3) + df = pd.DataFrame(np.random.rand(6, 3), columns=list("abc")) + + expected = df.round(3) + expected.index = idx + + df_categorical = df.copy().set_index(idx) + assert df_categorical.shape == (6, 3) + result = df_categorical.round(3) + assert result.shape == (6, 3) + + tm.assert_frame_equal(result, expected) + + def test_round_interval_category_columns(self): + # GH#30063 + columns = pd.CategoricalIndex(pd.interval_range(0, 2)) + df = DataFrame([[0.66, 1.1], [0.3, 0.25]], columns=columns) + + result = df.round() + expected = DataFrame([[1.0, 1.0], [0.0, 0.0]], columns=columns) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py new file mode 100644 index 0000000000000..cfb17de892b1c --- /dev/null +++ b/pandas/tests/frame/methods/test_shift.py @@ -0,0 +1,187 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, Series, date_range, offsets +import pandas._testing as tm + + +class TestDataFrameShift: + def test_shift(self, datetime_frame, int_frame): + # naive shift + shiftedFrame = datetime_frame.shift(5) + tm.assert_index_equal(shiftedFrame.index, datetime_frame.index) + + shiftedSeries = datetime_frame["A"].shift(5) + tm.assert_series_equal(shiftedFrame["A"], shiftedSeries) + + shiftedFrame = datetime_frame.shift(-5) + tm.assert_index_equal(shiftedFrame.index, datetime_frame.index) + + shiftedSeries = datetime_frame["A"].shift(-5) + tm.assert_series_equal(shiftedFrame["A"], shiftedSeries) + + # shift by 0 + unshifted = datetime_frame.shift(0) + tm.assert_frame_equal(unshifted, datetime_frame) + + # shift by DateOffset + shiftedFrame = datetime_frame.shift(5, freq=offsets.BDay()) + assert len(shiftedFrame) == len(datetime_frame) + + shiftedFrame2 = datetime_frame.shift(5, freq="B") + tm.assert_frame_equal(shiftedFrame, shiftedFrame2) + + d = datetime_frame.index[0] + shifted_d = d + offsets.BDay(5) + tm.assert_series_equal( + datetime_frame.xs(d), shiftedFrame.xs(shifted_d), check_names=False + ) + + # shift int frame + int_shifted = int_frame.shift(1) # noqa + + # Shifting with PeriodIndex + ps = tm.makePeriodFrame() + shifted = ps.shift(1) + unshifted = shifted.shift(-1) + tm.assert_index_equal(shifted.index, ps.index) + tm.assert_index_equal(unshifted.index, ps.index) + tm.assert_numpy_array_equal( + unshifted.iloc[:, 0].dropna().values, ps.iloc[:-1, 0].values + ) + + shifted2 = ps.shift(1, "B") + shifted3 = ps.shift(1, offsets.BDay()) + tm.assert_frame_equal(shifted2, shifted3) + tm.assert_frame_equal(ps, shifted2.shift(-1, "B")) + + msg = "does not match PeriodIndex freq" + with pytest.raises(ValueError, match=msg): + ps.shift(freq="D") + + # shift other axis + # GH#6371 + df = DataFrame(np.random.rand(10, 5)) + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, expected) + + # shift named axis + df = DataFrame(np.random.rand(10, 5)) + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) + result = df.shift(1, axis="columns") + tm.assert_frame_equal(result, expected) + + def 
test_shift_bool(self): + df = DataFrame({"high": [True, False], "low": [False, False]}) + rs = df.shift(1) + xp = DataFrame( + np.array([[np.nan, np.nan], [True, False]], dtype=object), + columns=["high", "low"], + ) + tm.assert_frame_equal(rs, xp) + + def test_shift_categorical(self): + # GH#9416 + s1 = pd.Series(["a", "b", "c"], dtype="category") + s2 = pd.Series(["A", "B", "C"], dtype="category") + df = DataFrame({"one": s1, "two": s2}) + rs = df.shift(1) + xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)}) + tm.assert_frame_equal(rs, xp) + + def test_shift_fill_value(self): + # GH#24128 + df = DataFrame( + [1, 2, 3, 4, 5], index=date_range("1/1/2000", periods=5, freq="H") + ) + exp = DataFrame( + [0, 1, 2, 3, 4], index=date_range("1/1/2000", periods=5, freq="H") + ) + result = df.shift(1, fill_value=0) + tm.assert_frame_equal(result, exp) + + exp = DataFrame( + [0, 0, 1, 2, 3], index=date_range("1/1/2000", periods=5, freq="H") + ) + result = df.shift(2, fill_value=0) + tm.assert_frame_equal(result, exp) + + def test_shift_empty(self): + # Regression test for GH#8019 + df = DataFrame({"foo": []}) + rs = df.shift(-1) + + tm.assert_frame_equal(df, rs) + + def test_shift_duplicate_columns(self): + # GH#9092; verify that position-based shifting works + # in the presence of duplicate columns + column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]] + data = np.random.randn(20, 5) + + shifted = [] + for columns in column_lists: + df = pd.DataFrame(data.copy(), columns=columns) + for s in range(5): + df.iloc[:, s] = df.iloc[:, s].shift(s + 1) + df.columns = range(5) + shifted.append(df) + + # sanity check the base case + nulls = shifted[0].isna().sum() + tm.assert_series_equal(nulls, Series(range(1, 6), dtype="int64")) + + # check all answers are the same + tm.assert_frame_equal(shifted[0], shifted[1]) + tm.assert_frame_equal(shifted[0], shifted[2]) + + def test_tshift(self, datetime_frame): + # PeriodIndex + ps = tm.makePeriodFrame() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) + + tm.assert_frame_equal(unshifted, ps) + + shifted2 = ps.tshift(freq="B") + tm.assert_frame_equal(shifted, shifted2) + + shifted3 = ps.tshift(freq=offsets.BDay()) + tm.assert_frame_equal(shifted, shifted3) + + with pytest.raises(ValueError, match="does not match"): + ps.tshift(freq="M") + + # DatetimeIndex + shifted = datetime_frame.tshift(1) + unshifted = shifted.tshift(-1) + + tm.assert_frame_equal(datetime_frame, unshifted) + + shifted2 = datetime_frame.tshift(freq=datetime_frame.index.freq) + tm.assert_frame_equal(shifted, shifted2) + + inferred_ts = DataFrame( + datetime_frame.values, + Index(np.asarray(datetime_frame.index)), + columns=datetime_frame.columns, + ) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + tm.assert_frame_equal(shifted, datetime_frame.tshift(1)) + tm.assert_frame_equal(unshifted, inferred_ts) + + no_freq = datetime_frame.iloc[[0, 5, 7], :] + msg = "Freq was not given and was not set in the index" + with pytest.raises(ValueError, match=msg): + no_freq.tshift() diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py new file mode 100644 index 0000000000000..2c25e1f3740a3 --- /dev/null +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -0,0 +1,320 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import CategoricalDtype, DataFrame, IntervalIndex, MultiIndex, Series +import pandas._testing as tm + + +class TestDataFrameSortIndex: + def test_sort_index_nan(self): + # GH#3917 + 
+ # Test DataFrame with nan label + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, np.nan], + ) + + # NaN label, ascending=True, na_position='last' + sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last") + expected = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, np.nan], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=True, na_position='first' + sorted_df = df.sort_index(na_position="first") + expected = DataFrame( + {"A": [4, 1, 2, np.nan, 1, 6, 8], "B": [5, 9, np.nan, 5, 2, 5, 4]}, + index=[np.nan, 1, 2, 3, 4, 5, 6], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='last' + sorted_df = df.sort_index(kind="quicksort", ascending=False) + expected = DataFrame( + {"A": [8, 6, 1, np.nan, 2, 1, 4], "B": [4, 5, 2, 5, np.nan, 9, 5]}, + index=[6, 5, 4, 3, 2, 1, np.nan], + ) + tm.assert_frame_equal(sorted_df, expected) + + # NaN label, ascending=False, na_position='first' + sorted_df = df.sort_index( + kind="quicksort", ascending=False, na_position="first" + ) + expected = DataFrame( + {"A": [4, 8, 6, 1, np.nan, 2, 1], "B": [5, 4, 5, 2, 5, np.nan, 9]}, + index=[np.nan, 6, 5, 4, 3, 2, 1], + ) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_index_multi_index(self): + # GH#25775, testing that sorting by index works with a multi-index. + df = DataFrame( + {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} + ) + result = df.set_index(list("abc")).sort_index(level=list("ba")) + + expected = DataFrame( + {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} + ) + expected = expected.set_index(list("abc")) + + tm.assert_frame_equal(result, expected) + + def test_sort_index_inplace(self): + frame = DataFrame( + np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] + ) + + # axis=0 + unordered = frame.loc[[3, 2, 4, 1]] + a_id = id(unordered["A"]) + df = unordered.copy() + df.sort_index(inplace=True) + expected = frame + tm.assert_frame_equal(df, expected) + assert a_id != id(df["A"]) + + df = unordered.copy() + df.sort_index(ascending=False, inplace=True) + expected = frame[::-1] + tm.assert_frame_equal(df, expected) + + # axis=1 + unordered = frame.loc[:, ["D", "B", "C", "A"]] + df = unordered.copy() + df.sort_index(axis=1, inplace=True) + expected = frame + tm.assert_frame_equal(df, expected) + + df = unordered.copy() + df.sort_index(axis=1, ascending=False, inplace=True) + expected = frame.iloc[:, ::-1] + tm.assert_frame_equal(df, expected) + + def test_sort_index_different_sortorder(self): + A = np.arange(20).repeat(5) + B = np.tile(np.arange(5), 20) + + indexer = np.random.permutation(100) + A = A.take(indexer) + B = B.take(indexer) + + df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) + + ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) + expected = df.take(ex_indexer) + + # test with multiindex, too + idf = df.set_index(["A", "B"]) + + result = idf.sort_index(ascending=[1, 0]) + expected = idf.take(ex_indexer) + tm.assert_frame_equal(result, expected) + + # also, Series! 
+ result = idf["C"].sort_index(ascending=[1, 0]) + tm.assert_series_equal(result, expected["C"]) + + def test_sort_index_level(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + df = DataFrame([[1, 2], [3, 4]], mi) + + result = df.sort_index(level="A", sort_remaining=False) + expected = df + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["A", "B"], sort_remaining=False) + expected = df + tm.assert_frame_equal(result, expected) + + # Error thrown by sort_index when + # first index is sorted last (GH#26053) + result = df.sort_index(level=["C", "B", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["B", "C", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(level=["C", "A"]) + expected = df.iloc[[1, 0]] + tm.assert_frame_equal(result, expected) + + def test_sort_index_categorical_index(self): + + df = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") + + result = df.sort_index() + expected = df.iloc[[4, 0, 1, 5, 2, 3]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(ascending=False) + expected = df.iloc[[2, 3, 0, 1, 5, 4]] + tm.assert_frame_equal(result, expected) + + def test_sort_index(self): + # GH#13496 + + frame = DataFrame( + np.arange(16).reshape(4, 4), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) + + # axis=0 : sort rows by index labels + unordered = frame.loc[[3, 2, 4, 1]] + result = unordered.sort_index(axis=0) + expected = frame + tm.assert_frame_equal(result, expected) + + result = unordered.sort_index(ascending=False) + expected = frame[::-1] + tm.assert_frame_equal(result, expected) + + # axis=1 : sort columns by column names + unordered = frame.iloc[:, [2, 1, 3, 0]] + result = unordered.sort_index(axis=1) + tm.assert_frame_equal(result, frame) + + result = unordered.sort_index(axis=1, ascending=False) + expected = frame.iloc[:, ::-1] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("level", ["A", 0]) # GH#21052 + def test_sort_index_multiindex(self, level): + # GH#13496 + + # sort rows by specified level of multi-index + mi = MultiIndex.from_tuples( + [[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC") + ) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) + + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC") + ) + expected = pd.DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) + result = df.sort_index(level=level) + tm.assert_frame_equal(result, expected) + + # sort_remaining=False + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC") + ) + expected = pd.DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) + result = df.sort_index(level=level, sort_remaining=False) + tm.assert_frame_equal(result, expected) + + def test_sort_index_intervalindex(self): + # this is a de-facto sort via unstack + # confirming that we sort in the order of the bins + y = Series(np.random.randn(100)) + x1 = Series(np.sign(np.random.randn(100))) + x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3]) + model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"]) + + result = model.groupby(["X1", "X2"], observed=True).mean().unstack() + expected = IntervalIndex.from_tuples( + [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right" + ) + result = 
result.columns.levels[1].categories + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, False, [5, 3, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, False, [2, 3, 5]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114 + original_index = [2, 5, 3] + df = DataFrame(original_dict, index=original_index) + expected_df = DataFrame(sorted_dict, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index)) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + False, + MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")), + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + False, + MultiIndex.from_tuples([[3, 4], [2, 1]], names=list("AB")), + ), + ], + ) + def test_sort_index_ignore_index_multi_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114, this is to test ignore_index on MulitIndex of index + mi = MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")) + df = DataFrame(original_dict, index=mi) + expected_df = DataFrame(sorted_dict, index=output_index) + + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=mi)) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/methods/test_sort_values.py similarity index 53% rename from pandas/tests/frame/test_sorting.py rename to pandas/tests/frame/methods/test_sort_values.py index 9ea78b974fcbb..96f4d6ed90d6b 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -4,21 +4,11 @@ import pytest import pandas as pd -from pandas import ( - Categorical, - DataFrame, - IntervalIndex, - MultiIndex, - NaT, - Series, - Timestamp, - date_range, -) -from pandas.api.types import CategoricalDtype -import pandas.util.testing as tm - - -class TestDataFrameSorting: +from pandas import Categorical, DataFrame, NaT, Timestamp, date_range +import pandas._testing as tm + + +class TestDataFrameSortValues: def test_sort_values(self): frame = DataFrame( [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC") @@ -57,7 +47,7 @@ def test_sort_values(self): with pytest.raises(ValueError, match=msg): frame.sort_values(by=["A", "B"], axis=2, inplace=True) - # by 
row (axis=1): GH 10806 + # by row (axis=1): GH#10806 sorted_df = frame.sort_values(by=3, axis=1) expected = frame tm.assert_frame_equal(sorted_df, expected) @@ -106,21 +96,69 @@ def test_sort_values_inplace(self): expected = frame.sort_values(by=["A", "B"], ascending=False) tm.assert_frame_equal(sorted_df, expected) - def test_sort_nan(self): - # GH3917 - nan = np.nan - df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}) + def test_sort_values_multicolumn(self): + A = np.arange(5).repeat(20) + B = np.tile(np.arange(5), 20) + random.shuffle(A) + random.shuffle(B) + frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) + + result = frame.sort_values(by=["A", "B"]) + indexer = np.lexsort((frame["B"], frame["A"])) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + result = frame.sort_values(by=["A", "B"], ascending=False) + indexer = np.lexsort( + (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False)) + ) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + result = frame.sort_values(by=["B", "A"]) + indexer = np.lexsort((frame["A"], frame["B"])) + expected = frame.take(indexer) + tm.assert_frame_equal(result, expected) + + def test_sort_values_multicolumn_uint64(self): + # GH#9918 + # uint64 multicolumn sort + + df = pd.DataFrame( + { + "a": pd.Series([18446637057563306014, 1162265347240853609]), + "b": pd.Series([1, 2]), + } + ) + df["a"] = df["a"].astype(np.uint64) + result = df.sort_values(["a", "b"]) + + expected = pd.DataFrame( + { + "a": pd.Series([18446637057563306014, 1162265347240853609]), + "b": pd.Series([1, 2]), + }, + index=pd.Index([1, 0]), + ) + + tm.assert_frame_equal(result, expected) + + def test_sort_values_nan(self): + # GH#3917 + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]} + ) # sort one column only expected = DataFrame( - {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]}, + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]}, index=[2, 0, 3, 1, 6, 4, 5], ) sorted_df = df.sort_values(["A"], na_position="first") tm.assert_frame_equal(sorted_df, expected) expected = DataFrame( - {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, + {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]}, index=[2, 5, 4, 6, 1, 0, 3], ) sorted_df = df.sort_values(["A"], na_position="first", ascending=False) @@ -132,7 +170,7 @@ def test_sort_nan(self): # na_position='last', order expected = DataFrame( - {"A": [1, 1, 2, 4, 6, 8, nan], "B": [2, 9, nan, 5, 5, 4, 5]}, + {"A": [1, 1, 2, 4, 6, 8, np.nan], "B": [2, 9, np.nan, 5, 5, 4, 5]}, index=[3, 0, 1, 6, 4, 5, 2], ) sorted_df = df.sort_values(["A", "B"]) @@ -140,7 +178,7 @@ def test_sort_nan(self): # na_position='first', order expected = DataFrame( - {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, nan, 5, 5, 4]}, + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, np.nan, 5, 5, 4]}, index=[2, 3, 0, 1, 6, 4, 5], ) sorted_df = df.sort_values(["A", "B"], na_position="first") @@ -148,7 +186,7 @@ def test_sort_nan(self): # na_position='first', not order expected = DataFrame( - {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]}, + {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]}, index=[2, 0, 3, 1, 6, 4, 5], ) sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first") @@ -156,54 +194,14 @@ def test_sort_nan(self): # na_position='last', not order expected = DataFrame( - {"A": [8, 6, 4, 2, 1, 1, nan], "B": [4, 5, 5, nan, 2, 
9, 5]}, + {"A": [8, 6, 4, 2, 1, 1, np.nan], "B": [4, 5, 5, np.nan, 2, 9, 5]}, index=[5, 4, 6, 1, 3, 0, 2], ) sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last") tm.assert_frame_equal(sorted_df, expected) - # Test DataFrame with nan label - df = DataFrame( - {"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}, - index=[1, 2, 3, 4, 5, 6, nan], - ) - - # NaN label, ascending=True, na_position='last' - sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last") - expected = DataFrame( - {"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}, - index=[1, 2, 3, 4, 5, 6, nan], - ) - tm.assert_frame_equal(sorted_df, expected) - - # NaN label, ascending=True, na_position='first' - sorted_df = df.sort_index(na_position="first") - expected = DataFrame( - {"A": [4, 1, 2, nan, 1, 6, 8], "B": [5, 9, nan, 5, 2, 5, 4]}, - index=[nan, 1, 2, 3, 4, 5, 6], - ) - tm.assert_frame_equal(sorted_df, expected) - - # NaN label, ascending=False, na_position='last' - sorted_df = df.sort_index(kind="quicksort", ascending=False) - expected = DataFrame( - {"A": [8, 6, 1, nan, 2, 1, 4], "B": [4, 5, 2, 5, nan, 9, 5]}, - index=[6, 5, 4, 3, 2, 1, nan], - ) - tm.assert_frame_equal(sorted_df, expected) - - # NaN label, ascending=False, na_position='first' - sorted_df = df.sort_index( - kind="quicksort", ascending=False, na_position="first" - ) - expected = DataFrame( - {"A": [4, 8, 6, 1, nan, 2, 1], "B": [5, 4, 5, 2, 5, nan, 9]}, - index=[nan, 6, 5, 4, 3, 2, 1], - ) - tm.assert_frame_equal(sorted_df, expected) - - def test_stable_descending_sort(self): - # GH #6399 + def test_sort_values_stable_descending_sort(self): + # GH#6399 df = DataFrame( [[2, "first"], [2, "second"], [1, "a"], [1, "b"]], columns=["sort_col", "order"], @@ -211,12 +209,13 @@ def test_stable_descending_sort(self): sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False) tm.assert_frame_equal(df, sorted_df) - def test_stable_descending_multicolumn_sort(self): - nan = np.nan - df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}) + def test_sort_values_stable_descending_multicolumn_sort(self): + df = DataFrame( + {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]} + ) # test stable mergesort expected = DataFrame( - {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 2, 9]}, + {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 2, 9]}, index=[2, 5, 4, 6, 1, 3, 0], ) sorted_df = df.sort_values( @@ -225,7 +224,7 @@ def test_stable_descending_multicolumn_sort(self): tm.assert_frame_equal(sorted_df, expected) expected = DataFrame( - {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, + {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]}, index=[2, 5, 4, 6, 1, 0, 3], ) sorted_df = df.sort_values( @@ -233,30 +232,16 @@ def test_stable_descending_multicolumn_sort(self): ) tm.assert_frame_equal(sorted_df, expected) - def test_sort_multi_index(self): - # GH 25775, testing that sorting by index works with a multi-index. 
- df = DataFrame( - {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} - ) - result = df.set_index(list("abc")).sort_index(level=list("ba")) - - expected = DataFrame( - {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} - ) - expected = expected.set_index(list("abc")) - - tm.assert_frame_equal(result, expected) - - def test_stable_categorial(self): - # GH 16793 + def test_sort_values_stable_categorial(self): + # GH#16793 df = DataFrame({"x": pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)}) expected = df.copy() sorted_df = df.sort_values("x", kind="mergesort") tm.assert_frame_equal(sorted_df, expected) - def test_sort_datetimes(self): + def test_sort_values_datetimes(self): - # GH 3461, argsort / lexsort differences for a datetime column + # GH#3461, argsort / lexsort differences for a datetime column df = DataFrame( ["a", "a", "a", "b", "c", "d", "e", "f", "g"], columns=["A"], @@ -293,7 +278,7 @@ def test_sort_datetimes(self): df2 = df.sort_values(by=["C", "B"]) tm.assert_frame_equal(df1, df2) - def test_frame_column_inplace_sort_exception(self, float_frame): + def test_sort_values_frame_column_inplace_sort_exception(self, float_frame): s = float_frame["A"] with pytest.raises(ValueError, match="This Series is a view"): s.sort_values(inplace=True) @@ -301,9 +286,9 @@ def test_frame_column_inplace_sort_exception(self, float_frame): cp = s.copy() cp.sort_values() # it works! - def test_sort_nat_values_in_int_column(self): + def test_sort_values_nat_values_in_int_column(self): - # GH 14922: "sorting with large float and multiple columns incorrect" + # GH#14922: "sorting with large float and multiple columns incorrect" # cause was that the int64 value NaT was considered as "na". Which is # only correct for datetime64 columns. 
@@ -356,288 +341,8 @@ def test_sort_nat_values_in_int_column(self): df_sorted = df.sort_values(["datetime", "float"], ascending=False) tm.assert_frame_equal(df_sorted, df) - def test_sort_nat(self): - - # GH 16836 - - d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] - d2 = [ - Timestamp(x) - for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] - ] - df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) - - d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] - d4 = [ - Timestamp(x) - for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] - ] - expected = pd.DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) - sorted_df = df.sort_values(by=["a", "b"]) - tm.assert_frame_equal(sorted_df, expected) - - -class TestDataFrameSortIndexKinds: - def test_sort_index_multicolumn(self): - A = np.arange(5).repeat(20) - B = np.tile(np.arange(5), 20) - random.shuffle(A) - random.shuffle(B) - frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) - - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - frame.sort_index(by=["A", "B"]) - result = frame.sort_values(by=["A", "B"]) - indexer = np.lexsort((frame["B"], frame["A"])) - expected = frame.take(indexer) - tm.assert_frame_equal(result, expected) - - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - frame.sort_index(by=["A", "B"], ascending=False) - result = frame.sort_values(by=["A", "B"], ascending=False) - indexer = np.lexsort( - (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False)) - ) - expected = frame.take(indexer) - tm.assert_frame_equal(result, expected) - - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - frame.sort_index(by=["B", "A"]) - result = frame.sort_values(by=["B", "A"]) - indexer = np.lexsort((frame["A"], frame["B"])) - expected = frame.take(indexer) - tm.assert_frame_equal(result, expected) - - def test_sort_index_inplace(self): - frame = DataFrame( - np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] - ) - - # axis=0 - unordered = frame.loc[[3, 2, 4, 1]] - a_id = id(unordered["A"]) - df = unordered.copy() - df.sort_index(inplace=True) - expected = frame - tm.assert_frame_equal(df, expected) - assert a_id != id(df["A"]) - - df = unordered.copy() - df.sort_index(ascending=False, inplace=True) - expected = frame[::-1] - tm.assert_frame_equal(df, expected) - - # axis=1 - unordered = frame.loc[:, ["D", "B", "C", "A"]] - df = unordered.copy() - df.sort_index(axis=1, inplace=True) - expected = frame - tm.assert_frame_equal(df, expected) - - df = unordered.copy() - df.sort_index(axis=1, ascending=False, inplace=True) - expected = frame.iloc[:, ::-1] - tm.assert_frame_equal(df, expected) - - def test_sort_index_different_sortorder(self): - A = np.arange(20).repeat(5) - B = np.tile(np.arange(5), 20) - - indexer = np.random.permutation(100) - A = A.take(indexer) - B = B.take(indexer) - - df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) - - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=["A", "B"], ascending=[1, 0]) - result = df.sort_values(by=["A", "B"], ascending=[1, 0]) - - ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) - expected = df.take(ex_indexer) - tm.assert_frame_equal(result, expected) - - # test with multiindex, too - idf = df.set_index(["A", "B"]) - - result = idf.sort_index(ascending=[1, 0]) - expected = idf.take(ex_indexer) - tm.assert_frame_equal(result, 
expected) - - # also, Series! - result = idf["C"].sort_index(ascending=[1, 0]) - tm.assert_series_equal(result, expected["C"]) - - def test_sort_index_duplicates(self): - - # with 9816, these are all translated to .sort_values - - df = DataFrame([range(5, 9), range(4)], columns=["a", "a", "b", "b"]) - - with pytest.raises(ValueError, match="not unique"): - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - df.sort_index(by="a") - with pytest.raises(ValueError, match="not unique"): - df.sort_values(by="a") - - with pytest.raises(ValueError, match="not unique"): - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=["a"]) - with pytest.raises(ValueError, match="not unique"): - df.sort_values(by=["a"]) - - with pytest.raises(ValueError, match="not unique"): - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - # multi-column 'by' is separate codepath - df.sort_index(by=["a", "b"]) - with pytest.raises(ValueError, match="not unique"): - # multi-column 'by' is separate codepath - df.sort_values(by=["a", "b"]) - - # with multi-index - # GH4370 - df = DataFrame( - np.random.randn(4, 2), columns=MultiIndex.from_tuples([("a", 0), ("a", 1)]) - ) - with pytest.raises(ValueError, match="level"): - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - df.sort_index(by="a") - with pytest.raises(ValueError, match="level"): - df.sort_values(by="a") - - # convert tuples to a list of tuples - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=[("a", 1)]) - expected = df.sort_values(by=[("a", 1)]) - - # use .sort_values #9816 - with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=("a", 1)) - result = df.sort_values(by=("a", 1)) - tm.assert_frame_equal(result, expected) - - def test_sort_index_level(self): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) - df = DataFrame([[1, 2], [3, 4]], mi) - - result = df.sort_index(level="A", sort_remaining=False) - expected = df - tm.assert_frame_equal(result, expected) - - result = df.sort_index(level=["A", "B"], sort_remaining=False) - expected = df - tm.assert_frame_equal(result, expected) - - # Error thrown by sort_index when - # first index is sorted last (#26053) - result = df.sort_index(level=["C", "B", "A"]) - expected = df.iloc[[1, 0]] - tm.assert_frame_equal(result, expected) - - result = df.sort_index(level=["B", "C", "A"]) - expected = df.iloc[[1, 0]] - tm.assert_frame_equal(result, expected) - - result = df.sort_index(level=["C", "A"]) - expected = df.iloc[[1, 0]] - tm.assert_frame_equal(result, expected) - - def test_sort_index_categorical_index(self): - - df = DataFrame( - { - "A": np.arange(6, dtype="int64"), - "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))), - } - ).set_index("B") - - result = df.sort_index() - expected = df.iloc[[4, 0, 1, 5, 2, 3]] - tm.assert_frame_equal(result, expected) - - result = df.sort_index(ascending=False) - expected = df.iloc[[2, 3, 0, 1, 5, 4]] - tm.assert_frame_equal(result, expected) - - def test_sort_index(self): - # GH13496 - - frame = DataFrame( - np.arange(16).reshape(4, 4), - index=[1, 2, 3, 4], - columns=["A", "B", "C", "D"], - ) - - # axis=0 : sort rows by index labels - unordered = frame.loc[[3, 2, 4, 1]] - result = unordered.sort_index(axis=0) - expected = frame - tm.assert_frame_equal(result, expected) - - result = unordered.sort_index(ascending=False) - expected = frame[::-1] - 
tm.assert_frame_equal(result, expected) - - # axis=1 : sort columns by column names - unordered = frame.iloc[:, [2, 1, 3, 0]] - result = unordered.sort_index(axis=1) - tm.assert_frame_equal(result, frame) - - result = unordered.sort_index(axis=1, ascending=False) - expected = frame.iloc[:, ::-1] - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("level", ["A", 0]) # GH 21052 - def test_sort_index_multiindex(self, level): - # GH13496 - - # sort rows by specified level of multi-index - mi = MultiIndex.from_tuples( - [[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC") - ) - df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) - - expected_mi = MultiIndex.from_tuples( - [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC") - ) - expected = pd.DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) - result = df.sort_index(level=level) - tm.assert_frame_equal(result, expected) - - # sort_remaining=False - expected_mi = MultiIndex.from_tuples( - [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC") - ) - expected = pd.DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) - result = df.sort_index(level=level, sort_remaining=False) - tm.assert_frame_equal(result, expected) - - def test_sort_index_intervalindex(self): - # this is a de-facto sort via unstack - # confirming that we sort in the order of the bins - y = Series(np.random.randn(100)) - x1 = Series(np.sign(np.random.randn(100))) - x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3]) - model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"]) - - result = model.groupby(["X1", "X2"], observed=True).mean().unstack() - expected = IntervalIndex.from_tuples( - [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right" - ) - result = result.columns.levels[1].categories - tm.assert_index_equal(result, expected) - - def test_sort_index_na_position_with_categories(self): - # GH 22556 + def test_sort_values_na_position_with_categories(self): + # GH#22556 # Positioning missing value properly when column is Categorical. 
categories = ["A", "B", "C"] category_indices = [0, 2, 4] @@ -722,7 +427,27 @@ def test_sort_index_na_position_with_categories(self): tm.assert_frame_equal(result, expected) - def test_sort_index_na_position_with_categories_raises(self): + def test_sort_values_nat(self): + + # GH#16836 + + d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] + d2 = [ + Timestamp(x) + for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] + ] + df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] + d4 = [ + Timestamp(x) + for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] + ] + expected = pd.DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=["a", "b"]) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_na_position_with_categories_raises(self): df = pd.DataFrame( { "c": pd.Categorical( @@ -735,3 +460,59 @@ def test_sort_index_na_position_with_categories_raises(self): with pytest.raises(ValueError): df.sort_values(by="c", ascending=False, na_position="bad_position") + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + True, + [0, 1, 2], + ), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + False, + [2, 1, 0], + ), + ], + ) + def test_sort_values_ignore_index( + self, inplace, original_dict, sorted_dict, ignore_index, output_index + ): + # GH 30114 + df = DataFrame(original_dict) + expected = DataFrame(sorted_dict, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} + + if inplace: + result_df = df.copy() + result_df.sort_values("A", ascending=False, **kwargs) + else: + result_df = df.sort_values("A", ascending=False, **kwargs) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(original_dict)) + + def test_sort_values_nat_na_position_default(self): + # GH 13230 + expected = pd.DataFrame( + { + "A": [1, 2, 3, 4, 4], + "date": pd.DatetimeIndex( + [ + "2010-01-01 09:00:00", + "2010-01-01 09:00:01", + "2010-01-01 09:00:02", + "2010-01-01 09:00:03", + "NaT", + ] + ), + } + ) + result = expected.sort_values(["A", "date"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py new file mode 100644 index 0000000000000..7b0adceb57668 --- /dev/null +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -0,0 +1,258 @@ +from collections import OrderedDict, defaultdict +from datetime import datetime + +import numpy as np +import pytest +import pytz + +from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm + + +class TestDataFrameToDict: + def test_to_dict_timestamp(self): + + # GH#11247 + # split/records producing np.datetime64 rather than Timestamps + # on datetime64[ns] dtypes only + + tsmp = Timestamp("20130101") + test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]}) + test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]}) + + expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}] + expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}] + + assert test_data.to_dict(orient="records") == expected_records + assert 
test_data_mixed.to_dict(orient="records") == expected_records_mixed + + expected_series = { + "A": Series([tsmp, tsmp], name="A"), + "B": Series([tsmp, tsmp], name="B"), + } + expected_series_mixed = { + "A": Series([tsmp, tsmp], name="A"), + "B": Series([1, 2], name="B"), + } + + tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series) + tm.assert_dict_equal( + test_data_mixed.to_dict(orient="series"), expected_series_mixed + ) + + expected_split = { + "index": [0, 1], + "data": [[tsmp, tsmp], [tsmp, tsmp]], + "columns": ["A", "B"], + } + expected_split_mixed = { + "index": [0, 1], + "data": [[tsmp, 1], [tsmp, 2]], + "columns": ["A", "B"], + } + + tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split) + tm.assert_dict_equal( + test_data_mixed.to_dict(orient="split"), expected_split_mixed + ) + + def test_to_dict_index_not_unique_with_index_orient(self): + # GH#22801 + # Data loss when indexes are not unique. Raise ValueError. + df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"]) + msg = "DataFrame index must be unique for orient='index'" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient="index") + + def test_to_dict_invalid_orient(self): + df = DataFrame({"A": [0, 1]}) + msg = "orient 'xinvalid' not understood" + with pytest.raises(ValueError, match=msg): + df.to_dict(orient="xinvalid") + + @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict]) + def test_to_dict(self, mapping): + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} + + # GH#16122 + recons_data = DataFrame(test_data).to_dict(into=mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][k2] + + recons_data = DataFrame(test_data).to_dict("l", mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][int(k2) - 1] + + recons_data = DataFrame(test_data).to_dict("s", mapping) + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k][k2] + + recons_data = DataFrame(test_data).to_dict("sp", mapping) + expected_split = { + "columns": ["A", "B"], + "index": ["1", "2", "3"], + "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]], + } + tm.assert_dict_equal(recons_data, expected_split) + + recons_data = DataFrame(test_data).to_dict("r", mapping) + expected_records = [ + {"A": 1.0, "B": "1"}, + {"A": 2.0, "B": "2"}, + {"A": np.nan, "B": "3"}, + ] + assert isinstance(recons_data, list) + assert len(recons_data) == 3 + for l, r in zip(recons_data, expected_records): + tm.assert_dict_equal(l, r) + + # GH#10844 + recons_data = DataFrame(test_data).to_dict("i") + + for k, v in test_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k2][k] + + df = DataFrame(test_data) + df["duped"] = df[df.columns[0]] + recons_data = df.to_dict("i") + comp_data = test_data.copy() + comp_data["duped"] = comp_data[df.columns[0]] + for k, v in comp_data.items(): + for k2, v2 in v.items(): + assert v2 == recons_data[k2][k] + + @pytest.mark.parametrize("mapping", [list, defaultdict, []]) + def test_to_dict_errors(self, mapping): + # GH#16122 + df = DataFrame(np.random.randn(3, 3)) + with pytest.raises(TypeError): + df.to_dict(into=mapping) + + def test_to_dict_not_unique_warning(self): + # GH#16927: When converting to a dict, if a column has a non-unique name + # it will be dropped, throwing a warning. 
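For context on the DataFrame.to_dict tests above, a short sketch of the main orients and of the duplicate-column warning; the example frame and its values are invented for illustration.

import warnings

import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [0.1, 0.2]})

# orient="dict" (the default): {column -> {index label -> value}}
print(df.to_dict())                  # {'A': {0: 1, 1: 2}, 'B': {0: 0.1, 1: 0.2}}
# orient="records": one dict per row, values boxed as Python scalars
print(df.to_dict(orient="records"))  # [{'A': 1, 'B': 0.1}, {'A': 2, 'B': 0.2}]
# orient="split": index, columns and data kept separately
print(df.to_dict(orient="split"))

# A duplicated column label can only keep one of the columns in the dict,
# so to_dict warns about the silent data loss (GH#16927).
dup = pd.DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    dup.to_dict()
assert any(issubclass(w.category, UserWarning) for w in caught)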
+ df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"]) + with tm.assert_produces_warning(UserWarning): + df.to_dict() + + # orient - orient argument to to_dict function + # item_getter - function for extracting value from + # the resulting dict using column name and index + @pytest.mark.parametrize( + "orient,item_getter", + [ + ("dict", lambda d, col, idx: d[col][idx]), + ("records", lambda d, col, idx: d[idx][col]), + ("list", lambda d, col, idx: d[col][idx]), + ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]), + ("index", lambda d, col, idx: d[idx][col]), + ], + ) + def test_to_dict_box_scalars(self, orient, item_getter): + # GH#14216, GH#23753 + # make sure that we are boxing properly + df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]}) + result = df.to_dict(orient=orient) + assert isinstance(item_getter(result, "a", 0), int) + assert isinstance(item_getter(result, "b", 0), float) + + def test_to_dict_tz(self): + # GH#18372 When converting to dict with orient='records' columns of + # datetime that are tz-aware were not converted to required arrays + data = [ + (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),), + ] + df = DataFrame(list(data), columns=["d"]) + + result = df.to_dict(orient="records") + expected = [ + {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)}, + {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)}, + ] + tm.assert_dict_equal(result[0], expected[0]) + tm.assert_dict_equal(result[1], expected[1]) + + @pytest.mark.parametrize( + "into, expected", + [ + ( + dict, + { + 0: {"int_col": 1, "float_col": 1.0}, + 1: {"int_col": 2, "float_col": 2.0}, + 2: {"int_col": 3, "float_col": 3.0}, + }, + ), + ( + OrderedDict, + OrderedDict( + [ + (0, {"int_col": 1, "float_col": 1.0}), + (1, {"int_col": 2, "float_col": 2.0}), + (2, {"int_col": 3, "float_col": 3.0}), + ] + ), + ), + ( + defaultdict(dict), + defaultdict( + dict, + { + 0: {"int_col": 1, "float_col": 1.0}, + 1: {"int_col": 2, "float_col": 2.0}, + 2: {"int_col": 3, "float_col": 3.0}, + }, + ), + ), + ], + ) + def test_to_dict_index_dtypes(self, into, expected): + # GH#18580 + # When using to_dict(orient='index') on a dataframe with int + # and float columns only the int columns were cast to float + + df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]}) + + result = df.to_dict(orient="index", into=into) + cols = ["int_col", "float_col"] + result = DataFrame.from_dict(result, orient="index")[cols] + expected = DataFrame.from_dict(expected, orient="index")[cols] + tm.assert_frame_equal(result, expected) + + def test_to_dict_numeric_names(self): + # GH#24940 + df = DataFrame({str(i): [i] for i in range(5)}) + result = set(df.to_dict("records")[0].keys()) + expected = set(df.columns) + assert result == expected + + def test_to_dict_wide(self): + # GH#24939 + df = DataFrame({("A_{:d}".format(i)): [i] for i in range(256)}) + result = df.to_dict("records")[0] + expected = {"A_{:d}".format(i): i for i in range(256)} + assert result == expected + + def test_to_dict_orient_dtype(self): + # GH#22620 + # Input Data + input_data = {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["X", "Y", "Z"]} + df = DataFrame(input_data) + # Expected Dtypes + expected = {"a": int, "b": float, "c": str} + # Extracting dtypes out of to_dict operation + for df_dict in df.to_dict("records"): + result = { + "a": type(df_dict["a"]), + "b": type(df_dict["b"]), + "c": type(df_dict["c"]), + } + assert result == expected diff 
--git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/methods/test_to_records.py similarity index 57% rename from pandas/tests/frame/test_convert_to.py rename to pandas/tests/frame/methods/test_to_records.py index 17edd48e36563..54a3affdc3024 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -1,107 +1,25 @@ -from collections import OrderedDict, abc, defaultdict -from datetime import datetime +from collections import abc import numpy as np import pytest -import pytz - -from pandas import ( - CategoricalDtype, - DataFrame, - MultiIndex, - Series, - Timestamp, - date_range, -) -import pandas.util.testing as tm - - -class TestDataFrameConvertTo: - def test_to_dict_timestamp(self): - - # GH11247 - # split/records producing np.datetime64 rather than Timestamps - # on datetime64[ns] dtypes only - - tsmp = Timestamp("20130101") - test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]}) - test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]}) - - expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}] - expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}] - - assert test_data.to_dict(orient="records") == expected_records - assert test_data_mixed.to_dict(orient="records") == expected_records_mixed - - expected_series = { - "A": Series([tsmp, tsmp], name="A"), - "B": Series([tsmp, tsmp], name="B"), - } - expected_series_mixed = { - "A": Series([tsmp, tsmp], name="A"), - "B": Series([1, 2], name="B"), - } - - tm.assert_dict_equal(test_data.to_dict(orient="series"), expected_series) - tm.assert_dict_equal( - test_data_mixed.to_dict(orient="series"), expected_series_mixed - ) - - expected_split = { - "index": [0, 1], - "data": [[tsmp, tsmp], [tsmp, tsmp]], - "columns": ["A", "B"], - } - expected_split_mixed = { - "index": [0, 1], - "data": [[tsmp, 1], [tsmp, 2]], - "columns": ["A", "B"], - } - - tm.assert_dict_equal(test_data.to_dict(orient="split"), expected_split) - tm.assert_dict_equal( - test_data_mixed.to_dict(orient="split"), expected_split_mixed - ) - def test_to_dict_index_not_unique_with_index_orient(self): - # GH22801 - # Data loss when indexes are not unique. Raise ValueError. 
- df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"]) - msg = "DataFrame index must be unique for orient='index'" - with pytest.raises(ValueError, match=msg): - df.to_dict(orient="index") +from pandas import CategoricalDtype, DataFrame, MultiIndex, Series, date_range +import pandas._testing as tm - def test_to_dict_invalid_orient(self): - df = DataFrame({"A": [0, 1]}) - msg = "orient 'xinvalid' not understood" - with pytest.raises(ValueError, match=msg): - df.to_dict(orient="xinvalid") +class TestDataFrameToRecords: def test_to_records_dt64(self): df = DataFrame( [["one", "two", "three"], ["four", "five", "six"]], index=date_range("2012-01-01", "2012-01-02"), ) - # convert_datetime64 defaults to None expected = df.index.values[0] result = df.to_records()["index"][0] assert expected == result - # check for FutureWarning if convert_datetime64=False is passed - with tm.assert_produces_warning(FutureWarning): - expected = df.index.values[0] - result = df.to_records(convert_datetime64=False)["index"][0] - assert expected == result - - # check for FutureWarning if convert_datetime64=True is passed - with tm.assert_produces_warning(FutureWarning): - expected = df.index[0] - result = df.to_records(convert_datetime64=True)["index"][0] - assert expected == result - def test_to_records_with_multindex(self): - # GH3189 + # GH#3189 index = [ ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], ["one", "two", "one", "two", "one", "two", "one", "two"], @@ -149,7 +67,7 @@ def test_to_records_index_name(self): assert "level_0" in rs.dtype.fields def test_to_records_with_unicode_index(self): - # GH13172 + # GH#13172 # unicode_literals conflict with to_records result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records() expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")]) @@ -157,7 +75,7 @@ def test_to_records_with_unicode_index(self): def test_to_records_with_unicode_column_names(self): # xref issue: https://github.com/numpy/numpy/issues/2407 - # Issue #11879. to_records used to raise an exception when used + # Issue GH#11879. 
to_records used to raise an exception when used # with column names containing non-ascii characters in Python 2 result = DataFrame(data={"accented_name_é": [1.0]}).to_records() @@ -170,8 +88,7 @@ def test_to_records_with_unicode_column_names(self): tm.assert_almost_equal(result, expected) def test_to_records_with_categorical(self): - - # GH8626 + # GH#8626 # dict creation df = DataFrame({"A": list("abc")}, dtype="category") @@ -323,7 +240,7 @@ def test_to_records_with_categorical(self): ], ) def test_to_records_dtype(self, kwargs, expected): - # see gh-18146 + # see GH#18146 df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]}) if not isinstance(expected, np.recarray): @@ -396,12 +313,12 @@ def test_to_records_dtype(self, kwargs, expected): ], ) def test_to_records_dtype_mi(self, df, kwargs, expected): - # see gh-18146 + # see GH#18146 result = df.to_records(**kwargs) tm.assert_almost_equal(result, expected) def test_to_records_dict_like(self): - # see gh-18146 + # see GH#18146 class DictLike: def __init__(self, **kwargs): self.d = kwargs.copy() @@ -409,7 +326,7 @@ def __init__(self, **kwargs): def __getitem__(self, key): return self.d.__getitem__(key) - def __contains__(self, key): + def __contains__(self, key) -> bool: return key in self.d def keys(self): @@ -429,81 +346,9 @@ def keys(self): ) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict]) - def test_to_dict(self, mapping): - test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} - - # GH16122 - recons_data = DataFrame(test_data).to_dict(into=mapping) - - for k, v in test_data.items(): - for k2, v2 in v.items(): - assert v2 == recons_data[k][k2] - - recons_data = DataFrame(test_data).to_dict("l", mapping) - - for k, v in test_data.items(): - for k2, v2 in v.items(): - assert v2 == recons_data[k][int(k2) - 1] - - recons_data = DataFrame(test_data).to_dict("s", mapping) - - for k, v in test_data.items(): - for k2, v2 in v.items(): - assert v2 == recons_data[k][k2] - - recons_data = DataFrame(test_data).to_dict("sp", mapping) - expected_split = { - "columns": ["A", "B"], - "index": ["1", "2", "3"], - "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]], - } - tm.assert_dict_equal(recons_data, expected_split) - - recons_data = DataFrame(test_data).to_dict("r", mapping) - expected_records = [ - {"A": 1.0, "B": "1"}, - {"A": 2.0, "B": "2"}, - {"A": np.nan, "B": "3"}, - ] - assert isinstance(recons_data, list) - assert len(recons_data) == 3 - for l, r in zip(recons_data, expected_records): - tm.assert_dict_equal(l, r) - - # GH10844 - recons_data = DataFrame(test_data).to_dict("i") - - for k, v in test_data.items(): - for k2, v2 in v.items(): - assert v2 == recons_data[k2][k] - - df = DataFrame(test_data) - df["duped"] = df[df.columns[0]] - recons_data = df.to_dict("i") - comp_data = test_data.copy() - comp_data["duped"] = comp_data[df.columns[0]] - for k, v in comp_data.items(): - for k2, v2 in v.items(): - assert v2 == recons_data[k2][k] - - @pytest.mark.parametrize("mapping", [list, defaultdict, []]) - def test_to_dict_errors(self, mapping): - # GH16122 - df = DataFrame(np.random.randn(3, 3)) - with pytest.raises(TypeError): - df.to_dict(into=mapping) - - def test_to_dict_not_unique_warning(self): - # GH16927: When converting to a dict, if a column has a non-unique name - # it will be dropped, throwing a warning. 
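Stepping back to the to_records hunks above: with the deprecated convert_datetime64 path removed, per-column output dtypes are requested through column_dtypes instead. A minimal sketch with an illustrative frame:

import pandas as pd

# to_records returns a NumPy record array; the index is included unless
# index=False, and column_dtypes lets callers pick per-column output dtypes.
df = pd.DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
rec = df.to_records(index=False, column_dtypes={"A": "int32"})
print(rec.dtype)  # A -> int32, B stays float64, C is object
print(rec[0])     # (1, 0.2, 'a')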
- df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"]) - with tm.assert_produces_warning(UserWarning): - df.to_dict() - @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"]) def test_to_records_datetimeindex_with_tz(self, tz): - # GH13937 + # GH#13937 dr = date_range("2016-01-01", periods=10, freq="S", tz=tz) df = DataFrame({"datetime": dr}, index=dr) @@ -513,102 +358,3 @@ def test_to_records_datetimeindex_with_tz(self, tz): # both converted to UTC, so they are equal tm.assert_numpy_array_equal(result, expected) - - # orient - orient argument to to_dict function - # item_getter - function for extracting value from - # the resulting dict using column name and index - @pytest.mark.parametrize( - "orient,item_getter", - [ - ("dict", lambda d, col, idx: d[col][idx]), - ("records", lambda d, col, idx: d[idx][col]), - ("list", lambda d, col, idx: d[col][idx]), - ("split", lambda d, col, idx: d["data"][idx][d["columns"].index(col)]), - ("index", lambda d, col, idx: d[idx][col]), - ], - ) - def test_to_dict_box_scalars(self, orient, item_getter): - # 14216, 23753 - # make sure that we are boxing properly - df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]}) - result = df.to_dict(orient=orient) - assert isinstance(item_getter(result, "a", 0), int) - assert isinstance(item_getter(result, "b", 0), float) - - def test_frame_to_dict_tz(self): - # GH18372 When converting to dict with orient='records' columns of - # datetime that are tz-aware were not converted to required arrays - data = [ - (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), - (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc),), - ] - df = DataFrame(list(data), columns=["d"]) - - result = df.to_dict(orient="records") - expected = [ - {"d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)}, - {"d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)}, - ] - tm.assert_dict_equal(result[0], expected[0]) - tm.assert_dict_equal(result[1], expected[1]) - - @pytest.mark.parametrize( - "into, expected", - [ - ( - dict, - { - 0: {"int_col": 1, "float_col": 1.0}, - 1: {"int_col": 2, "float_col": 2.0}, - 2: {"int_col": 3, "float_col": 3.0}, - }, - ), - ( - OrderedDict, - OrderedDict( - [ - (0, {"int_col": 1, "float_col": 1.0}), - (1, {"int_col": 2, "float_col": 2.0}), - (2, {"int_col": 3, "float_col": 3.0}), - ] - ), - ), - ( - defaultdict(dict), - defaultdict( - dict, - { - 0: {"int_col": 1, "float_col": 1.0}, - 1: {"int_col": 2, "float_col": 2.0}, - 2: {"int_col": 3, "float_col": 3.0}, - }, - ), - ), - ], - ) - def test_to_dict_index_dtypes(self, into, expected): - # GH 18580 - # When using to_dict(orient='index') on a dataframe with int - # and float columns only the int columns were cast to float - - df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]}) - - result = df.to_dict(orient="index", into=into) - cols = ["int_col", "float_col"] - result = DataFrame.from_dict(result, orient="index")[cols] - expected = DataFrame.from_dict(expected, orient="index")[cols] - tm.assert_frame_equal(result, expected) - - def test_to_dict_numeric_names(self): - # https://github.com/pandas-dev/pandas/issues/24940 - df = DataFrame({str(i): [i] for i in range(5)}) - result = set(df.to_dict("records")[0].keys()) - expected = set(df.columns) - assert result == expected - - def test_to_dict_wide(self): - # https://github.com/pandas-dev/pandas/issues/24939 - df = DataFrame({("A_{:d}".format(i)): [i] for i in range(256)}) - result = df.to_dict("records")[0] - expected = {"A_{:d}".format(i): i for i in range(256)} - 
assert result == expected diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py new file mode 100644 index 0000000000000..428b9e5068407 --- /dev/null +++ b/pandas/tests/frame/methods/test_transpose.py @@ -0,0 +1,43 @@ +import pandas as pd +import pandas._testing as tm + + +class TestTranspose: + def test_transpose_tzaware_1col_single_tz(self): + # GH#26825 + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + + df = pd.DataFrame(dti) + assert (df.dtypes == dti.dtype).all() + res = df.T + assert (res.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_single_tz(self): + # GH#26825 + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + + df3 = pd.DataFrame({"A": dti, "B": dti}) + assert (df3.dtypes == dti.dtype).all() + res3 = df3.T + assert (res3.dtypes == dti.dtype).all() + + def test_transpose_tzaware_2col_mixed_tz(self): + # GH#26825 + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") + + df4 = pd.DataFrame({"A": dti, "B": dti2}) + assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() + assert (df4.T.dtypes == object).all() + tm.assert_frame_equal(df4.T.T, df4) + + def test_transpose_object_to_tzaware_mixed_tz(self): + # GH#26825 + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") + + # mixed all-tzaware dtypes + df2 = pd.DataFrame([dti, dti2]) + assert (df2.dtypes == object).all() + res2 = df2.T + assert (res2.dtypes == [dti.dtype, dti2.dtype]).all() diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py new file mode 100644 index 0000000000000..ad86ee1266874 --- /dev/null +++ b/pandas/tests/frame/methods/test_truncate.py @@ -0,0 +1,89 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestDataFrameTruncate: + def test_truncate(self, datetime_frame): + ts = datetime_frame[::3] + + start, end = datetime_frame.index[3], datetime_frame.index[6] + + start_missing = datetime_frame.index[2] + end_missing = datetime_frame.index[7] + + # neither specified + truncated = ts.truncate() + tm.assert_frame_equal(truncated, ts) + + # both specified + expected = ts[1:3] + + truncated = ts.truncate(start, end) + tm.assert_frame_equal(truncated, expected) + + truncated = ts.truncate(start_missing, end_missing) + tm.assert_frame_equal(truncated, expected) + + # start specified + expected = ts[1:] + + truncated = ts.truncate(before=start) + tm.assert_frame_equal(truncated, expected) + + truncated = ts.truncate(before=start_missing) + tm.assert_frame_equal(truncated, expected) + + # end specified + expected = ts[:3] + + truncated = ts.truncate(after=end) + tm.assert_frame_equal(truncated, expected) + + truncated = ts.truncate(after=end_missing) + tm.assert_frame_equal(truncated, expected) + + msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00" + with pytest.raises(ValueError, match=msg): + ts.truncate( + before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq + ) + + def test_truncate_copy(self, datetime_frame): + index = datetime_frame.index + truncated = datetime_frame.truncate(index[5], index[10]) + truncated.values[:] = 5.0 + assert not (datetime_frame.values[5:11] == 5).any() + + def test_truncate_nonsortedindex(self): + # GH#17935 + + df = pd.DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0]) + msg = "truncate requires a sorted index" + with pytest.raises(ValueError, 
match=msg): + df.truncate(before=3, after=9) + + rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") + ts = pd.DataFrame( + {"A": np.random.randn(len(rng)), "B": np.random.randn(len(rng))}, index=rng + ) + msg = "truncate requires a sorted index" + with pytest.raises(ValueError, match=msg): + ts.sort_values("A", ascending=False).truncate( + before="2011-11", after="2011-12" + ) + + df = pd.DataFrame( + { + 3: np.random.randn(5), + 20: np.random.randn(5), + 2: np.random.randn(5), + 0: np.random.randn(5), + }, + columns=[3, 20, 2, 0], + ) + msg = "truncate requires a sorted index" + with pytest.raises(ValueError, match=msg): + df.truncate(before=2, after=20, axis=1) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 11d73fc37105e..602ea9ca0471a 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -25,7 +25,7 @@ date_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameAlterAxes: @@ -341,8 +341,8 @@ def __init__(self, name, color): self.name = name self.color = color - def __str__(self): - return "".format(self=self) + def __str__(self) -> str: + return f"" # necessary for pretty KeyError __repr__ = __str__ @@ -380,8 +380,8 @@ def test_set_index_custom_label_hashable_iterable(self): class Thing(frozenset): # need to stabilize repr for KeyError (due to random order in sets) - def __repr__(self): - tmp = sorted(list(self)) + def __repr__(self) -> str: + tmp = sorted(self) # double curly brace prints one brace in format string return "frozenset({{{}}})".format(", ".join(map(repr, tmp))) @@ -418,8 +418,8 @@ def __init__(self, name, color): self.name = name self.color = color - def __str__(self): - return "".format(self=self) + def __str__(self) -> str: + return f"" thing1 = Thing("One", "red") thing2 = Thing("Two", "blue") @@ -493,29 +493,29 @@ def test_convert_dti_to_series(self): tm.assert_series_equal(result, expected) # convert to series while keeping the timezone - result = idx.to_series(keep_tz=True, index=[0, 1]) + msg = "stop passing 'keep_tz'" + with tm.assert_produces_warning(FutureWarning) as m: + result = idx.to_series(keep_tz=True, index=[0, 1]) tm.assert_series_equal(result, expected) + assert msg in str(m[0].message) # convert to utc - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning) as m: df["B"] = idx.to_series(keep_tz=False, index=[0, 1]) result = df["B"] comp = Series(DatetimeIndex(expected.values).tz_localize(None), name="B") tm.assert_series_equal(result, comp) - - with tm.assert_produces_warning(FutureWarning) as m: - result = idx.to_series(index=[0, 1]) - tm.assert_series_equal(result, expected.dt.tz_convert(None)) - msg = ( - "The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change to True in a future " - "release." 
- ) + msg = "do 'idx.tz_convert(None)' before calling" assert msg in str(m[0].message) - with tm.assert_produces_warning(FutureWarning): + result = idx.to_series(index=[0, 1]) + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning) as m: result = idx.to_series(keep_tz=False, index=[0, 1]) tm.assert_series_equal(result, expected.dt.tz_convert(None)) + msg = "do 'idx.tz_convert(None)' before calling" + assert msg in str(m[0].message) # list of datetimes with a tz df["B"] = idx.to_pydatetime() @@ -745,8 +745,7 @@ def test_rename_axis_mapper(self): # GH 19978 mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) df = DataFrame( - {"x": [i for i in range(len(mi))], "y": [i * 10 for i in range(len(mi))]}, - index=mi, + {"x": list(range(len(mi))), "y": [i * 10 for i in range(len(mi))]}, index=mi ) # Test for rename of the Index object of columns @@ -1313,7 +1312,7 @@ def test_rename_mapper_multi(self): def test_rename_positional_named(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) - result = df.rename(str.lower, columns=str.upper) + result = df.rename(index=str.lower, columns=str.upper) expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) tm.assert_frame_equal(result, expected) @@ -1337,12 +1336,12 @@ def test_rename_axis_style_raises(self): # Multiple targets and axis with pytest.raises(TypeError, match=over_spec_msg): - df.rename(str.lower, str.lower, axis="columns") + df.rename(str.lower, index=str.lower, axis="columns") # Too many targets - over_spec_msg = "Cannot specify all of 'mapper', 'index', 'columns'." + over_spec_msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" with pytest.raises(TypeError, match=over_spec_msg): - df.rename(str.lower, str.lower, str.lower) + df.rename(str.lower, index=str.lower, columns=str.lower) # Duplicates with pytest.raises(TypeError, match="multiple values"): @@ -1376,16 +1375,42 @@ def test_reindex_api_equivalence(self): for res in [res2, res3]: tm.assert_frame_equal(res1, res) - def test_rename_positional(self): + def test_rename_positional_raises(self): + # GH 29136 df = DataFrame(columns=["A", "B"]) - with tm.assert_produces_warning(FutureWarning) as rec: - result = df.rename(None, str.lower) - expected = DataFrame(columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - assert len(rec) == 1 - message = str(rec[0].message) - assert "rename" in message - assert "Use named arguments" in message + msg = r"rename\(\) takes from 1 to 2 positional arguments" + + with pytest.raises(TypeError, match=msg): + df.rename(None, str.lower) + + def test_rename_no_mappings_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "must pass an index to rename" + with pytest.raises(TypeError, match=msg): + df.rename() + + with pytest.raises(TypeError, match=msg): + df.rename(None, index=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None, index=None) + + def test_rename_mapper_and_positional_arguments_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" + with pytest.raises(TypeError, match=msg): + df.rename({}, index={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}, index={}) def test_assign_columns(self, float_frame): 
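As background for the rename changes above (GH#29136): axis mappers now go through keyword arguments, and mixing a positional mapper with index=/columns= raises. A small sketch with made-up data:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"])

# Axis-specific mappers are passed by keyword.
print(df.rename(index=str.lower, columns=str.upper))

# A single positional mapper is still allowed together with axis=.
print(df.rename(str.upper, axis="columns"))

# Combining a positional mapper with index=/columns= is rejected.
try:
    df.rename(str.lower, index=str.lower)
except TypeError as err:
    print(err)  # Cannot specify both 'mapper' and any of 'index' or 'columns'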
float_frame["hi"] = "there" @@ -1410,14 +1435,6 @@ def test_set_index_preserve_categorical_dtype(self): result = result.reindex(columns=df.columns) tm.assert_frame_equal(result, df) - def test_ambiguous_warns(self): - df = DataFrame({"A": [1, 2]}) - with tm.assert_produces_warning(FutureWarning): - df.rename(id, id) - - with tm.assert_produces_warning(FutureWarning): - df.rename({0: 10}, {"A": "B"}) - def test_rename_signature(self): sig = inspect.signature(DataFrame.rename) parameters = set(sig.parameters) @@ -1549,21 +1566,3 @@ def test_set_axis_inplace(self): for axis in 3, "foo": with pytest.raises(ValueError, match="No axis named"): df.set_axis(list("abc"), axis=axis) - - def test_set_axis_prior_to_deprecation_signature(self): - df = DataFrame( - {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, - index=[2010, 2011, 2012], - ) - - expected = {0: df.copy(), 1: df.copy()} - expected[0].index = list("abc") - expected[1].columns = list("abc") - expected["index"] = expected[0] - expected["columns"] = expected[1] - - # old signature - for axis in expected: - with tm.assert_produces_warning(FutureWarning): - result = df.set_axis(axis, list("abc"), inplace=False) - tm.assert_frame_equal(result, expected[axis]) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index e99208ac78e15..910230c737a2a 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1,7 +1,6 @@ from datetime import timedelta +from decimal import Decimal import operator -from string import ascii_lowercase -import warnings import numpy as np import pytest @@ -21,9 +20,9 @@ to_datetime, to_timedelta, ) +import pandas._testing as tm import pandas.core.algorithms as algorithms import pandas.core.nanops as nanops -import pandas.util.testing as tm def assert_stat_op_calc( @@ -259,606 +258,6 @@ def assert_bool_op_api( class TestDataFrameAnalytics: - # --------------------------------------------------------------------- - # Correlation and covariance - - @td.skip_if_no_scipy - def test_corr_pearson(self, float_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan - - self._check_method(float_frame, "pearson") - - @td.skip_if_no_scipy - def test_corr_kendall(self, float_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan - - self._check_method(float_frame, "kendall") - - @td.skip_if_no_scipy - def test_corr_spearman(self, float_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan - - self._check_method(float_frame, "spearman") - - def _check_method(self, frame, method="pearson"): - correls = frame.corr(method=method) - expected = frame["A"].corr(frame["C"], method=method) - tm.assert_almost_equal(correls["A"]["C"], expected) - - @td.skip_if_no_scipy - def test_corr_non_numeric(self, float_frame, float_string_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan - - # exclude non-numeric types - result = float_string_frame.corr() - expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() - tm.assert_frame_equal(result, expected) - - @td.skip_if_no_scipy - @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) - def test_corr_nooverlap(self, meth): - # nothing in common - df = DataFrame( - { - "A": [1, 1.5, 1, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, np.nan, 1, 1.5, 1], - "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], - } - ) - rs = df.corr(meth) - assert isna(rs.loc["A", "B"]) - assert isna(rs.loc["B", "A"]) - assert 
rs.loc["A", "A"] == 1 - assert rs.loc["B", "B"] == 1 - assert isna(rs.loc["C", "C"]) - - @td.skip_if_no_scipy - @pytest.mark.parametrize("meth", ["pearson", "spearman"]) - def test_corr_constant(self, meth): - # constant --> all NA - - df = DataFrame( - { - "A": [1, 1, 1, np.nan, np.nan, np.nan], - "B": [np.nan, np.nan, np.nan, 1, 1, 1], - } - ) - rs = df.corr(meth) - assert isna(rs.values).all() - - def test_corr_int(self): - # dtypes other than float64 #1761 - df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) - - df3.cov() - df3.corr() - - @td.skip_if_no_scipy - def test_corr_int_and_boolean(self): - # when dtypes of pandas series are different - # then ndarray will have dtype=object, - # so it need to be properly handled - df = DataFrame({"a": [True, False], "b": [1, 0]}) - - expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) - for meth in ["pearson", "kendall", "spearman"]: - - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - result = df.corr(meth) - tm.assert_frame_equal(result, expected) - - def test_corr_cov_independent_index_column(self): - # GH 14617 - df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) - for method in ["cov", "corr"]: - result = getattr(df, method)() - assert result.index is not result.columns - assert result.index.equals(result.columns) - - def test_corr_invalid_method(self): - # GH 22298 - df = pd.DataFrame(np.random.normal(size=(10, 2))) - msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " - with pytest.raises(ValueError, match=msg): - df.corr(method="____") - - def test_cov(self, float_frame, float_string_frame): - # min_periods no NAs (corner case) - expected = float_frame.cov() - result = float_frame.cov(min_periods=len(float_frame)) - - tm.assert_frame_equal(expected, result) - - result = float_frame.cov(min_periods=len(float_frame) + 1) - assert isna(result.values).all() - - # with NAs - frame = float_frame.copy() - frame["A"][:5] = np.nan - frame["B"][5:10] = np.nan - result = float_frame.cov(min_periods=len(float_frame) - 8) - expected = float_frame.cov() - expected.loc["A", "B"] = np.nan - expected.loc["B", "A"] = np.nan - - # regular - float_frame["A"][:5] = np.nan - float_frame["B"][:10] = np.nan - cov = float_frame.cov() - - tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"])) - - # exclude non-numeric types - result = float_string_frame.cov() - expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() - tm.assert_frame_equal(result, expected) - - # Single column frame - df = DataFrame(np.linspace(0.0, 1.0, 10)) - result = df.cov() - expected = DataFrame( - np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns - ) - tm.assert_frame_equal(result, expected) - df.loc[0] = np.nan - result = df.cov() - expected = DataFrame( - np.cov(df.values[1:].T).reshape((1, 1)), - index=df.columns, - columns=df.columns, - ) - tm.assert_frame_equal(result, expected) - - def test_corrwith(self, datetime_frame): - a = datetime_frame - noise = Series(np.random.randn(len(a)), index=a.index) - - b = datetime_frame.add(noise, axis=0) - - # make sure order does not matter - b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) - del b["B"] - - colcorr = a.corrwith(b, axis=0) - tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) - - rowcorr = a.corrwith(b, axis=1) - tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) - - dropped = a.corrwith(b, axis=0, drop=True) - 
tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) - assert "B" not in dropped - - dropped = a.corrwith(b, axis=1, drop=True) - assert a.index[-1] not in dropped.index - - # non time-series data - index = ["a", "b", "c", "d", "e"] - columns = ["one", "two", "three", "four"] - df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns) - df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) - correls = df1.corrwith(df2, axis=1) - for row in index[:4]: - tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - - def test_corrwith_with_objects(self): - df1 = tm.makeTimeDataFrame() - df2 = tm.makeTimeDataFrame() - cols = ["A", "B", "C", "D"] - - df1["obj"] = "foo" - df2["obj"] = "bar" - - result = df1.corrwith(df2) - expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) - tm.assert_series_equal(result, expected) - - result = df1.corrwith(df2, axis=1) - expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) - tm.assert_series_equal(result, expected) - - def test_corrwith_series(self, datetime_frame): - result = datetime_frame.corrwith(datetime_frame["A"]) - expected = datetime_frame.apply(datetime_frame["A"].corr) - - tm.assert_series_equal(result, expected) - - def test_corrwith_matches_corrcoef(self): - df1 = DataFrame(np.arange(10000), columns=["a"]) - df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) - c1 = df1.corrwith(df2)["a"] - c2 = np.corrcoef(df1["a"], df2["a"])[0][1] - - tm.assert_almost_equal(c1, c2) - assert c1 < 1 - - def test_corrwith_mixed_dtypes(self): - # GH 18570 - df = pd.DataFrame( - {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} - ) - s = pd.Series([0, 6, 7, 3]) - result = df.corrwith(s) - corrs = [df["a"].corr(s), df["b"].corr(s)] - expected = pd.Series(data=corrs, index=["a", "b"]) - tm.assert_series_equal(result, expected) - - def test_corrwith_index_intersection(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) - - result = df1.corrwith(df2, drop=True).index.sort_values() - expected = df1.columns.intersection(df2.columns).sort_values() - tm.assert_index_equal(result, expected) - - def test_corrwith_index_union(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) - - result = df1.corrwith(df2, drop=False).index.sort_values() - expected = df1.columns.union(df2.columns).sort_values() - tm.assert_index_equal(result, expected) - - def test_corrwith_dup_cols(self): - # GH 21925 - df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) - df2 = df1.copy() - df2 = pd.concat((df2, df2[0]), axis=1) - - result = df1.corrwith(df2) - expected = pd.Series(np.ones(4), index=[0, 0, 1, 2]) - tm.assert_series_equal(result, expected) - - @td.skip_if_no_scipy - def test_corrwith_spearman(self): - # GH 21925 - df = pd.DataFrame(np.random.random(size=(100, 3))) - result = df.corrwith(df ** 2, method="spearman") - expected = Series(np.ones(len(result))) - tm.assert_series_equal(result, expected) - - @td.skip_if_no_scipy - def test_corrwith_kendall(self): - # GH 21925 - df = pd.DataFrame(np.random.random(size=(100, 3))) - result = df.corrwith(df ** 2, method="kendall") - expected = Series(np.ones(len(result))) - tm.assert_series_equal(result, expected) - - # --------------------------------------------------------------------- - # Describe - - def test_bool_describe_in_mixed_frame(self): - df = DataFrame( - 
{ - "string_data": ["a", "b", "c", "d", "e"], - "bool_data": [True, True, False, False, False], - "int_data": [10, 20, 30, 40, 50], - } - ) - - # Integer data are included in .describe() output, - # Boolean and string data are not. - result = df.describe() - expected = DataFrame( - {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - # Top value is a boolean value that is False - result = df.describe(include=["bool"]) - - expected = DataFrame( - {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] - ) - tm.assert_frame_equal(result, expected) - - def test_describe_empty_object(self): - # https://github.com/pandas-dev/pandas/issues/27183 - df = pd.DataFrame({"A": [None, None]}, dtype=object) - result = df.describe() - expected = pd.DataFrame( - {"A": [0, 0, np.nan, np.nan]}, - dtype=object, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - result = df.iloc[:0].describe() - tm.assert_frame_equal(result, expected) - - def test_describe_bool_frame(self): - # GH 13891 - df = pd.DataFrame( - { - "bool_data_1": [False, False, True, True], - "bool_data_2": [False, True, True, True], - } - ) - result = df.describe() - expected = DataFrame( - {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - df = pd.DataFrame( - { - "bool_data": [False, False, True, True, False], - "int_data": [0, 1, 2, 3, 4], - } - ) - result = df.describe() - expected = DataFrame( - {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_frame_equal(result, expected) - - df = pd.DataFrame( - {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} - ) - result = df.describe() - expected = DataFrame( - {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, - index=["count", "unique", "top", "freq"], - ) - tm.assert_frame_equal(result, expected) - - def test_describe_categorical(self): - df = DataFrame({"value": np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - cat_labels = Categorical(labels, labels) - - df = df.sort_values(by=["value"], ascending=True) - df["value_group"] = pd.cut( - df.value, range(0, 10500, 500), right=False, labels=cat_labels - ) - cat = df - - # Categoricals should not show up together with numerical columns - result = cat.describe() - assert len(result.columns) == 1 - - # In a frame, describe() for the cat should be the same as for string - # arrays (count, unique, top, freq) - - cat = Categorical( - ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True - ) - s = Series(cat) - result = s.describe() - expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) - tm.assert_series_equal(result, expected) - - cat = Series(Categorical(["a", "b", "c", "c"])) - df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]}) - result = df3.describe() - tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) - - def test_describe_empty_categorical_column(self): - # GH 26397 - # Ensure the index of an an empty categorical DataFrame column - # also contains (count, unique, top, freq) - df = pd.DataFrame({"empty_col": Categorical([])}) - result = df.describe() - expected = DataFrame( - {"empty_col": [0, 0, np.nan, np.nan]}, - 
index=["count", "unique", "top", "freq"], - dtype="object", - ) - tm.assert_frame_equal(result, expected) - # ensure NaN, not None - assert np.isnan(result.iloc[2, 0]) - assert np.isnan(result.iloc[3, 0]) - - def test_describe_categorical_columns(self): - # GH 11558 - columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") - df = DataFrame( - { - "int1": [10, 20, 30, 40, 50], - "int2": [10, 20, 30, 40, 50], - "obj": ["A", 0, None, "X", 1], - }, - columns=columns, - ) - result = df.describe() - - exp_columns = pd.CategoricalIndex( - ["int1", "int2"], - categories=["int1", "int2", "obj"], - ordered=True, - name="XXX", - ) - expected = DataFrame( - { - "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], - "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - columns=exp_columns, - ) - - tm.assert_frame_equal(result, expected) - tm.assert_categorical_equal(result.columns.values, expected.columns.values) - - def test_describe_datetime_columns(self): - columns = pd.DatetimeIndex( - ["2011-01-01", "2011-02-01", "2011-03-01"], - freq="MS", - tz="US/Eastern", - name="XXX", - ) - df = DataFrame( - { - 0: [10, 20, 30, 40, 50], - 1: [10, 20, 30, 40, 50], - 2: ["A", 0, None, "X", 1], - } - ) - df.columns = columns - result = df.describe() - - exp_columns = pd.DatetimeIndex( - ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" - ) - expected = DataFrame( - { - 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], - 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - expected.columns = exp_columns - tm.assert_frame_equal(result, expected) - assert result.columns.freq == "MS" - assert result.columns.tz == expected.columns.tz - - def test_describe_timedelta_values(self): - # GH 6145 - t1 = pd.timedelta_range("1 days", freq="D", periods=5) - t2 = pd.timedelta_range("1 hours", freq="H", periods=5) - df = pd.DataFrame({"t1": t1, "t2": t2}) - - expected = DataFrame( - { - "t1": [ - 5, - pd.Timedelta("3 days"), - df.iloc[:, 0].std(), - pd.Timedelta("1 days"), - pd.Timedelta("2 days"), - pd.Timedelta("3 days"), - pd.Timedelta("4 days"), - pd.Timedelta("5 days"), - ], - "t2": [ - 5, - pd.Timedelta("3 hours"), - df.iloc[:, 1].std(), - pd.Timedelta("1 hours"), - pd.Timedelta("2 hours"), - pd.Timedelta("3 hours"), - pd.Timedelta("4 hours"), - pd.Timedelta("5 hours"), - ], - }, - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - - result = df.describe() - tm.assert_frame_equal(result, expected) - - exp_repr = ( - " t1 t2\n" - "count 5 5\n" - "mean 3 days 00:00:00 0 days 03:00:00\n" - "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" - "min 1 days 00:00:00 0 days 01:00:00\n" - "25% 2 days 00:00:00 0 days 02:00:00\n" - "50% 3 days 00:00:00 0 days 03:00:00\n" - "75% 4 days 00:00:00 0 days 04:00:00\n" - "max 5 days 00:00:00 0 days 05:00:00" - ) - assert repr(result) == exp_repr - - def test_describe_tz_values(self, tz_naive_fixture): - # GH 21332 - tz = tz_naive_fixture - s1 = Series(range(5)) - start = Timestamp(2018, 1, 1) - end = Timestamp(2018, 1, 5) - s2 = Series(date_range(start, end, tz=tz)) - df = pd.DataFrame({"s1": s1, "s2": s2}) - - expected = DataFrame( - { - "s1": [ - 5, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - 2, - 1.581139, - 0, - 1, - 2, - 3, - 4, - ], - "s2": [ - 5, - 5, - s2.value_counts().index[0], - 1, - start.tz_localize(tz), - end.tz_localize(tz), - np.nan, - np.nan, - 
np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - ], - }, - index=[ - "count", - "unique", - "top", - "freq", - "first", - "last", - "mean", - "std", - "min", - "25%", - "50%", - "75%", - "max", - ], - ) - result = df.describe(include="all") - tm.assert_frame_equal(result, expected) - - def test_describe_percentiles_integer_idx(self): - # Issue 26660 - df = pd.DataFrame({"x": [1]}) - pct = np.linspace(0, 1, 10 + 1) - result = df.describe(percentiles=pct) - - expected = DataFrame( - {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, - index=[ - "count", - "mean", - "std", - "min", - "0%", - "10%", - "20%", - "30%", - "40%", - "50%", - "60%", - "70%", - "80%", - "90%", - "100%", - "max", - ], - ) - tm.assert_frame_equal(result, expected) - # --------------------------------------------------------------------- # Reductions @@ -1066,13 +465,36 @@ def test_mean_mixed_datetime_numeric(self, tz): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) - def test_mean_excludeds_datetimes(self, tz): + def test_mean_excludes_datetimes(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 # Our long-term desired behavior is unclear, but the behavior in # 0.24.0rc1 was buggy. df = pd.DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2}) result = df.mean() - expected = pd.Series() + expected = pd.Series(dtype=np.float64) + tm.assert_series_equal(result, expected) + + def test_mean_mixed_string_decimal(self): + # GH 11670 + # possible bug when calculating mean of DataFrame? + + d = [ + {"A": 2, "B": None, "C": Decimal("628.00")}, + {"A": 1, "B": None, "C": Decimal("383.00")}, + {"A": 3, "B": None, "C": Decimal("651.00")}, + {"A": 2, "B": None, "C": Decimal("575.00")}, + {"A": 4, "B": None, "C": Decimal("1114.00")}, + {"A": 1, "B": "TEST", "C": Decimal("241.00")}, + {"A": 2, "B": None, "C": Decimal("572.00")}, + {"A": 4, "B": None, "C": Decimal("609.00")}, + {"A": 3, "B": None, "C": Decimal("820.00")}, + {"A": 5, "B": None, "C": Decimal("1223.00")}, + ] + + df = pd.DataFrame(d) + + result = df.mean() + expected = pd.Series([2.7, 681.6], index=["A", "C"]) tm.assert_series_equal(result, expected) def test_var_std(self, datetime_frame): @@ -1220,7 +642,7 @@ def test_mode_dropna(self, dropna, expected): } ) - result = df[sorted(list(expected.keys()))].mode(dropna=dropna) + result = df[sorted(expected.keys())].mode(dropna=dropna) expected = DataFrame(expected) tm.assert_frame_equal(result, expected) @@ -1471,162 +893,6 @@ def test_sum_bools(self): bools = isna(df) assert bools.sum(axis=1)[0] == 10 - # --------------------------------------------------------------------- - # Cumulative Reductions - cumsum, cummax, ... 
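The block of cumulative-reduction tests removed next (cumsum, cumprod, cummin, cummax) covers behaviour that is easy to summarise; an illustrative sketch with a made-up frame:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, 6.0]})

# Cumulative reductions run down each column by default and across each row
# with axis=1; a NaN stays NaN in the output but does not reset the running
# accumulation for later entries.
print(df.cumsum())        # a: 1.0, NaN, 4.0   b: 4.0, 9.0, 15.0
print(df.cummax(axis=1))  # row-wise running maxima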
- - def test_cumsum_corner(self): - dm = DataFrame(np.arange(20).reshape(4, 5), index=range(4), columns=range(5)) - # ?(wesm) - result = dm.cumsum() # noqa - - def test_cumsum(self, datetime_frame): - datetime_frame.loc[5:10, 0] = np.nan - datetime_frame.loc[10:15, 1] = np.nan - datetime_frame.loc[15:, 2] = np.nan - - # axis = 0 - cumsum = datetime_frame.cumsum() - expected = datetime_frame.apply(Series.cumsum) - tm.assert_frame_equal(cumsum, expected) - - # axis = 1 - cumsum = datetime_frame.cumsum(axis=1) - expected = datetime_frame.apply(Series.cumsum, axis=1) - tm.assert_frame_equal(cumsum, expected) - - # works - df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) - result = df.cumsum() # noqa - - # fix issue - cumsum_xs = datetime_frame.cumsum(axis=1) - assert np.shape(cumsum_xs) == np.shape(datetime_frame) - - def test_cumprod(self, datetime_frame): - datetime_frame.loc[5:10, 0] = np.nan - datetime_frame.loc[10:15, 1] = np.nan - datetime_frame.loc[15:, 2] = np.nan - - # axis = 0 - cumprod = datetime_frame.cumprod() - expected = datetime_frame.apply(Series.cumprod) - tm.assert_frame_equal(cumprod, expected) - - # axis = 1 - cumprod = datetime_frame.cumprod(axis=1) - expected = datetime_frame.apply(Series.cumprod, axis=1) - tm.assert_frame_equal(cumprod, expected) - - # fix issue - cumprod_xs = datetime_frame.cumprod(axis=1) - assert np.shape(cumprod_xs) == np.shape(datetime_frame) - - # ints - df = datetime_frame.fillna(0).astype(int) - df.cumprod(0) - df.cumprod(1) - - # ints32 - df = datetime_frame.fillna(0).astype(np.int32) - df.cumprod(0) - df.cumprod(1) - - def test_cummin(self, datetime_frame): - datetime_frame.loc[5:10, 0] = np.nan - datetime_frame.loc[10:15, 1] = np.nan - datetime_frame.loc[15:, 2] = np.nan - - # axis = 0 - cummin = datetime_frame.cummin() - expected = datetime_frame.apply(Series.cummin) - tm.assert_frame_equal(cummin, expected) - - # axis = 1 - cummin = datetime_frame.cummin(axis=1) - expected = datetime_frame.apply(Series.cummin, axis=1) - tm.assert_frame_equal(cummin, expected) - - # it works - df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) - result = df.cummin() # noqa - - # fix issue - cummin_xs = datetime_frame.cummin(axis=1) - assert np.shape(cummin_xs) == np.shape(datetime_frame) - - def test_cummax(self, datetime_frame): - datetime_frame.loc[5:10, 0] = np.nan - datetime_frame.loc[10:15, 1] = np.nan - datetime_frame.loc[15:, 2] = np.nan - - # axis = 0 - cummax = datetime_frame.cummax() - expected = datetime_frame.apply(Series.cummax) - tm.assert_frame_equal(cummax, expected) - - # axis = 1 - cummax = datetime_frame.cummax(axis=1) - expected = datetime_frame.apply(Series.cummax, axis=1) - tm.assert_frame_equal(cummax, expected) - - # it works - df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) - result = df.cummax() # noqa - - # fix issue - cummax_xs = datetime_frame.cummax(axis=1) - assert np.shape(cummax_xs) == np.shape(datetime_frame) - - # --------------------------------------------------------------------- - # Miscellanea - - def test_count(self): - # corner case - frame = DataFrame() - ct1 = frame.count(1) - assert isinstance(ct1, Series) - - ct2 = frame.count(0) - assert isinstance(ct2, Series) - - # GH#423 - df = DataFrame(index=range(10)) - result = df.count(1) - expected = Series(0, index=df.index) - tm.assert_series_equal(result, expected) - - df = DataFrame(columns=range(10)) - result = df.count(0) - expected = Series(0, index=df.columns) - tm.assert_series_equal(result, expected) - - df = DataFrame() - result = 
df.count() - expected = Series(0, index=[]) - tm.assert_series_equal(result, expected) - - def test_count_objects(self, float_string_frame): - dm = DataFrame(float_string_frame._series) - df = DataFrame(float_string_frame._series) - - tm.assert_series_equal(dm.count(), df.count()) - tm.assert_series_equal(dm.count(1), df.count(1)) - - def test_pct_change(self): - # GH#11150 - pnl = DataFrame( - [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] - ).astype(np.float64) - pnl.iat[1, 0] = np.nan - pnl.iat[1, 1] = np.nan - pnl.iat[2, 3] = 60 - - for axis in range(2): - expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 - result = pnl.pct_change(axis=axis, fill_method="pad") - - tm.assert_frame_equal(result, expected) - # ---------------------------------------------------------------------- # Index of max / min @@ -1864,560 +1130,6 @@ def test_any_all_level_axis_none_raises(self, method): with pytest.raises(ValueError, match=xpr): getattr(df, method)(axis=None, level="out") - # ---------------------------------------------------------------------- - # Isin - - def test_isin(self): - # GH 4211 - df = DataFrame( - { - "vals": [1, 2, 3, 4], - "ids": ["a", "b", "f", "n"], - "ids2": ["a", "n", "c", "n"], - }, - index=["foo", "bar", "baz", "qux"], - ) - other = ["a", "b", "c"] - - result = df.isin(other) - expected = DataFrame([df.loc[s].isin(other) for s in df.index]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) - def test_isin_empty(self, empty): - # GH 16991 - df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) - expected = DataFrame(False, df.index, df.columns) - - result = df.isin(empty) - tm.assert_frame_equal(result, expected) - - def test_isin_dict(self): - df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) - d = {"A": ["a"]} - - expected = DataFrame(False, df.index, df.columns) - expected.loc[0, "A"] = True - - result = df.isin(d) - tm.assert_frame_equal(result, expected) - - # non unique columns - df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) - df.columns = ["A", "A"] - expected = DataFrame(False, df.index, df.columns) - expected.loc[0, "A"] = True - result = df.isin(d) - tm.assert_frame_equal(result, expected) - - def test_isin_with_string_scalar(self): - # GH 4763 - df = DataFrame( - { - "vals": [1, 2, 3, 4], - "ids": ["a", "b", "f", "n"], - "ids2": ["a", "n", "c", "n"], - }, - index=["foo", "bar", "baz", "qux"], - ) - with pytest.raises(TypeError): - df.isin("a") - - with pytest.raises(TypeError): - df.isin("aaa") - - def test_isin_df(self): - df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) - df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) - expected = DataFrame(False, df1.index, df1.columns) - result = df1.isin(df2) - expected["A"].loc[[1, 3]] = True - expected["B"].loc[[0, 2]] = True - tm.assert_frame_equal(result, expected) - - # partial overlapping columns - df2.columns = ["A", "C"] - result = df1.isin(df2) - expected["B"] = False - tm.assert_frame_equal(result, expected) - - def test_isin_tuples(self): - # GH 16394 - df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) - df["C"] = list(zip(df["A"], df["B"])) - result = df["C"].isin([(1, "a")]) - tm.assert_series_equal(result, Series([True, False, False], name="C")) - - def test_isin_df_dupe_values(self): - df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) - # just cols duped - df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) - 
with pytest.raises(ValueError): - df1.isin(df2) - - # just index duped - df2 = DataFrame( - [[0, 2], [12, 4], [2, np.nan], [4, 5]], - columns=["A", "B"], - index=[0, 0, 1, 1], - ) - with pytest.raises(ValueError): - df1.isin(df2) - - # cols and index: - df2.columns = ["B", "B"] - with pytest.raises(ValueError): - df1.isin(df2) - - def test_isin_dupe_self(self): - other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) - df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) - result = df.isin(other) - expected = DataFrame(False, index=df.index, columns=df.columns) - expected.loc[0] = True - expected.iloc[1, 1] = True - tm.assert_frame_equal(result, expected) - - def test_isin_against_series(self): - df = pd.DataFrame( - {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] - ) - s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) - expected = DataFrame(False, index=df.index, columns=df.columns) - expected["A"].loc["a"] = True - expected.loc["d"] = True - result = df.isin(s) - tm.assert_frame_equal(result, expected) - - def test_isin_multiIndex(self): - idx = MultiIndex.from_tuples( - [ - (0, "a", "foo"), - (0, "a", "bar"), - (0, "b", "bar"), - (0, "b", "baz"), - (2, "a", "foo"), - (2, "a", "bar"), - (2, "c", "bar"), - (2, "c", "baz"), - (1, "b", "foo"), - (1, "b", "bar"), - (1, "c", "bar"), - (1, "c", "baz"), - ] - ) - df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) - df2 = DataFrame( - { - "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], - "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], - } - ) - # against regular index - expected = DataFrame(False, index=df1.index, columns=df1.columns) - result = df1.isin(df2) - tm.assert_frame_equal(result, expected) - - df2.index = idx - expected = df2.values.astype(np.bool) - expected[:, 1] = ~expected[:, 1] - expected = DataFrame(expected, columns=["A", "B"], index=idx) - - result = df1.isin(df2) - tm.assert_frame_equal(result, expected) - - def test_isin_empty_datetimelike(self): - # GH 15473 - df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) - df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) - df2 = DataFrame({"date": []}) - df3 = DataFrame() - - expected = DataFrame({"date": [False, False]}) - - result = df1_ts.isin(df2) - tm.assert_frame_equal(result, expected) - result = df1_ts.isin(df3) - tm.assert_frame_equal(result, expected) - - result = df1_td.isin(df2) - tm.assert_frame_equal(result, expected) - result = df1_td.isin(df3) - tm.assert_frame_equal(result, expected) - - # --------------------------------------------------------------------- - # Rounding - - def test_round(self): - # GH 2665 - - # Test that rounding an empty DataFrame does nothing - df = DataFrame() - tm.assert_frame_equal(df, df.round()) - - # Here's the test frame we'll be working with - df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) - - # Default round to integer (i.e. 
decimals=0) - expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) - tm.assert_frame_equal(df.round(), expected_rounded) - - # Round with an integer - decimals = 2 - expected_rounded = DataFrame( - {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} - ) - tm.assert_frame_equal(df.round(decimals), expected_rounded) - - # This should also work with np.round (since np.round dispatches to - # df.round) - tm.assert_frame_equal(np.round(df, decimals), expected_rounded) - - # Round with a list - round_list = [1, 2] - with pytest.raises(TypeError): - df.round(round_list) - - # Round with a dictionary - expected_rounded = DataFrame( - {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]} - ) - round_dict = {"col1": 1, "col2": 2} - tm.assert_frame_equal(df.round(round_dict), expected_rounded) - - # Incomplete dict - expected_partially_rounded = DataFrame( - {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} - ) - partial_round_dict = {"col2": 1} - tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded) - - # Dict with unknown elements - wrong_round_dict = {"col3": 2, "col2": 1} - tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded) - - # float input to `decimals` - non_int_round_dict = {"col1": 1, "col2": 0.5} - with pytest.raises(TypeError): - df.round(non_int_round_dict) - - # String input - non_int_round_dict = {"col1": 1, "col2": "foo"} - with pytest.raises(TypeError): - df.round(non_int_round_dict) - - non_int_round_Series = Series(non_int_round_dict) - with pytest.raises(TypeError): - df.round(non_int_round_Series) - - # List input - non_int_round_dict = {"col1": 1, "col2": [1, 2]} - with pytest.raises(TypeError): - df.round(non_int_round_dict) - - non_int_round_Series = Series(non_int_round_dict) - with pytest.raises(TypeError): - df.round(non_int_round_Series) - - # Non integer Series inputs - non_int_round_Series = Series(non_int_round_dict) - with pytest.raises(TypeError): - df.round(non_int_round_Series) - - non_int_round_Series = Series(non_int_round_dict) - with pytest.raises(TypeError): - df.round(non_int_round_Series) - - # Negative numbers - negative_round_dict = {"col1": -1, "col2": -2} - big_df = df * 100 - expected_neg_rounded = DataFrame( - {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]} - ) - tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded) - - # nan in Series round - nan_round_Series = Series({"col1": np.nan, "col2": 1}) - - # TODO(wesm): unused? 
- expected_nan_round = DataFrame( # noqa - {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} - ) - - with pytest.raises(TypeError): - df.round(nan_round_Series) - - # Make sure this doesn't break existing Series.round - tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"]) - - # named columns - # GH 11986 - decimals = 2 - expected_rounded = DataFrame( - {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} - ) - df.columns.name = "cols" - expected_rounded.columns.name = "cols" - tm.assert_frame_equal(df.round(decimals), expected_rounded) - - # interaction of named columns & series - tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"]) - tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"]) - - def test_numpy_round(self): - # GH 12600 - df = DataFrame([[1.53, 1.36], [0.06, 7.01]]) - out = np.round(df, decimals=0) - expected = DataFrame([[2.0, 1.0], [0.0, 7.0]]) - tm.assert_frame_equal(out, expected) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.round(df, decimals=0, out=df) - - def test_numpy_round_nan(self): - # See gh-14197 - df = Series([1.53, np.nan, 0.06]).to_frame() - with tm.assert_produces_warning(None): - result = df.round() - expected = Series([2.0, np.nan, 0.0]).to_frame() - tm.assert_frame_equal(result, expected) - - def test_round_mixed_type(self): - # GH 11885 - df = DataFrame( - { - "col1": [1.1, 2.2, 3.3, 4.4], - "col2": ["1", "a", "c", "f"], - "col3": date_range("20111111", periods=4), - } - ) - round_0 = DataFrame( - { - "col1": [1.0, 2.0, 3.0, 4.0], - "col2": ["1", "a", "c", "f"], - "col3": date_range("20111111", periods=4), - } - ) - tm.assert_frame_equal(df.round(), round_0) - tm.assert_frame_equal(df.round(1), df) - tm.assert_frame_equal(df.round({"col1": 1}), df) - tm.assert_frame_equal(df.round({"col1": 0}), round_0) - tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0) - tm.assert_frame_equal(df.round({"col3": 1}), df) - - def test_round_issue(self): - # GH 11611 - - df = pd.DataFrame( - np.random.random([3, 3]), - columns=["A", "B", "C"], - index=["first", "second", "third"], - ) - - dfs = pd.concat((df, df), axis=1) - rounded = dfs.round() - tm.assert_index_equal(rounded.index, dfs.index) - - decimals = pd.Series([1, 0, 2], index=["A", "B", "A"]) - msg = "Index of decimals must be unique" - with pytest.raises(ValueError, match=msg): - df.round(decimals) - - def test_built_in_round(self): - # GH 11763 - # Here's the test frame we'll be working with - df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) - - # Default round to integer (i.e. 
decimals=0) - expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) - tm.assert_frame_equal(round(df), expected_rounded) - - def test_round_nonunique_categorical(self): - # See GH21809 - idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3) - df = pd.DataFrame(np.random.rand(6, 3), columns=list("abc")) - - expected = df.round(3) - expected.index = idx - - df_categorical = df.copy().set_index(idx) - assert df_categorical.shape == (6, 3) - result = df_categorical.round(3) - assert result.shape == (6, 3) - - tm.assert_frame_equal(result, expected) - - # --------------------------------------------------------------------- - # Clip - - def test_clip(self, float_frame): - median = float_frame.median().median() - original = float_frame.copy() - - with tm.assert_produces_warning(FutureWarning): - capped = float_frame.clip_upper(median) - assert not (capped.values > median).any() - - with tm.assert_produces_warning(FutureWarning): - floored = float_frame.clip_lower(median) - assert not (floored.values < median).any() - - double = float_frame.clip(upper=median, lower=median) - assert not (double.values != median).any() - - # Verify that float_frame was not changed inplace - assert (float_frame.values == original.values).all() - - def test_inplace_clip(self, float_frame): - # GH 15388 - median = float_frame.median().median() - frame_copy = float_frame.copy() - - with tm.assert_produces_warning(FutureWarning): - frame_copy.clip_upper(median, inplace=True) - assert not (frame_copy.values > median).any() - frame_copy = float_frame.copy() - - with tm.assert_produces_warning(FutureWarning): - frame_copy.clip_lower(median, inplace=True) - assert not (frame_copy.values < median).any() - frame_copy = float_frame.copy() - - frame_copy.clip(upper=median, lower=median, inplace=True) - assert not (frame_copy.values != median).any() - - def test_dataframe_clip(self): - # GH 2747 - df = DataFrame(np.random.randn(1000, 2)) - - for lb, ub in [(-1, 1), (1, -1)]: - clipped_df = df.clip(lb, ub) - - lb, ub = min(lb, ub), max(ub, lb) - lb_mask = df.values <= lb - ub_mask = df.values >= ub - mask = ~lb_mask & ~ub_mask - assert (clipped_df.values[lb_mask] == lb).all() - assert (clipped_df.values[ub_mask] == ub).all() - assert (clipped_df.values[mask] == df.values[mask]).all() - - def test_clip_mixed_numeric(self): - # TODO(jreback) - # clip on mixed integer or floats - # with integer clippers coerces to float - df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]}) - result = df.clip(1, 2) - expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]}) - tm.assert_frame_equal(result, expected, check_like=True) - - # GH 24162, clipping now preserves numeric types per column - df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"]) - expected = df.dtypes - result = df.clip(upper=3).dtypes - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("inplace", [True, False]) - def test_clip_against_series(self, inplace): - # GH 6966 - - df = DataFrame(np.random.randn(1000, 2)) - lb = Series(np.random.randn(1000)) - ub = lb + 1 - - original = df.copy() - clipped_df = df.clip(lb, ub, axis=0, inplace=inplace) - - if inplace: - clipped_df = df - - for i in range(2): - lb_mask = original.iloc[:, i] <= lb - ub_mask = original.iloc[:, i] >= ub - mask = ~lb_mask & ~ub_mask - - result = clipped_df.loc[lb_mask, i] - tm.assert_series_equal(result, lb[lb_mask], check_names=False) - assert result.name == i - - result = clipped_df.loc[ub_mask, i] - tm.assert_series_equal(result, 
ub[ub_mask], check_names=False) - assert result.name == i - - tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) - - @pytest.mark.parametrize("inplace", [True, False]) - @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])]) - @pytest.mark.parametrize( - "axis,res", - [ - (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]), - (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), - ], - ) - def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): - # GH 15390 - original = simple_frame.copy(deep=True) - - result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) - - expected = pd.DataFrame(res, columns=original.columns, index=original.index) - if inplace: - result = original - tm.assert_frame_equal(result, expected, check_exact=True) - - @pytest.mark.parametrize("axis", [0, 1, None]) - def test_clip_against_frame(self, axis): - df = DataFrame(np.random.randn(1000, 2)) - lb = DataFrame(np.random.randn(1000, 2)) - ub = lb + 1 - - clipped_df = df.clip(lb, ub, axis=axis) - - lb_mask = df <= lb - ub_mask = df >= ub - mask = ~lb_mask & ~ub_mask - - tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) - tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) - tm.assert_frame_equal(clipped_df[mask], df[mask]) - - def test_clip_against_unordered_columns(self): - # GH 20911 - df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"]) - df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"]) - df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"]) - result_upper = df1.clip(lower=0, upper=df2) - expected_upper = df1.clip(lower=0, upper=df2[df1.columns]) - result_lower = df1.clip(lower=df3, upper=3) - expected_lower = df1.clip(lower=df3[df1.columns], upper=3) - result_lower_upper = df1.clip(lower=df3, upper=df2) - expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns]) - tm.assert_frame_equal(result_upper, expected_upper) - tm.assert_frame_equal(result_lower, expected_lower) - tm.assert_frame_equal(result_lower_upper, expected_lower_upper) - - def test_clip_with_na_args(self, float_frame): - """Should process np.nan argument as None """ - # GH 17276 - tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) - tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) - - # GH 19992 - df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) - - result = df.clip(lower=[4, 5, np.nan], axis=0) - expected = DataFrame( - {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} - ) - tm.assert_frame_equal(result, expected) - - result = df.clip(lower=[4, 5, np.nan], axis=1) - expected = DataFrame( - {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} - ) - tm.assert_frame_equal(result, expected) - # --------------------------------------------------------------------- # Matrix-like @@ -2533,198 +1245,8 @@ def test_matmul(self): with pytest.raises(ValueError, match="aligned"): operator.matmul(df, df2) - -@pytest.fixture -def df_duplicates(): - return pd.DataFrame( - {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, - index=[0, 0, 1, 1, 1], - ) - - -@pytest.fixture -def df_strings(): - return pd.DataFrame( - { - "a": np.random.permutation(10), - "b": list(ascii_lowercase[:10]), - "c": np.random.permutation(10).astype("float64"), - } - ) - - -@pytest.fixture -def df_main_dtypes(): - return pd.DataFrame( - { - "group": [1, 1, 2], - "int": [1, 2, 3], - "float": [4.0, 
5.0, 6.0], - "string": list("abc"), - "category_string": pd.Series(list("abc")).astype("category"), - "category_int": [7, 8, 9], - "datetime": pd.date_range("20130101", periods=3), - "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), - "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), - }, - columns=[ - "group", - "int", - "float", - "string", - "category_string", - "category_int", - "datetime", - "datetimetz", - "timedelta", - ], - ) - - -class TestNLargestNSmallest: - - dtype_error_msg_template = ( - "Column {column!r} has dtype {dtype}, cannot " - "use method {method!r} with this dtype" - ) - - # ---------------------------------------------------------------------- - # Top / bottom - @pytest.mark.parametrize( - "order", - [ - ["a"], - ["c"], - ["a", "b"], - ["a", "c"], - ["b", "a"], - ["b", "c"], - ["a", "b", "c"], - ["c", "a", "b"], - ["c", "b", "a"], - ["b", "c", "a"], - ["b", "a", "c"], - # dups! - ["b", "c", "c"], - ], - ) - @pytest.mark.parametrize("n", range(1, 11)) - def test_n(self, df_strings, nselect_method, n, order): - # GH 10393 - df = df_strings - if "b" in order: - - error_msg = self.dtype_error_msg_template.format( - column="b", method=nselect_method, dtype="object" - ) - with pytest.raises(TypeError, match=error_msg): - getattr(df, nselect_method)(n, order) - else: - ascending = nselect_method == "nsmallest" - result = getattr(df, nselect_method)(n, order) - expected = df.sort_values(order, ascending=ascending).head(n) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "columns", [["group", "category_string"], ["group", "string"]] - ) - def test_n_error(self, df_main_dtypes, nselect_method, columns): - df = df_main_dtypes - col = columns[1] - error_msg = self.dtype_error_msg_template.format( - column=col, method=nselect_method, dtype=df[col].dtype - ) - # escape some characters that may be in the repr - error_msg = ( - error_msg.replace("(", "\\(") - .replace(")", "\\)") - .replace("[", "\\[") - .replace("]", "\\]") - ) - with pytest.raises(TypeError, match=error_msg): - getattr(df, nselect_method)(2, columns) - - def test_n_all_dtypes(self, df_main_dtypes): - df = df_main_dtypes - df.nsmallest(2, list(set(df) - {"category_string", "string"})) - df.nlargest(2, list(set(df) - {"category_string", "string"})) - - @pytest.mark.parametrize( - "method,expected", - [ - ( - "nlargest", - pd.DataFrame( - {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3] - ), - ), - ( - "nsmallest", - pd.DataFrame( - {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0] - ), - ), - ], - ) - def test_duplicates_on_starter_columns(self, method, expected): - # regression test for #22752 - - df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) - - result = getattr(df, method)(4, columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - - def test_n_identical_values(self): - # GH 15297 - df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) - - result = df.nlargest(3, "a") - expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) - tm.assert_frame_equal(result, expected) - - result = df.nsmallest(3, "a") - expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "order", - [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], - ) - @pytest.mark.parametrize("n", range(1, 6)) - def test_n_duplicate_index(self, df_duplicates, n, order): - # GH 13412 - - df = df_duplicates - result = df.nsmallest(n, order) 
- expected = df.sort_values(order).head(n) - tm.assert_frame_equal(result, expected) - - result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False).head(n) - tm.assert_frame_equal(result, expected) - - def test_duplicate_keep_all_ties(self): - # GH 16818 - df = pd.DataFrame( - {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} - ) - result = df.nlargest(4, "a", keep="all") - expected = pd.DataFrame( - { - "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, - } - ) - tm.assert_frame_equal(result, expected) - - result = df.nsmallest(2, "a", keep="all") - expected = pd.DataFrame( - { - "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, - "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, - } - ) - tm.assert_frame_equal(result, expected) + # --------------------------------------------------------------------- + # Unsorted def test_series_broadcasting(self): # smoke test for numpy warnings @@ -2735,34 +1257,6 @@ def test_series_broadcasting(self): s_nan = Series([np.nan, np.nan, 1]) with tm.assert_produces_warning(None): - with tm.assert_produces_warning(FutureWarning): - df_nan.clip_lower(s, axis=0) + df_nan.clip(lower=s, axis=0) for op in ["lt", "le", "gt", "ge", "eq", "ne"]: getattr(df, op)(s_nan, axis=0) - - def test_series_nat_conversion(self): - # GH 18521 - # Check rank does not mutate DataFrame - df = DataFrame(np.random.randn(10, 3), dtype="float64") - expected = df.copy() - df.rank() - result = df - tm.assert_frame_equal(result, expected) - - def test_multiindex_column_lookup(self): - # Check whether tuples are correctly treated as multi-level lookups. - # GH 23033 - df = pd.DataFrame( - columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]), - data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]], - ) - - # nsmallest - result = df.nsmallest(3, ("x", "a")) - expected = df.iloc[[2, 0, 3]] - tm.assert_frame_equal(result, expected) - - # nlargest - result = df.nlargest(3, ("x", "b")) - expected = df.iloc[[3, 2, 1]] - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 50b1dec21c549..9263409f7a7f8 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,36 +5,15 @@ import numpy as np import pytest +from pandas.compat import PY37 +from pandas.util._test_decorators import async_mark + import pandas as pd -from pandas import ( - Categorical, - DataFrame, - Series, - SparseDtype, - compat, - date_range, - timedelta_range, -) -import pandas.util.testing as tm - - -class SharedWithSparse: - """ - A collection of tests DataFrame and SparseDataFrame can share. - - In generic tests on this class, use ``self._assert_frame_equal()`` and - ``self._assert_series_equal()`` which are implemented in sub-classes - and dispatch correctly. 
- """ - - def _assert_frame_equal(self, left, right): - """Dispatch to frame class dependent assertion""" - raise NotImplementedError - - def _assert_series_equal(self, left, right): - """Dispatch to series class dependent assertion""" - raise NotImplementedError +from pandas import Categorical, DataFrame, Series, compat, date_range, timedelta_range +import pandas._testing as tm + +class TestDataFrameMisc: def test_copy_index_name_checking(self, float_frame): # don't want to be able to modify the index stored elsewhere after # making a copy @@ -141,16 +120,16 @@ def test_tab_completion(self): def test_not_hashable(self): empty_frame = DataFrame() - df = self.klass([1]) - msg = "'(Sparse)?DataFrame' objects are mutable, thus they cannot be hashed" + df = DataFrame([1]) + msg = "'DataFrame' objects are mutable, thus they cannot be hashed" with pytest.raises(TypeError, match=msg): hash(df) with pytest.raises(TypeError, match=msg): hash(empty_frame) def test_new_empty_index(self): - df1 = self.klass(np.random.randn(0, 3)) - df2 = self.klass(np.random.randn(0, 3)) + df1 = DataFrame(np.random.randn(0, 3)) + df2 = DataFrame(np.random.randn(0, 3)) df1.index.name = "foo" assert df2.index.name is None @@ -161,7 +140,7 @@ def test_array_interface(self, float_frame): assert result.index is float_frame.index assert result.columns is float_frame.columns - self._assert_frame_equal(result, float_frame.apply(np.sqrt)) + tm.assert_frame_equal(result, float_frame.apply(np.sqrt)) def test_get_agg_axis(self, float_frame): cols = float_frame._get_agg_axis(0) @@ -187,9 +166,9 @@ def test_nonzero(self, float_frame, float_string_frame): assert not df.empty def test_iteritems(self): - df = self.klass([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) for k, v in df.items(): - assert isinstance(v, self.klass._constructor_sliced) + assert isinstance(v, DataFrame._constructor_sliced) def test_items(self): # GH 17213, GH 13918 @@ -206,15 +185,15 @@ def test_iter(self, float_frame): def test_iterrows(self, float_frame, float_string_frame): for k, v in float_frame.iterrows(): exp = float_frame.loc[k] - self._assert_series_equal(v, exp) + tm.assert_series_equal(v, exp) for k, v in float_string_frame.iterrows(): exp = float_string_frame.loc[k] - self._assert_series_equal(v, exp) + tm.assert_series_equal(v, exp) def test_iterrows_iso8601(self): # GH 19671 - s = self.klass( + s = DataFrame( { "non_iso8601": ["M1701", "M1802", "M1903", "M2004"], "iso8601": date_range("2000-01-01", periods=4, freq="M"), @@ -222,7 +201,7 @@ def test_iterrows_iso8601(self): ) for k, v in s.iterrows(): exp = s.loc[k] - self._assert_series_equal(v, exp) + tm.assert_series_equal(v, exp) def test_iterrows_corner(self): # gh-12222 @@ -248,19 +227,19 @@ def test_iterrows_corner(self): def test_itertuples(self, float_frame): for i, tup in enumerate(float_frame.itertuples()): - s = self.klass._constructor_sliced(tup[1:]) + s = DataFrame._constructor_sliced(tup[1:]) s.name = tup[0] expected = float_frame.iloc[i, :].reset_index(drop=True) - self._assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) - df = self.klass( + df = DataFrame( {"floats": np.random.randn(5), "ints": range(5)}, columns=["floats", "ints"] ) for tup in df.itertuples(index=False): assert isinstance(tup[1], int) - df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]}) + df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]}) dfaa = df[["a", "a"]] assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 
3, 3)] @@ -285,8 +264,27 @@ def test_itertuples(self, float_frame): df3 = DataFrame({"f" + str(i): [i] for i in range(1024)}) # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) - assert not hasattr(tup3, "_fields") assert isinstance(tup3, tuple) + if PY37: + assert hasattr(tup3, "_fields") + else: + assert not hasattr(tup3, "_fields") + + # GH 28282 + df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) + result_254_columns = next(df_254_columns.itertuples(index=False)) + assert isinstance(result_254_columns, tuple) + assert hasattr(result_254_columns, "_fields") + + df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) + result_255_columns = next(df_255_columns.itertuples(index=False)) + assert isinstance(result_255_columns, tuple) + + # Dataframes with >=255 columns will fallback to regular tuples on python < 3.7 + if PY37: + assert hasattr(result_255_columns, "_fields") + else: + assert not hasattr(result_255_columns, "_fields") def test_sequence_like_with_categorical(self): @@ -315,7 +313,7 @@ def test_sequence_like_with_categorical(self): def test_len(self, float_frame): assert len(float_frame) == len(float_frame.index) - def test_values(self, float_frame, float_string_frame): + def test_values_mixed_dtypes(self, float_frame, float_string_frame): frame = float_frame arr = frame.values @@ -332,7 +330,7 @@ def test_values(self, float_frame, float_string_frame): arr = float_string_frame[["foo", "A"]].values assert arr[0, 0] == "bar" - df = self.klass({"complex": [1j, 2j, 3j], "real": [1, 2, 3]}) + df = DataFrame({"complex": [1j, 2j, 3j], "real": [1, 2, 3]}) arr = df.values assert arr[0, 0] == 1j @@ -372,17 +370,17 @@ def test_transpose(self, float_frame): # mixed type index, data = tm.getMixedTypeDict() - mixed = self.klass(data, index=index) + mixed = DataFrame(data, index=index) mixed_T = mixed.T for col, s in mixed_T.items(): assert s.dtype == np.object_ def test_swapaxes(self): - df = self.klass(np.random.randn(10, 5)) - self._assert_frame_equal(df.T, df.swapaxes(0, 1)) - self._assert_frame_equal(df.T, df.swapaxes(1, 0)) - self._assert_frame_equal(df, df.swapaxes(0, 0)) + df = DataFrame(np.random.randn(10, 5)) + tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) + tm.assert_frame_equal(df.T, df.swapaxes(1, 0)) + tm.assert_frame_equal(df, df.swapaxes(0, 0)) msg = ( "No axis named 2 for object type" r" " @@ -413,7 +411,7 @@ def test_more_values(self, float_string_frame): assert values.shape[1] == len(float_string_frame.columns) def test_repr_with_mi_nat(self, float_string_frame): - df = self.klass( + df = DataFrame( {"X": [1, 2]}, index=[[pd.NaT, pd.Timestamp("20130101")], ["a", "b"]] ) result = repr(df) @@ -430,18 +428,18 @@ def test_series_put_names(self, float_string_frame): assert v.name == k def test_empty_nonzero(self): - df = self.klass([1, 2, 3]) + df = DataFrame([1, 2, 3]) assert not df.empty - df = self.klass(index=[1], columns=[1]) + df = DataFrame(index=[1], columns=[1]) assert not df.empty - df = self.klass(index=["a", "b"], columns=["c", "d"]).dropna() + df = DataFrame(index=["a", "b"], columns=["c", "d"]).dropna() assert df.empty assert df.T.empty empty_frames = [ - self.klass(), - self.klass(index=[1]), - self.klass(columns=[1]), - self.klass({1: []}), + DataFrame(), + DataFrame(index=[1]), + DataFrame(columns=[1]), + DataFrame({1: []}), ] for df in empty_frames: assert df.empty @@ -449,7 +447,7 @@ def test_empty_nonzero(self): def test_with_datetimelikes(self): - df = self.klass( + df = 
DataFrame( { "A": date_range("20130101", periods=10), "B": timedelta_range("1 day", periods=10), @@ -458,32 +456,13 @@ def test_with_datetimelikes(self): t = df.T result = t.dtypes.value_counts() - if self.klass is DataFrame: - expected = Series({np.dtype("object"): 10}) - else: - expected = Series({SparseDtype(dtype=object): 10}) + expected = Series({np.dtype("object"): 10}) tm.assert_series_equal(result, expected) - -class TestDataFrameMisc(SharedWithSparse): - - klass = DataFrame - # SharedWithSparse tests use generic, klass-agnostic assertion - _assert_frame_equal = staticmethod(tm.assert_frame_equal) - _assert_series_equal = staticmethod(tm.assert_series_equal) - def test_values(self, float_frame): float_frame.values[:, 0] = 5.0 assert (float_frame.values[:, 0] == 5).all() - def test_as_matrix_deprecated(self, float_frame): - # GH 18458 - with tm.assert_produces_warning(FutureWarning): - cols = float_frame.columns.tolist() - result = float_frame.as_matrix(columns=cols) - expected = float_frame.values - tm.assert_numpy_array_equal(result, expected) - def test_deepcopy(self, float_frame): cp = deepcopy(float_frame) series = cp["A"] @@ -561,19 +540,22 @@ def _check_f(base, f): f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f) - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # GH 16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; df = pd.DataFrame()" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("df.", 1)) - def test_get_values_deprecated(self): - df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]}) - with tm.assert_produces_warning(FutureWarning): - res = df.get_values() - tm.assert_numpy_array_equal(res, df.values) + def test_attrs(self): + df = pd.DataFrame({"A": [2, 3]}) + assert df.attrs == {} + df.attrs["version"] = 1 + + result = df.rename(columns=str) + assert result.attrs == {"version": 1} diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 16d17b04423b7..e98f74e133ea9 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -11,9 +11,10 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, notna +import pandas._testing as tm from pandas.conftest import _get_cython_table_params from pandas.core.apply import frame_apply -import pandas.util.testing as tm +from pandas.core.base import SpecificationError @pytest.fixture @@ -104,13 +105,15 @@ def test_apply_with_reduce_empty(self): result = empty_frame.apply(x.append, axis=1, result_type="expand") tm.assert_frame_equal(result, empty_frame) result = empty_frame.apply(x.append, axis=1, result_type="reduce") - tm.assert_series_equal(result, Series([], index=pd.Index([], dtype=object))) + expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) + tm.assert_series_equal(result, expected) empty_with_cols = DataFrame(columns=["a", "b", "c"]) result = empty_with_cols.apply(x.append, axis=1, result_type="expand") tm.assert_frame_equal(result, empty_with_cols) result = empty_with_cols.apply(x.append, axis=1, result_type="reduce") - tm.assert_series_equal(result, Series([], index=pd.Index([], dtype=object))) + expected = Series([], index=pd.Index([], dtype=object), dtype=np.float64) + tm.assert_series_equal(result, expected) # Ensure that x.append hasn't 
been called assert x == [] @@ -133,7 +136,7 @@ def test_nunique_empty(self): tm.assert_series_equal(result, expected) result = df.T.nunique() - expected = Series([], index=pd.Index([])) + expected = Series([], index=pd.Index([]), dtype=np.float64) tm.assert_series_equal(result, expected) def test_apply_standard_nonunique(self): @@ -423,12 +426,9 @@ def transform2(row): row["D"] = 7 return row - try: + msg = "'float' object has no attribute 'startswith'" + with pytest.raises(AttributeError, match=msg): data.apply(transform, axis=1) - except AttributeError as e: - assert len(e.args) == 2 - assert e.args[1] == "occurred at index 4" - assert e.args[0] == "'float' object has no attribute 'startswith'" def test_apply_bug(self): @@ -644,7 +644,7 @@ def test_applymap_box(self): } ) - result = df.applymap(lambda x: "{0}".format(x.__class__.__name__)) + result = df.applymap(lambda x: type(x).__name__) expected = pd.DataFrame( { "a": ["Timestamp", "Timestamp"], @@ -691,6 +691,18 @@ def test_apply_dup_names_multi_agg(self): tm.assert_frame_equal(result, expected) + def test_apply_nested_result_axis_1(self): + # GH 13820 + def apply_list(row): + return [2 * row["A"], 2 * row["C"], 2 * row["B"]] + + df = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCD")) + result = df.apply(apply_list, axis=1) + expected = Series( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] + ) + tm.assert_series_equal(result, expected) + class TestInferOutputShape: # the user has supplied an opaque UDF where @@ -1097,7 +1109,8 @@ def test_agg_dict_nested_renaming_depr(self): df = pd.DataFrame({"A": range(5), "B": 5}) # nested renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) def test_agg_reduce(self, axis, float_frame): @@ -1262,22 +1275,39 @@ def test_non_callable_aggregates(self): assert result == expected + def test_agg_listlike_result(self): + # GH-29587 user defined function returning list-likes + df = DataFrame( + {"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]} + ) + + def func(group_col): + return list(group_col.dropna().unique()) + + result = df.agg(func) + expected = pd.Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"]) + tm.assert_series_equal(result, expected) + + result = df.agg([func]) + expected = expected.to_frame("func").T + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "df, func, expected", chain( _get_cython_table_params( DataFrame(), [ - ("sum", Series()), - ("max", Series()), - ("min", Series()), + ("sum", Series(dtype="float64")), + ("max", Series(dtype="float64")), + ("min", Series(dtype="float64")), ("all", Series(dtype=bool)), ("any", Series(dtype=bool)), - ("mean", Series()), - ("prod", Series()), - ("std", Series()), - ("var", Series()), - ("median", Series()), + ("mean", Series(dtype="float64")), + ("prod", Series(dtype="float64")), + ("std", Series(dtype="float64")), + ("var", Series(dtype="float64")), + ("median", Series(dtype="float64")), ], ), _get_cython_table_params( @@ -1313,8 +1343,8 @@ def test_agg_cython_table(self, df, func, expected, axis): _get_cython_table_params( DataFrame([[np.nan, 1], [1, 2]]), [ - ("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])), - ("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])), + ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), + ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), ], ), ), @@ -1323,6 +1353,10 @@ def 
test_agg_cython_table_transform(self, df, func, expected, axis): # GH 21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if axis == "columns" or axis == 1: + # operating blockwise doesn't let us preserve dtypes + expected = expected.astype("float64") + result = df.agg(func, axis=axis) tm.assert_frame_equal(result, expected) @@ -1359,3 +1393,14 @@ def test_apply_datetime_tz_issue(self): expected = pd.Series(index=timestamps, data=timestamps) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("df", [pd.DataFrame({"A": ["a", None], "B": ["c", "d"]})]) + @pytest.mark.parametrize("method", ["min", "max", "sum"]) + def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, method): + # GH 16832 + none_in_first_column_result = getattr(df[["A", "B"]], method)() + none_in_second_column_result = getattr(df[["B", "A"]], method)() + + tm.assert_series_equal( + none_in_first_column_result, none_in_second_column_result + ) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 88bd5a4fedfae..659b55756c4b6 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -6,8 +6,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int -import pandas.util.testing as tm # ------------------------------------------------------------------- # Comparisons @@ -16,6 +16,13 @@ class TestFrameComparisons: # Specifically _not_ flex-comparisons + def test_frame_in_list(self): + # GH#12689 this should raise at the DataFrame level, not blocks + df = pd.DataFrame(np.random.randn(6, 4), columns=list("ABCD")) + msg = "The truth value of a DataFrame is ambiguous" + with pytest.raises(ValueError, match=msg): + df in [None] + def test_comparison_invalid(self): def check(df, df2): @@ -470,7 +477,7 @@ def test_arith_flex_series(self, simple_frame): def test_arith_flex_zero_len_raises(self): # GH 19522 passing fill_value to frame flex arith methods should # raise even in the zero-length special cases - ser_len0 = pd.Series([]) + ser_len0 = pd.Series([], dtype=object) df_len0 = pd.DataFrame(columns=["A", "B"]) df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) @@ -719,3 +726,14 @@ def test_zero_len_frame_with_series_corner_cases(): result = df + ser expected = df tm.assert_frame_equal(result, expected) + + +def test_frame_single_columns_object_sum_axis_1(): + # GH 13758 + data = { + "One": pd.Series(["A", 1.2, np.nan]), + } + df = pd.DataFrame(data) + result = df.sum(axis=1) + expected = pd.Series(["A", 1.2, 0]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 16dfae847e0eb..7effa98fd8213 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSelectReindex: @@ -929,6 +929,14 @@ def test_filter_corner(self): result = empty.filter(like="foo") tm.assert_frame_equal(result, empty) + def test_filter_regex_non_string(self): + # GH#5798 trying to filter on non-string columns should drop, + # not raise + df = pd.DataFrame(np.random.random((3, 2)), columns=["STRING", 123]) + result = df.filter(regex="STRING") + 
expected = df[["STRING"]] + tm.assert_frame_equal(result, expected) + def test_take(self, float_frame): # homogeneous order = [3, 1, 2, 0] diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index b45c074f179a0..d301ed969789e 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -15,10 +15,10 @@ date_range, option_context, ) +import pandas._testing as tm from pandas.core.arrays import IntervalArray, integer_array from pandas.core.internals import ObjectBlock from pandas.core.internals.blocks import IntBlock -import pandas.util.testing as tm # Segregated collection of methods that require the BlockManager internal data # structure @@ -313,10 +313,7 @@ def test_copy_blocks(self, float_frame): column = df.columns[0] # use the default copy=True, change a column - - # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - blocks = df.as_blocks() + blocks = df._to_dict_of_blocks(copy=True) for dtype, _df in blocks.items(): if column in _df: _df.loc[:, column] = _df[column] + 1 @@ -330,10 +327,7 @@ def test_no_copy_blocks(self, float_frame): column = df.columns[0] # use the copy=False, change a column - - # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - blocks = df.as_blocks(copy=False) + blocks = df._to_dict_of_blocks(copy=False) for dtype, _df in blocks.items(): if column in _df: _df.loc[:, column] = _df[column] + 1 @@ -591,10 +585,6 @@ def test_strange_column_corruption_issue(self): df = DataFrame(index=[0, 1]) df[0] = np.nan wasCol = {} - # uncommenting these makes the results match - # for col in xrange(100, 200): - # wasCol[col] = 1 - # df[col] = np.nan for i, dt in enumerate(df.index): for col in range(100, 200): @@ -621,12 +611,12 @@ def test_constructor_no_pandas_array(self): def test_add_column_with_pandas_array(self): # GH 26390 df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - df["c"] = pd.array([1, 2, None, 3]) + df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)) df2 = pd.DataFrame( { "a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], - "c": pd.array([1, 2, None, 3]), + "c": pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)), } ) assert type(df["c"]._data.blocks[0]) == ObjectBlock diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index e3f37e1ef3186..9bad54b051d6c 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameConcatCommon: @@ -128,152 +128,6 @@ def test_concat_tuple_keys(self): ) tm.assert_frame_equal(results, expected) - def test_append_series_dict(self): - df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) - - series = df.loc[4] - msg = "Indexes have overlapping values" - with pytest.raises(ValueError, match=msg): - df.append(series, verify_integrity=True) - - series.name = None - msg = "Can only append a Series if ignore_index=True" - with pytest.raises(TypeError, match=msg): - df.append(series, verify_integrity=True) - - result = df.append(series[::-1], ignore_index=True) - expected = df.append( - DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True - ) - tm.assert_frame_equal(result, expected) - - # dict - result = 
df.append(series.to_dict(), ignore_index=True) - tm.assert_frame_equal(result, expected) - - result = df.append(series[::-1][:3], ignore_index=True) - expected = df.append( - DataFrame({0: series[::-1][:3]}).T, ignore_index=True, sort=True - ) - tm.assert_frame_equal(result, expected.loc[:, result.columns]) - - # can append when name set - row = df.loc[4] - row.name = 5 - result = df.append(row) - expected = df.append(df[-1:], ignore_index=True) - tm.assert_frame_equal(result, expected) - - def test_append_list_of_series_dicts(self): - df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) - - dicts = [x.to_dict() for idx, x in df.iterrows()] - - result = df.append(dicts, ignore_index=True) - expected = df.append(df, ignore_index=True) - tm.assert_frame_equal(result, expected) - - # different columns - dicts = [ - {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4}, - {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8}, - ] - result = df.append(dicts, ignore_index=True, sort=True) - expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) - tm.assert_frame_equal(result, expected) - - def test_append_missing_cols(self): - # GH22252 - # exercise the conditional branch in append method where the data - # to be appended is a list and does not contain all columns that are in - # the target DataFrame - df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) - - dicts = [{"foo": 9}, {"bar": 10}] - with tm.assert_produces_warning(None): - result = df.append(dicts, ignore_index=True, sort=True) - - expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) - tm.assert_frame_equal(result, expected) - - def test_append_empty_dataframe(self): - - # Empty df append empty df - df1 = DataFrame() - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - # Non-empty df append empty df - df1 = DataFrame(np.random.randn(5, 2)) - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - # Empty df with columns append empty df - df1 = DataFrame(columns=["bar", "foo"]) - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - # Non-Empty df with columns append empty df - df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]) - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - def test_append_dtypes(self): - - # GH 5754 - # row appends of different dtypes (so need to do by-item) - # can sometimes infer the correct type - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5)) - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) - df2 = DataFrame({"bar": "foo"}, index=range(1, 2)) - result = df1.append(df2) - expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]}) - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) - df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) - result = df1.append(df2) - expected = DataFrame( - {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} - ) - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) - df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) - result = df1.append(df2) - expected = DataFrame( - {"bar": 
Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} - ) - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": np.nan}, index=range(1)) - df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) - result = df1.append(df2) - expected = DataFrame( - {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} - ) - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) - df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object) - result = df1.append(df2) - expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])}) - tm.assert_frame_equal(result, expected) - def test_update(self): df = DataFrame( [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] @@ -355,13 +209,6 @@ def test_update_raise_on_overlap(self): with pytest.raises(ValueError, match="Data overlaps"): df.update(other, errors="raise") - @pytest.mark.parametrize("raise_conflict", [True, False]) - def test_update_deprecation(self, raise_conflict): - df = DataFrame([[1.5, 1, 3.0]]) - other = DataFrame() - with tm.assert_produces_warning(FutureWarning): - df.update(other, raise_conflict=raise_conflict) - def test_update_from_non_df(self): d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} df = DataFrame(d) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index aa00cf234d9ee..ea1e339f44d93 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1,5 +1,5 @@ from collections import OrderedDict, abc -from datetime import datetime, timedelta +from datetime import date, datetime, timedelta import functools import itertools @@ -8,9 +8,8 @@ import numpy.ma.mrecords as mrecords import pytest -from pandas.compat import PY36, is_platform_little_endian +from pandas.compat import is_platform_little_endian -from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -26,7 +25,9 @@ date_range, isna, ) -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.arrays import IntervalArray, PeriodArray, SparseArray +from pandas.core.construction import create_series_with_explicit_dtype MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] MIXED_INT_DTYPES = [ @@ -42,6 +43,19 @@ class TestDataFrameConstructors: + def test_series_with_name_not_matching_column(self): + # GH#9232 + x = pd.Series(range(5), name=1) + y = pd.Series(range(5), name=0) + + result = pd.DataFrame(x, columns=[0]) + expected = pd.DataFrame([], columns=[0]) + tm.assert_frame_equal(result, expected) + + result = pd.DataFrame(y, columns=[1]) + expected = pd.DataFrame([], columns=[1]) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "constructor", [ @@ -245,9 +259,9 @@ def test_constructor_overflow_int64(self): np.array([2 ** 64], dtype=object), np.array([2 ** 65]), [2 ** 64 + 1], - np.array([-2 ** 63 - 4], dtype=object), - np.array([-2 ** 64 - 1]), - [-2 ** 65 - 2], + np.array([-(2 ** 63) - 4], dtype=object), + np.array([-(2 ** 64) - 1]), + [-(2 ** 65) - 2], ], ) def test_constructor_int_overflow(self, values): @@ -387,7 +401,6 @@ def test_constructor_dict_nan_tuple_key(self, value): result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY36, reason="Insertion order for Python>=3.6") def test_constructor_dict_order_insertion(self): datetime_series = tm.makeTimeSeries(nper=30) 
datetime_series_short = tm.makeTimeSeries(nper=25) @@ -399,18 +412,6 @@ def test_constructor_dict_order_insertion(self): expected = DataFrame(data=d, columns=list("ba")) tm.assert_frame_equal(frame, expected) - @pytest.mark.skipif(PY36, reason="order by value for Python<3.6") - def test_constructor_dict_order_by_values(self): - datetime_series = tm.makeTimeSeries(nper=30) - datetime_series_short = tm.makeTimeSeries(nper=25) - - # GH19018 - # initialization ordering: by value if python<3.6 - d = {"b": datetime_series_short, "a": datetime_series} - frame = DataFrame(data=d) - expected = DataFrame(data=d, columns=list("ab")) - tm.assert_frame_equal(frame, expected) - def test_constructor_multi_index(self): # GH 4078 # construction error with mi and all-nan frame @@ -478,11 +479,11 @@ def test_constructor_error_msgs(self): DataFrame(np.zeros((3, 3, 3)), columns=["A", "B", "C"], index=[1]) # wrong size axis labels - msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(1, 3\)" + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=["A", "B", "C"], index=[1]) - msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(2, 2\)" + msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)" with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=["A", "B"], index=[1, 2]) @@ -510,17 +511,17 @@ def test_constructor_with_embedded_frames(self): result = df2.loc[1, 0] tm.assert_frame_equal(result, df1 + 10) - def test_constructor_subclass_dict(self, float_frame): + def test_constructor_subclass_dict(self, float_frame, dict_subclass): # Test for passing dict subclass to constructor data = { - "col1": tm.TestSubDict((x, 10.0 * x) for x in range(10)), - "col2": tm.TestSubDict((x, 20.0 * x) for x in range(10)), + "col1": dict_subclass((x, 10.0 * x) for x in range(10)), + "col2": dict_subclass((x, 20.0 * x) for x in range(10)), } df = DataFrame(data) refdf = DataFrame({col: dict(val.items()) for col, val in data.items()}) tm.assert_frame_equal(refdf, df) - data = tm.TestSubDict(data.items()) + data = dict_subclass(data.items()) df = DataFrame(data) tm.assert_frame_equal(refdf, df) @@ -1230,7 +1231,9 @@ def test_constructor_list_of_series(self): OrderedDict([["a", 1.5], ["b", 3], ["c", 4]]), OrderedDict([["b", 3], ["c", 4], ["d", 6]]), ] - data = [Series(d) for d in data] + data = [ + create_series_with_explicit_dtype(d, dtype_if_empty=object) for d in data + ] result = DataFrame(data) sdict = OrderedDict(zip(range(len(data)), data)) @@ -1240,7 +1243,7 @@ def test_constructor_list_of_series(self): result2 = DataFrame(data, index=np.arange(6)) tm.assert_frame_equal(result, result2) - result = DataFrame([Series()]) + result = DataFrame([Series(dtype=object)]) expected = DataFrame(index=[0]) tm.assert_frame_equal(result, expected) @@ -1373,7 +1376,7 @@ def test_constructor_list_of_dict_order(self): } ) result = DataFrame(data) - tm.assert_frame_equal(result, expected, check_like=not PY36) + tm.assert_frame_equal(result, expected) def test_constructor_orient(self, float_string_frame): data_dict = float_string_frame.T._series @@ -1424,6 +1427,23 @@ def test_from_dict_columns_parameter(self): dict([("A", [1, 2]), ("B", [4, 5])]), columns=["one", "two"] ) + @pytest.mark.parametrize( + "data_dict, keys", + [ + ([{("a",): 1}, {("a",): 2}], [("a",)]), + ([OrderedDict([(("a",), 1), (("b",), 2)])], [("a",), ("b",)]), + ([{("a", "b"): 1}], [("a", "b")]), + ], + ) + def 
test_constructor_from_dict_tuples(self, data_dict, keys): + # GH 16769 + df = DataFrame.from_dict(data_dict) + + result = df.columns + expected = Index(keys, dtype="object", tupleize_cols=False) + + tm.assert_index_equal(result, expected) + def test_constructor_Series_named(self): a = Series([1, 2, 3], index=["a", "b", "c"], name="x") df = DataFrame(a) @@ -1447,7 +1467,7 @@ def test_constructor_Series_named(self): DataFrame(s, columns=[1, 2]) # #2234 - a = Series([], name="x") + a = Series([], name="x", dtype=object) df = DataFrame(a) assert df.columns[0] == "x" @@ -1504,92 +1524,6 @@ def test_constructor_manager_resize(self, float_frame): tm.assert_index_equal(result.index, Index(index)) tm.assert_index_equal(result.columns, Index(columns)) - def test_constructor_from_items(self, float_frame, float_string_frame): - items = [(c, float_frame[c]) for c in float_frame.columns] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items(items) - tm.assert_frame_equal(recons, float_frame) - - # pass some columns - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items(items, columns=["C", "B", "A"]) - tm.assert_frame_equal(recons, float_frame.loc[:, ["C", "B", "A"]]) - - # orient='index' - - row_items = [ - (idx, float_string_frame.xs(idx)) for idx in float_string_frame.index - ] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items( - row_items, columns=float_string_frame.columns, orient="index" - ) - tm.assert_frame_equal(recons, float_string_frame) - assert recons["A"].dtype == np.float64 - - msg = "Must pass columns with orient='index'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items(row_items, orient="index") - - # orient='index', but thar be tuples - arr = construct_1d_object_array_from_listlike( - [("bar", "baz")] * len(float_string_frame) - ) - float_string_frame["foo"] = arr - row_items = [ - (idx, list(float_string_frame.xs(idx))) for idx in float_string_frame.index - ] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - recons = DataFrame.from_items( - row_items, columns=float_string_frame.columns, orient="index" - ) - tm.assert_frame_equal(recons, float_string_frame) - assert isinstance(recons["foo"][0], tuple) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - rs = DataFrame.from_items( - [("A", [1, 2, 3]), ("B", [4, 5, 6])], - orient="index", - columns=["one", "two", "three"], - ) - xp = DataFrame( - [[1, 2, 3], [4, 5, 6]], index=["A", "B"], columns=["one", "two", "three"] - ) - tm.assert_frame_equal(rs, xp) - - def test_constructor_from_items_scalars(self): - # GH 17312 - msg = ( - r"The value in each \(key, value\) " - "pair must be an array, Series, or dict" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items([("A", 1), ("B", 4)]) - - msg = ( - r"The value in each \(key, value\) " - "pair must be an array, Series, or dict" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items( - [("A", 1), ("B", 2)], columns=["col1"], orient="index" - ) - - def test_from_items_deprecation(self): - # GH 17320 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items([("A", [1, 2, 3]), 
("B", [4, 5, 6])]) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - DataFrame.from_items( - [("A", [1, 2, 3]), ("B", [4, 5, 6])], - columns=["col1", "col2", "col3"], - orient="index", - ) - def test_constructor_mix_series_nonseries(self, float_frame): df = DataFrame( {"A": float_frame["A"], "B": list(float_frame["B"])}, columns=["A", "B"] @@ -1791,7 +1725,7 @@ def test_constructor_with_datetimes(self): # preserver an index with a tz on dict construction i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern") - expected = DataFrame({"a": i.to_series(keep_tz=True).reset_index(drop=True)}) + expected = DataFrame({"a": i.to_series().reset_index(drop=True)}) df = DataFrame() df["a"] = i tm.assert_frame_equal(df, expected) @@ -1802,20 +1736,27 @@ def test_constructor_with_datetimes(self): # multiples i_no_tz = date_range("1/1/2011", periods=5, freq="10s") df = DataFrame({"a": i, "b": i_no_tz}) - expected = DataFrame( - {"a": i.to_series(keep_tz=True).reset_index(drop=True), "b": i_no_tz} - ) + expected = DataFrame({"a": i.to_series().reset_index(drop=True), "b": i_no_tz}) tm.assert_frame_equal(df, expected) - def test_constructor_datetimes_with_nulls(self): - # gh-15869 - for arr in [ + @pytest.mark.parametrize( + "arr", + [ np.array([None, None, None, None, datetime.now(), None]), np.array([None, None, datetime.now(), None]), - ]: - result = DataFrame(arr).dtypes - expected = Series([np.dtype("datetime64[ns]")]) - tm.assert_series_equal(result, expected) + [[np.datetime64("NaT")], [None]], + [[np.datetime64("NaT")], [pd.NaT]], + [[None], [np.datetime64("NaT")]], + [[None], [pd.NaT]], + [[pd.NaT], [np.datetime64("NaT")]], + [[pd.NaT], [None]], + ], + ) + def test_constructor_datetimes_with_nulls(self, arr): + # gh-15869, GH#11220 + result = DataFrame(arr).dtypes + expected = Series([np.dtype("datetime64[ns]")]) + tm.assert_series_equal(result, expected) def test_constructor_for_list_with_dtypes(self): # test list of lists/ndarrays @@ -2441,11 +2382,11 @@ def test_from_records_series_list_dict(self): def test_to_frame_with_falsey_names(self): # GH 16114 - result = Series(name=0).to_frame().dtypes - expected = Series({0: np.float64}) + result = Series(name=0, dtype=object).to_frame().dtypes + expected = Series({0: object}) tm.assert_series_equal(result, expected) - result = DataFrame(Series(name=0)).dtypes + result = DataFrame(Series(name=0, dtype=object)).dtypes tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [None, "uint8", "category"]) @@ -2469,6 +2410,29 @@ class List(list): result = DataFrame(List([List([1, 2, 3]), List([4, 5, 6])])) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "extension_arr", + [ + Categorical(list("aabbc")), + SparseArray([1, np.nan, np.nan, np.nan]), + IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), + PeriodArray(pd.period_range(start="1/1/2017", end="1/1/2018", freq="M")), + ], + ) + def test_constructor_with_extension_array(self, extension_arr): + # GH11363 + expected = DataFrame(Series(extension_arr)) + result = DataFrame(extension_arr) + tm.assert_frame_equal(result, expected) + + def test_datetime_date_tuple_columns_from_dict(self): + # GH 10863 + v = date.today() + tup = v, v + result = DataFrame({tup: Series(range(3), index=range(3))}, columns=[tup]) + expected = DataFrame([0, 1, 2], columns=pd.Index(pd.Series([tup]))) + tm.assert_frame_equal(result, expected) + class TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): @@ -2595,3 +2559,11 @@ def 
test_from_tzaware_mixed_object_array(self): "datetime64[ns, CET]", ] assert (res.dtypes == expected_dtypes).all() + + def test_from_2d_ndarray_with_dtype(self): + # GH#12513 + array_dim2 = np.arange(10).reshape((5, 2)) + df = pd.DataFrame(array_dim2, dtype="datetime64[ns, UTC]") + + expected = pd.DataFrame(array_dim2).astype("datetime64[ns, UTC]") + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py new file mode 100644 index 0000000000000..b545d6aa8afd3 --- /dev/null +++ b/pandas/tests/frame/test_cumulative.py @@ -0,0 +1,135 @@ +""" +Tests for DataFrame cumulative operations + +See also +-------- +tests.series.test_cumulative +""" + +import numpy as np + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFrameCumulativeOps: + # --------------------------------------------------------------------- + # Cumulative Operations - cumsum, cummax, ... + + def test_cumsum_corner(self): + dm = DataFrame(np.arange(20).reshape(4, 5), index=range(4), columns=range(5)) + # TODO(wesm): do something with this? + result = dm.cumsum() # noqa + + def test_cumsum(self, datetime_frame): + datetime_frame.loc[5:10, 0] = np.nan + datetime_frame.loc[10:15, 1] = np.nan + datetime_frame.loc[15:, 2] = np.nan + + # axis = 0 + cumsum = datetime_frame.cumsum() + expected = datetime_frame.apply(Series.cumsum) + tm.assert_frame_equal(cumsum, expected) + + # axis = 1 + cumsum = datetime_frame.cumsum(axis=1) + expected = datetime_frame.apply(Series.cumsum, axis=1) + tm.assert_frame_equal(cumsum, expected) + + # works + df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) + df.cumsum() + + # fix issue + cumsum_xs = datetime_frame.cumsum(axis=1) + assert np.shape(cumsum_xs) == np.shape(datetime_frame) + + def test_cumprod(self, datetime_frame): + datetime_frame.loc[5:10, 0] = np.nan + datetime_frame.loc[10:15, 1] = np.nan + datetime_frame.loc[15:, 2] = np.nan + + # axis = 0 + cumprod = datetime_frame.cumprod() + expected = datetime_frame.apply(Series.cumprod) + tm.assert_frame_equal(cumprod, expected) + + # axis = 1 + cumprod = datetime_frame.cumprod(axis=1) + expected = datetime_frame.apply(Series.cumprod, axis=1) + tm.assert_frame_equal(cumprod, expected) + + # fix issue + cumprod_xs = datetime_frame.cumprod(axis=1) + assert np.shape(cumprod_xs) == np.shape(datetime_frame) + + # ints + df = datetime_frame.fillna(0).astype(int) + df.cumprod(0) + df.cumprod(1) + + # ints32 + df = datetime_frame.fillna(0).astype(np.int32) + df.cumprod(0) + df.cumprod(1) + + def test_cummin(self, datetime_frame): + datetime_frame.loc[5:10, 0] = np.nan + datetime_frame.loc[10:15, 1] = np.nan + datetime_frame.loc[15:, 2] = np.nan + + # axis = 0 + cummin = datetime_frame.cummin() + expected = datetime_frame.apply(Series.cummin) + tm.assert_frame_equal(cummin, expected) + + # axis = 1 + cummin = datetime_frame.cummin(axis=1) + expected = datetime_frame.apply(Series.cummin, axis=1) + tm.assert_frame_equal(cummin, expected) + + # it works + df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) + df.cummin() + + # fix issue + cummin_xs = datetime_frame.cummin(axis=1) + assert np.shape(cummin_xs) == np.shape(datetime_frame) + + def test_cummax(self, datetime_frame): + datetime_frame.loc[5:10, 0] = np.nan + datetime_frame.loc[10:15, 1] = np.nan + datetime_frame.loc[15:, 2] = np.nan + + # axis = 0 + cummax = datetime_frame.cummax() + expected = datetime_frame.apply(Series.cummax) + tm.assert_frame_equal(cummax, expected) + + # axis = 
1 + cummax = datetime_frame.cummax(axis=1) + expected = datetime_frame.apply(Series.cummax, axis=1) + tm.assert_frame_equal(cummax, expected) + + # it works + df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) + df.cummax() + + # fix issue + cummax_xs = datetime_frame.cummax(axis=1) + assert np.shape(cummax_xs) == np.shape(datetime_frame) + + def test_cumulative_ops_preserve_dtypes(self): + # GH#19296 dont incorrectly upcast to object + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]}) + + result = df.cumsum() + + expected = DataFrame( + { + "A": Series([1, 3, 6], dtype=np.int64), + "B": Series([1, 3, 6], dtype=np.float64), + "C": df["C"].cumsum(), + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 68844aeeb081e..06bb040224455 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype +from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, IntervalDtype import pandas as pd from pandas import ( @@ -18,8 +18,8 @@ date_range, option_context, ) +import pandas._testing as tm from pandas.core.arrays import integer_array -import pandas.util.testing as tm def _check_cast(df, v): @@ -46,63 +46,33 @@ def test_concat_empty_dataframe_dtypes(self): assert result["b"].dtype == np.float64 assert result["c"].dtype == np.float64 - def test_empty_frame_dtypes_ftypes(self): + def test_empty_frame_dtypes(self): empty_df = pd.DataFrame() tm.assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object)) - # GH 26705 - Assert .ftypes is deprecated - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object)) - nocols_df = pd.DataFrame(index=[1, 2, 3]) tm.assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object)) - # GH 26705 - Assert .ftypes is deprecated - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object)) - norows_df = pd.DataFrame(columns=list("abc")) tm.assert_series_equal( norows_df.dtypes, pd.Series(np.object, index=list("abc")) ) - # GH 26705 - Assert .ftypes is deprecated - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal( - norows_df.ftypes, pd.Series("object:dense", index=list("abc")) - ) - norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32) tm.assert_series_equal( norows_int_df.dtypes, pd.Series(np.dtype("int32"), index=list("abc")) ) - # GH 26705 - Assert .ftypes is deprecated - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal( - norows_int_df.ftypes, pd.Series("int32:dense", index=list("abc")) - ) odict = OrderedDict df = pd.DataFrame(odict([("a", 1), ("b", True), ("c", 1.0)]), index=[1, 2, 3]) ex_dtypes = pd.Series( odict([("a", np.int64), ("b", np.bool), ("c", np.float64)]) ) - ex_ftypes = pd.Series( - odict([("a", "int64:dense"), ("b", "bool:dense"), ("c", "float64:dense")]) - ) tm.assert_series_equal(df.dtypes, ex_dtypes) - # GH 26705 - Assert .ftypes is deprecated - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(df.ftypes, ex_ftypes) - # same but for empty slice of df tm.assert_series_equal(df[:0].dtypes, ex_dtypes) - # GH 26705 - Assert .ftypes is deprecated - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(df[:0].ftypes, ex_ftypes) - def 
test_datetime_with_tz_dtypes(self): tzframe = DataFrame( { @@ -474,22 +444,6 @@ def test_dtypes_gh8722(self, float_string_frame): result = df.dtypes tm.assert_series_equal(result, Series({0: np.dtype("int64")})) - def test_ftypes(self, mixed_float_frame): - frame = mixed_float_frame - expected = Series( - dict( - A="float32:dense", - B="float32:dense", - C="float16:dense", - D="float64:dense", - ) - ).sort_values() - - # GH 26705 - Assert .ftypes is deprecated - with tm.assert_produces_warning(FutureWarning): - result = frame.ftypes.sort_values() - tm.assert_series_equal(result, expected) - def test_astype_float(self, float_frame): casted = float_frame.astype(int) expected = DataFrame( @@ -702,8 +656,8 @@ def test_astype_dict_like(self, dtype_class): # GH 16717 # if dtypes provided is empty, the resulting DataFrame # should be the same as the original DataFrame - dt7 = dtype_class({}) - result = df.astype(dt7) + dt7 = dtype_class({}) if dtype_class is dict else dtype_class({}, dtype=object) + equiv = df.astype(dt7) tm.assert_frame_equal(df, equiv) tm.assert_frame_equal(df, original) @@ -745,14 +699,7 @@ def test_astype_categorical(self, dtype): expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "cls", - [ - pd.api.types.CategoricalDtype, - pd.api.types.DatetimeTZDtype, - pd.api.types.IntervalDtype, - ], - ) + @pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype]) def test_astype_categoricaldtype_class_raises(self, cls): df = DataFrame({"A": ["a", "a", "b", "c"]}) xpr = "Expected an instance of {}".format(cls.__name__) @@ -762,6 +709,15 @@ def test_astype_categoricaldtype_class_raises(self, cls): with pytest.raises(TypeError, match=xpr): df["A"].astype(cls) + def test_singlerow_slice_categoricaldtype_gives_series(self): + # GH29521 + df = pd.DataFrame({"x": pd.Categorical("a b c d e".split())}) + result = df.iloc[0] + raw_cat = pd.Categorical(["a"], categories=["a", "b", "c", "d", "e"]) + expected = pd.Series(raw_cat, index=["x"], name=0, dtype="category") + + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) def test_astype_extension_dtypes(self, dtype): # GH 22578 @@ -815,6 +771,22 @@ def test_astype_extension_dtypes_duplicate_col(self, dtype): expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("kwargs", [dict(), dict(other=None)]) + def test_df_where_with_category(self, kwargs): + # GH 16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, True], [False, True, True]]) + + # change type to category + df.A = df.A.astype("category") + df.B = df.B.astype("category") + df.C = df.C.astype("category") + + result = df.A.where(mask[:, 0], **kwargs) + expected = Series(pd.Categorical([0, np.nan], categories=[0, 3]), name="A") + + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"] ) @@ -825,6 +797,31 @@ def test_astype_column_metadata(self, dtype): df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) + def test_df_where_change_dtype(self): + # GH 16979 + df = DataFrame(np.arange(2 * 3).reshape(2, 3), columns=list("ABC")) + mask = np.array([[True, False, False], [False, False, True]]) + + result = df.where(mask) + expected = DataFrame( + [[0, np.nan, np.nan], [np.nan, np.nan, 5]], columns=list("ABC") + ) 
+ + tm.assert_frame_equal(result, expected) + + # change type to category + df.A = df.A.astype("category") + df.B = df.B.astype("category") + df.C = df.C.astype("category") + + result = df.where(mask) + A = pd.Categorical([0, np.nan], categories=[0, 3]) + B = pd.Categorical([np.nan, np.nan], categories=[1, 4]) + C = pd.Categorical([np.nan, 5], categories=[2, 5]) + expected = DataFrame({"A": A, "B": B, "C": C}) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_from_datetimelike_to_objectt(self, dtype, unit): @@ -1063,6 +1060,18 @@ def test_asarray_homogenous(self): expected = np.array([[1, 1], [2, 2]], dtype="object") tm.assert_numpy_array_equal(result, expected) + def test_str_to_small_float_conversion_type(self): + # GH 20388 + np.random.seed(13) + col_data = [str(np.random.random() * 1e-12) for _ in range(5)] + result = pd.DataFrame(col_data, columns=["A"]) + expected = pd.DataFrame(col_data, columns=["A"], dtype=object) + tm.assert_frame_equal(result, expected) + # change the dtype of the elements from object to float one by one + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = pd.DataFrame(col_data, columns=["A"], dtype=float) + tm.assert_frame_equal(result, expected) + class TestDataFrameDatetimeWithTZ: def test_interleave(self, timezone_frame): diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 220968d4b3d29..c6e28f3c64f12 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Index, period_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture @@ -195,7 +195,7 @@ def test_join_left_sequence_non_unique_index(): tm.assert_frame_equal(joined, expected) -@pytest.mark.parametrize("sort_kw", [True, False, None]) +@pytest.mark.parametrize("sort_kw", [True, False]) def test_suppress_future_warning_with_sort_kw(sort_kw): a = DataFrame({"col1": [1, 2]}, index=["c", "a"]) @@ -213,12 +213,6 @@ def test_suppress_future_warning_with_sort_kw(sort_kw): if sort_kw is False: expected = expected.reindex(index=["c", "a", "b"]) - if sort_kw is None: - # only warn if not explicitly specified - ctx = tm.assert_produces_warning(FutureWarning, check_stacklevel=False) - else: - ctx = tm.assert_produces_warning(None, check_stacklevel=False) - - with ctx: + with tm.assert_produces_warning(None, check_stacklevel=False): result = a.join([b, c], how="outer", sort=sort_kw) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 24510ff9338ca..2e6759cb1a238 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -8,17 +8,8 @@ import pandas as pd from pandas import Categorical, DataFrame, Series, Timestamp, date_range +import pandas._testing as tm from pandas.tests.frame.common import _check_mixed_float -import pandas.util.testing as tm - - -def _skip_if_no_pchip(): - try: - from scipy.interpolate import pchip_interpolate # noqa - except ImportError: - import pytest - - pytest.skip("scipy.interpolate.pchip missing") class TestDataFrameMissingData: @@ -165,23 +156,16 @@ def test_dropna_multiple_axes(self): [7, np.nan, 8, 9], ] ) - cp = df.copy() # GH20987 - with tm.assert_produces_warning(FutureWarning): - result = df.dropna(how="all", axis=[0, 1]) - with tm.assert_produces_warning(FutureWarning): - 
result2 = df.dropna(how="all", axis=(0, 1)) - expected = df.dropna(how="all").dropna(how="all", axis=1) - - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(df, cp) + with pytest.raises(TypeError, match="supplying multiple axes"): + df.dropna(how="all", axis=[0, 1]) + with pytest.raises(TypeError, match="supplying multiple axes"): + df.dropna(how="all", axis=(0, 1)) inp = df.copy() - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(TypeError, match="supplying multiple axes"): inp.dropna(how="all", axis=(0, 1), inplace=True) - tm.assert_frame_equal(inp, expected) def test_dropna_tz_aware_datetime(self): # GH13407 @@ -678,7 +662,7 @@ def test_fillna_invalid_method(self, float_frame): def test_fillna_invalid_value(self, float_frame): # list - msg = '"value" parameter must be a scalar or dict, but you passed' ' a "{}"' + msg = '"value" parameter must be a scalar or dict, but you passed a "{}"' with pytest.raises(TypeError, match=msg.format("list")): float_frame.fillna([1, 2]) # tuple @@ -844,8 +828,6 @@ def test_interp_alt_scipy(self): expectedk["A"] = expected["A"] tm.assert_frame_equal(result, expectedk) - _skip_if_no_pchip() - result = df.interpolate(method="pchip") expected.loc[2, "A"] = 3 expected.loc[5, "A"] = 6.0 @@ -988,3 +970,16 @@ def test_interp_ignore_all_good(self): # all good result = df[["B", "D"]].interpolate(downcast=None) tm.assert_frame_equal(result, df[["B", "D"]]) + + @pytest.mark.parametrize("axis", [0, 1]) + def test_interp_time_inplace_axis(self, axis): + # GH 9687 + periods = 5 + idx = pd.date_range(start="2014-01-01", periods=periods) + data = np.random.rand(periods, periods) + data[data < 0.5] = np.nan + expected = pd.DataFrame(index=idx, columns=idx, data=data) + + result = expected.interpolate(axis=0, method="time") + expected.interpolate(axis=0, method="time", inplace=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 7feb55f2fac09..8bc2aa214e035 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -3,10 +3,8 @@ import numpy as np import pytest -from pandas.compat import PY36 - from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm # Column add, remove, delete. 
@@ -60,10 +58,7 @@ def test_assign_order(self): df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) result = df.assign(D=df.A + df.B, C=df.A - df.B) - if PY36: - expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) - else: - expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD")) + expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) tm.assert_frame_equal(result, expected) result = df.assign(C=df.A - df.B, D=df.A + df.B) @@ -80,25 +75,6 @@ def test_assign_bad(self): with pytest.raises(AttributeError): df.assign(C=df.A, D=df.A + df.C) - @pytest.mark.skipif( - PY36, - reason="""Issue #14207: valid for python - 3.6 and above""", - ) - def test_assign_dependent_old_python(self): - df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - - # Key C does not exist at definition time of df - with pytest.raises(KeyError, match="^'C'$"): - df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) - with pytest.raises(KeyError, match="^'C'$"): - df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) - - @pytest.mark.skipif( - not PY36, - reason="""Issue #14207: not valid for - python 3.5 and below""", - ) def test_assign_dependent(self): df = DataFrame({"A": [1, 2], "B": [3, 4]}) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 8fed695a483f5..32ead406a3e86 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameNonuniqueIndexes: diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 19d91241d6a6b..c727cb398d53e 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm import pandas.core.common as com from pandas.tests.frame.common import _check_mixed_float -import pandas.util.testing as tm class TestDataFrameUnaryOperators: @@ -218,6 +218,42 @@ def test_logical_with_nas(self): expected = Series([True, True]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "left, right, op, expected", + [ + ( + [True, False, np.nan], + [True, False, True], + operator.and_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.and_, + [True, False, False], + ), + ( + [True, False, np.nan], + [True, False, True], + operator.or_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.or_, + [True, False, True], + ), + ], + ) + def test_logical_operators_nans(self, left, right, op, expected): + # GH 13896 + result = op(DataFrame(left), DataFrame(right)) + expected = DataFrame(expected) + + tm.assert_frame_equal(result, expected) + class TestDataFrameOperators: @pytest.mark.parametrize( @@ -530,6 +566,16 @@ def test_comp(func): test_comp(operator.ge) test_comp(operator.le) + def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): + # GH 11565 + df = DataFrame( + {x: {"x": "foo", "y": "bar", "z": "baz"} for x in ["a", "b", "c"]} + ) + + f = getattr(operator, compare_operators_no_eq_ne) + with pytest.raises(TypeError): + f(df, 0) + def test_comparison_protected_from_errstate(self): missing_df = tm.makeDataFrame() missing_df.iloc[0]["A"] = np.nan @@ -842,44 +888,3 @@ def 
test_no_warning(self, all_arithmetic_operators): b = df["B"] with tm.assert_produces_warning(None): getattr(df, all_arithmetic_operators)(b, 0) - - -class TestTranspose: - def test_transpose_tzaware_1col_single_tz(self): - # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - - df = pd.DataFrame(dti) - assert (df.dtypes == dti.dtype).all() - res = df.T - assert (res.dtypes == dti.dtype).all() - - def test_transpose_tzaware_2col_single_tz(self): - # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - - df3 = pd.DataFrame({"A": dti, "B": dti}) - assert (df3.dtypes == dti.dtype).all() - res3 = df3.T - assert (res3.dtypes == dti.dtype).all() - - def test_transpose_tzaware_2col_mixed_tz(self): - # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - dti2 = dti.tz_convert("US/Pacific") - - df4 = pd.DataFrame({"A": dti, "B": dti2}) - assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() - assert (df4.T.dtypes == object).all() - tm.assert_frame_equal(df4.T.T, df4) - - def test_transpose_object_to_tzaware_mixed_tz(self): - # GH#26825 - dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - dti2 = dti.tz_convert("US/Pacific") - - # mixed all-tzaware dtypes - df2 = pd.DataFrame([dti, dti2]) - assert (df2.dtypes == object).all() - res2 = df2.T - assert (res2.dtypes == [dti.dtype, dti2.dtype]).all() diff --git a/pandas/tests/frame/test_period.py b/pandas/tests/frame/test_period.py index a545db3365e36..a6b2b334d3ec8 100644 --- a/pandas/tests/frame/test_period.py +++ b/pandas/tests/frame/test_period.py @@ -14,7 +14,7 @@ period_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm def _permute(obj): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 04d27f4c12c59..703e05998e93c 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -8,8 +8,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, date_range +import pandas._testing as tm from pandas.core.computation.check import _NUMEXPR_INSTALLED -import pandas.util.testing as tm PARSERS = "python", "pandas" ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne) @@ -27,7 +27,7 @@ def engine(request): def skip_if_no_pandas_parser(parser): if parser != "pandas": - pytest.skip("cannot evaluate with parser {0!r}".format(parser)) + pytest.skip(f"cannot evaluate with parser {repr(parser)}") class TestCompat: @@ -479,11 +479,13 @@ def test_query_scope(self): tm.assert_frame_equal(res, expected) # no local variable c - with pytest.raises(UndefinedVariableError): + with pytest.raises( + UndefinedVariableError, match="local variable 'c' is not defined" + ): df.query("@a > b > @c", engine=engine, parser=parser) # no column named 'c' - with pytest.raises(UndefinedVariableError): + with pytest.raises(UndefinedVariableError, match="name 'c' is not defined"): df.query("@a > b > c", engine=engine, parser=parser) def test_query_doesnt_pickup_local(self): @@ -494,7 +496,7 @@ def test_query_doesnt_pickup_local(self): df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) # we don't pick up the local 'sin' - with pytest.raises(UndefinedVariableError): + with pytest.raises(UndefinedVariableError, match="name 'sin' is not defined"): df.query("sin > 5", engine=engine, parser=parser) def test_query_builtin(self): @@ -588,7 +590,7 @@ def test_nested_raises_on_local_self_reference(self): df = DataFrame(np.random.randn(5, 3)) # can't 
reference ourself b/c we're a local so @ is necessary - with pytest.raises(UndefinedVariableError): + with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"): df.query("df > 0", engine=self.engine, parser=self.parser) def test_local_syntax(self): @@ -651,9 +653,9 @@ def test_query_undefined_local(self): skip_if_no_pandas_parser(parser) df = DataFrame(np.random.rand(10, 2), columns=list("ab")) - msg = "local variable 'c' is not defined" - - with pytest.raises(UndefinedVariableError, match=msg): + with pytest.raises( + UndefinedVariableError, match="local variable 'c' is not defined" + ): df.query("a == @c", engine=engine, parser=parser) def test_index_resolvers_come_after_columns_with_the_same_name(self): @@ -784,7 +786,7 @@ def test_nested_scope(self): with pytest.raises(SyntaxError): df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser) - with pytest.raises(UndefinedVariableError): + with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"): df.query("(df>0) & (df2>0)", engine=engine, parser=parser) expected = df[(df > 0) & (df2 > 0)] @@ -991,7 +993,7 @@ def test_query_lex_compare_strings(self, parser, engine): ops = {"<": operator.lt, ">": operator.gt, "<=": operator.le, ">=": operator.ge} for op, func in ops.items(): - res = df.query('X %s "d"' % op, engine=engine, parser=parser) + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) @@ -1046,13 +1048,35 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op): class TestDataFrameQueryBacktickQuoting: @pytest.fixture(scope="class") def df(self): + """ + Yields a dataframe with strings that may or may not need escaping + by backticks. The last two columns cannot be escaped by backticks + and should raise a ValueError. 
+ """ yield DataFrame( { "A": [1, 2, 3], "B B": [3, 2, 1], "C C": [4, 5, 6], + "C C": [7, 4, 3], "C_C": [8, 9, 10], "D_D D": [11, 1, 101], + "E.E": [6, 3, 5], + "F-F": [8, 1, 10], + "1e1": [2, 4, 8], + "def": [10, 11, 2], + "A (x)": [4, 1, 3], + "B(x)": [1, 1, 5], + "B (x)": [2, 7, 4], + " &^ :!€$?(} > <++*'' ": [2, 5, 6], + "": [10, 11, 1], + " A": [4, 7, 9], + " ": [1, 2, 1], + "it's": [6, 3, 1], + "that's": [9, 1, 8], + "☺": [8, 7, 6], + "foo#bar": [2, 4, 5], + 1: [5, 7, 9], } ) @@ -1091,7 +1115,64 @@ def test_mixed_underscores_and_spaces(self, df): expect = df["A"] + df["D_D D"] tm.assert_series_equal(res, expect) - def backtick_quote_name_with_no_spaces(self, df): + def test_backtick_quote_name_with_no_spaces(self, df): res = df.eval("A + `C_C`") expect = df["A"] + df["C_C"] tm.assert_series_equal(res, expect) + + def test_special_characters(self, df): + res = df.eval("`E.E` + `F-F` - A") + expect = df["E.E"] + df["F-F"] - df["A"] + tm.assert_series_equal(res, expect) + + def test_start_with_digit(self, df): + res = df.eval("A + `1e1`") + expect = df["A"] + df["1e1"] + tm.assert_series_equal(res, expect) + + def test_keyword(self, df): + res = df.eval("A + `def`") + expect = df["A"] + df["def"] + tm.assert_series_equal(res, expect) + + def test_unneeded_quoting(self, df): + res = df.query("`A` > 2") + expect = df[df["A"] > 2] + tm.assert_frame_equal(res, expect) + + def test_parenthesis(self, df): + res = df.query("`A (x)` > 2") + expect = df[df["A (x)"] > 2] + tm.assert_frame_equal(res, expect) + + def test_empty_string(self, df): + res = df.query("`` > 5") + expect = df[df[""] > 5] + tm.assert_frame_equal(res, expect) + + def test_multiple_spaces(self, df): + res = df.query("`C C` > 5") + expect = df[df["C C"] > 5] + tm.assert_frame_equal(res, expect) + + def test_start_with_spaces(self, df): + res = df.eval("` A` + ` `") + expect = df[" A"] + df[" "] + tm.assert_series_equal(res, expect) + + def test_lots_of_operators_string(self, df): + res = df.query("` &^ :!€$?(} > <++*'' ` > 4") + expect = df[df[" &^ :!€$?(} > <++*'' "] > 4] + tm.assert_frame_equal(res, expect) + + def test_failing_quote(self, df): + with pytest.raises(SyntaxError): + df.query("`it's` > `that's`") + + def test_failing_character_outside_range(self, df): + with pytest.raises(SyntaxError): + df.query("`☺` > 4") + + def test_failing_hashtag(self, df): + with pytest.raises(SyntaxError): + df.query("`foo#bar` > 4") diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 318b1c6add91e..05bdec4a3a4d2 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -3,6 +3,7 @@ import re import sys import textwrap +import warnings import numpy as np import pytest @@ -18,7 +19,7 @@ option_context, period_range, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt @@ -29,17 +30,17 @@ class TestDataFrameReprInfoEtc: def test_repr_empty(self): # empty - foo = repr(DataFrame()) # noqa + repr(DataFrame()) # empty with index frame = DataFrame(index=np.arange(1000)) - foo = repr(frame) # noqa + repr(frame) def test_repr_mixed(self, float_string_frame): buf = StringIO() # mixed - foo = repr(float_string_frame) # noqa + repr(float_string_frame) float_string_frame.info(verbose=False, buf=buf) @pytest.mark.slow @@ -51,13 +52,13 @@ def test_repr_mixed_big(self): biggie.loc[:20, "A"] = np.nan biggie.loc[:20, "B"] = np.nan - foo = repr(biggie) # noqa + repr(biggie) def test_repr(self, float_frame): buf = StringIO() 
# small one - foo = repr(float_frame) + repr(float_frame) float_frame.info(verbose=False, buf=buf) # even smaller @@ -68,7 +69,7 @@ def test_repr(self, float_frame): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) - foo = repr(no_index) # noqa + repr(no_index) # no columns or index DataFrame().info(buf=buf) @@ -97,7 +98,6 @@ def test_repr_big(self): def test_repr_unsortable(self, float_frame): # columns are not sortable - import warnings warn_filters = warnings.filters warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") @@ -205,6 +205,28 @@ def test_info(self, float_frame, datetime_frame): frame.info() frame.info(verbose=False) + def test_info_verbose(self): + buf = StringIO() + size = 1001 + start = 5 + frame = DataFrame(np.random.randn(3, size)) + frame.info(verbose=True, buf=buf) + + res = buf.getvalue() + header = " # Column Dtype \n--- ------ ----- " + assert header in res + + frame.info(verbose=True, buf=buf) + buf.seek(0) + lines = buf.readlines() + assert len(lines) > 0 + + for i, line in enumerate(lines): + if i >= start and i < start + size: + index = i - start + line_nr = " {} ".format(index) + assert line.startswith(line_nr) + def test_info_memory(self): # https://github.com/pandas-dev/pandas/issues/21056 df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")}) @@ -218,7 +240,9 @@ def test_info_memory(self): RangeIndex: 2 entries, 0 to 1 Data columns (total 1 columns): - a 2 non-null int64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 2 non-null int64 dtypes: int64(1) memory usage: {} bytes """.format( @@ -262,8 +286,8 @@ def test_info_duplicate_columns_shows_correct_dtypes(self): frame.info(buf=io) io.seek(0) lines = io.readlines() - assert "a 1 non-null int64\n" == lines[3] - assert "a 1 non-null float64\n" == lines[4] + assert " 0 a 1 non-null int64 \n" == lines[5] + assert " 1 a 1 non-null float64\n" == lines[6] def test_info_shows_column_dtypes(self): dtypes = [ @@ -283,13 +307,20 @@ def test_info_shows_column_dtypes(self): buf = StringIO() df.info(buf=buf) res = buf.getvalue() + header = ( + " # Column Non-Null Count Dtype \n" + "--- ------ -------------- ----- " + ) + assert header in res for i, dtype in enumerate(dtypes): - name = "{i:d} {n:d} non-null {dtype}".format(i=i, n=n, dtype=dtype) + name = " {i:d} {i:d} {n:d} non-null {dtype}".format( + i=i, n=n, dtype=dtype + ) assert name in res def test_info_max_cols(self): df = DataFrame(np.random.randn(10, 5)) - for len_, verbose in [(5, None), (5, False), (10, True)]: + for len_, verbose in [(5, None), (5, False), (12, True)]: # For verbose always ^ setting ^ summarize ^ full output with option_context("max_info_columns", 4): buf = StringIO() @@ -297,16 +328,16 @@ def test_info_max_cols(self): res = buf.getvalue() assert len(res.strip().split("\n")) == len_ - for len_, verbose in [(10, None), (5, False), (10, True)]: + for len_, verbose in [(12, None), (5, False), (12, True)]: - # max_cols no exceeded + # max_cols not exceeded with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() assert len(res.strip().split("\n")) == len_ - for len_, max_cols in [(10, 5), (5, 4)]: + for len_, max_cols in [(12, 5), (5, 4)]: # setting truncates with option_context("max_info_columns", 4): buf = StringIO() diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 5d2c115ce8eb5..56a0c8cf4f5bd 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ 
-6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameReshape: @@ -699,7 +699,7 @@ def verify(df): for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) - right = sorted(list(map(cast, right))) + right = sorted(map(cast, right)) assert left == right df = DataFrame( @@ -1128,3 +1128,34 @@ def test_stack_timezone_aware_values(): ), ) tm.assert_series_equal(result, expected) + + +def test_unstacking_multi_index_df(): + # see gh-30740 + df = DataFrame( + { + "name": ["Alice", "Bob"], + "score": [9.5, 8], + "employed": [False, True], + "kids": [0, 0], + "gender": ["female", "male"], + } + ) + df = df.set_index(["name", "employed", "kids", "gender"]) + df = df.unstack(["gender"], fill_value=0) + expected = df.unstack("employed", fill_value=0).unstack("kids", fill_value=0) + result = df.unstack(["employed", "kids"], fill_value=0) + expected = DataFrame( + [[9.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 8.0]], + index=Index(["Alice", "Bob"], name="name"), + columns=MultiIndex.from_tuples( + [ + ("score", "female", False, 0), + ("score", "female", True, 0), + ("score", "male", False, 0), + ("score", "male", True, 0), + ], + names=[None, "gender", "employed", "kids"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index b0287d9180859..40526ab27ac9a 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -4,7 +4,7 @@ from pandas.errors import PerformanceWarning from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index e1e546256f7cd..4a436d70dc48f 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSubclassing: diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index cad1fd60ca2a9..e89f4ee07ea00 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -12,12 +12,11 @@ Index, MultiIndex, Series, - Timestamp, date_range, period_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.tseries.offsets as offsets @@ -28,174 +27,6 @@ def close_open_fixture(request): class TestDataFrameTimeSeriesMethods: - def test_diff(self, datetime_frame): - the_diff = datetime_frame.diff(1) - - tm.assert_series_equal( - the_diff["A"], datetime_frame["A"] - datetime_frame["A"].shift(1) - ) - - # int dtype - a = 10000000000000000 - b = a + 1 - s = Series([a, b]) - - rs = DataFrame({"s": s}).diff() - assert rs.s[1] == 1 - - # mixed numeric - tf = datetime_frame.astype("float32") - the_diff = tf.diff(1) - tm.assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1)) - - # issue 10907 - df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])}) - df.insert(0, "x", 1) - result = df.diff(axis=1) - expected = pd.DataFrame( - {"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)} - ).astype("float64") - tm.assert_frame_equal(result, expected) - - 
@pytest.mark.parametrize("tz", [None, "UTC"]) - def test_diff_datetime_axis0(self, tz): - # GH 18578 - df = DataFrame( - { - 0: date_range("2010", freq="D", periods=2, tz=tz), - 1: date_range("2010", freq="D", periods=2, tz=tz), - } - ) - - result = df.diff(axis=0) - expected = DataFrame( - { - 0: pd.TimedeltaIndex(["NaT", "1 days"]), - 1: pd.TimedeltaIndex(["NaT", "1 days"]), - } - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("tz", [None, "UTC"]) - def test_diff_datetime_axis1(self, tz): - # GH 18578 - df = DataFrame( - { - 0: date_range("2010", freq="D", periods=2, tz=tz), - 1: date_range("2010", freq="D", periods=2, tz=tz), - } - ) - if tz is None: - result = df.diff(axis=1) - expected = DataFrame( - { - 0: pd.TimedeltaIndex(["NaT", "NaT"]), - 1: pd.TimedeltaIndex(["0 days", "0 days"]), - } - ) - tm.assert_frame_equal(result, expected) - else: - with pytest.raises(NotImplementedError): - result = df.diff(axis=1) - - def test_diff_timedelta(self): - # GH 4533 - df = DataFrame( - dict( - time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], - value=[1.0, 2.0], - ) - ) - - res = df.diff() - exp = DataFrame( - [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"] - ) - tm.assert_frame_equal(res, exp) - - def test_diff_mixed_dtype(self): - df = DataFrame(np.random.randn(5, 3)) - df["A"] = np.array([1, 2, 3, 4, 5], dtype=object) - - result = df.diff() - assert result[0].dtype == np.float64 - - def test_diff_neg_n(self, datetime_frame): - rs = datetime_frame.diff(-1) - xp = datetime_frame - datetime_frame.shift(-1) - tm.assert_frame_equal(rs, xp) - - def test_diff_float_n(self, datetime_frame): - rs = datetime_frame.diff(1.0) - xp = datetime_frame.diff(1) - tm.assert_frame_equal(rs, xp) - - def test_diff_axis(self): - # GH 9727 - df = DataFrame([[1.0, 2.0], [3.0, 4.0]]) - tm.assert_frame_equal( - df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]]) - ) - tm.assert_frame_equal( - df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]]) - ) - - def test_pct_change(self, datetime_frame): - rs = datetime_frame.pct_change(fill_method=None) - tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) - - rs = datetime_frame.pct_change(2) - filled = datetime_frame.fillna(method="pad") - tm.assert_frame_equal(rs, filled / filled.shift(2) - 1) - - rs = datetime_frame.pct_change(fill_method="bfill", limit=1) - filled = datetime_frame.fillna(method="bfill", limit=1) - tm.assert_frame_equal(rs, filled / filled.shift(1) - 1) - - rs = datetime_frame.pct_change(freq="5D") - filled = datetime_frame.fillna(method="pad") - tm.assert_frame_equal( - rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) - ) - - def test_pct_change_shift_over_nas(self): - s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - - df = DataFrame({"a": s, "b": s}) - - chg = df.pct_change() - expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) - edf = DataFrame({"a": expected, "b": expected}) - tm.assert_frame_equal(chg, edf) - - @pytest.mark.parametrize( - "freq, periods, fill_method, limit", - [ - ("5B", 5, None, None), - ("3B", 3, None, None), - ("3B", 3, "bfill", None), - ("7B", 7, "pad", 1), - ("7B", 7, "bfill", 3), - ("14B", 14, None, None), - ], - ) - def test_pct_change_periods_freq( - self, datetime_frame, freq, periods, fill_method, limit - ): - # GH 7292 - rs_freq = datetime_frame.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - rs_periods = datetime_frame.pct_change( - periods, fill_method=fill_method, limit=limit - ) - 
tm.assert_frame_equal(rs_freq, rs_periods) - - empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) - rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) - tm.assert_frame_equal(rs_freq, rs_periods) - def test_frame_ctor_datetime64_column(self): rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") dates = np.asarray(rng) @@ -249,267 +80,6 @@ def test_frame_append_datetime64_col_other_units(self): assert (tmp["dates"].values == ex_vals).all() - def test_shift(self, datetime_frame, int_frame): - # naive shift - shiftedFrame = datetime_frame.shift(5) - tm.assert_index_equal(shiftedFrame.index, datetime_frame.index) - - shiftedSeries = datetime_frame["A"].shift(5) - tm.assert_series_equal(shiftedFrame["A"], shiftedSeries) - - shiftedFrame = datetime_frame.shift(-5) - tm.assert_index_equal(shiftedFrame.index, datetime_frame.index) - - shiftedSeries = datetime_frame["A"].shift(-5) - tm.assert_series_equal(shiftedFrame["A"], shiftedSeries) - - # shift by 0 - unshifted = datetime_frame.shift(0) - tm.assert_frame_equal(unshifted, datetime_frame) - - # shift by DateOffset - shiftedFrame = datetime_frame.shift(5, freq=offsets.BDay()) - assert len(shiftedFrame) == len(datetime_frame) - - shiftedFrame2 = datetime_frame.shift(5, freq="B") - tm.assert_frame_equal(shiftedFrame, shiftedFrame2) - - d = datetime_frame.index[0] - shifted_d = d + offsets.BDay(5) - tm.assert_series_equal( - datetime_frame.xs(d), shiftedFrame.xs(shifted_d), check_names=False - ) - - # shift int frame - int_shifted = int_frame.shift(1) # noqa - - # Shifting with PeriodIndex - ps = tm.makePeriodFrame() - shifted = ps.shift(1) - unshifted = shifted.shift(-1) - tm.assert_index_equal(shifted.index, ps.index) - tm.assert_index_equal(unshifted.index, ps.index) - tm.assert_numpy_array_equal( - unshifted.iloc[:, 0].dropna().values, ps.iloc[:-1, 0].values - ) - - shifted2 = ps.shift(1, "B") - shifted3 = ps.shift(1, offsets.BDay()) - tm.assert_frame_equal(shifted2, shifted3) - tm.assert_frame_equal(ps, shifted2.shift(-1, "B")) - - msg = "does not match PeriodIndex freq" - with pytest.raises(ValueError, match=msg): - ps.shift(freq="D") - - # shift other axis - # GH 6371 - df = DataFrame(np.random.rand(10, 5)) - expected = pd.concat( - [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], - ignore_index=True, - axis=1, - ) - result = df.shift(1, axis=1) - tm.assert_frame_equal(result, expected) - - # shift named axis - df = DataFrame(np.random.rand(10, 5)) - expected = pd.concat( - [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], - ignore_index=True, - axis=1, - ) - result = df.shift(1, axis="columns") - tm.assert_frame_equal(result, expected) - - def test_shift_bool(self): - df = DataFrame({"high": [True, False], "low": [False, False]}) - rs = df.shift(1) - xp = DataFrame( - np.array([[np.nan, np.nan], [True, False]], dtype=object), - columns=["high", "low"], - ) - tm.assert_frame_equal(rs, xp) - - def test_shift_categorical(self): - # GH 9416 - s1 = pd.Series(["a", "b", "c"], dtype="category") - s2 = pd.Series(["A", "B", "C"], dtype="category") - df = DataFrame({"one": s1, "two": s2}) - rs = df.shift(1) - xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)}) - tm.assert_frame_equal(rs, xp) - - def test_shift_fill_value(self): - # GH #24128 - df = DataFrame( - [1, 2, 3, 4, 5], index=date_range("1/1/2000", periods=5, freq="H") - ) - exp = DataFrame( - [0, 
1, 2, 3, 4], index=date_range("1/1/2000", periods=5, freq="H") - ) - result = df.shift(1, fill_value=0) - tm.assert_frame_equal(result, exp) - - exp = DataFrame( - [0, 0, 1, 2, 3], index=date_range("1/1/2000", periods=5, freq="H") - ) - result = df.shift(2, fill_value=0) - tm.assert_frame_equal(result, exp) - - def test_shift_empty(self): - # Regression test for #8019 - df = DataFrame({"foo": []}) - rs = df.shift(-1) - - tm.assert_frame_equal(df, rs) - - def test_shift_duplicate_columns(self): - # GH 9092; verify that position-based shifting works - # in the presence of duplicate columns - column_lists = [list(range(5)), [1] * 5, [1, 1, 2, 2, 1]] - data = np.random.randn(20, 5) - - shifted = [] - for columns in column_lists: - df = pd.DataFrame(data.copy(), columns=columns) - for s in range(5): - df.iloc[:, s] = df.iloc[:, s].shift(s + 1) - df.columns = range(5) - shifted.append(df) - - # sanity check the base case - nulls = shifted[0].isna().sum() - tm.assert_series_equal(nulls, Series(range(1, 6), dtype="int64")) - - # check all answers are the same - tm.assert_frame_equal(shifted[0], shifted[1]) - tm.assert_frame_equal(shifted[0], shifted[2]) - - def test_tshift(self, datetime_frame): - # PeriodIndex - ps = tm.makePeriodFrame() - shifted = ps.tshift(1) - unshifted = shifted.tshift(-1) - - tm.assert_frame_equal(unshifted, ps) - - shifted2 = ps.tshift(freq="B") - tm.assert_frame_equal(shifted, shifted2) - - shifted3 = ps.tshift(freq=offsets.BDay()) - tm.assert_frame_equal(shifted, shifted3) - - with pytest.raises(ValueError, match="does not match"): - ps.tshift(freq="M") - - # DatetimeIndex - shifted = datetime_frame.tshift(1) - unshifted = shifted.tshift(-1) - - tm.assert_frame_equal(datetime_frame, unshifted) - - shifted2 = datetime_frame.tshift(freq=datetime_frame.index.freq) - tm.assert_frame_equal(shifted, shifted2) - - inferred_ts = DataFrame( - datetime_frame.values, - Index(np.asarray(datetime_frame.index)), - columns=datetime_frame.columns, - ) - shifted = inferred_ts.tshift(1) - unshifted = shifted.tshift(-1) - tm.assert_frame_equal(shifted, datetime_frame.tshift(1)) - tm.assert_frame_equal(unshifted, inferred_ts) - - no_freq = datetime_frame.iloc[[0, 5, 7], :] - msg = "Freq was not given and was not set in the index" - with pytest.raises(ValueError, match=msg): - no_freq.tshift() - - def test_truncate(self, datetime_frame): - ts = datetime_frame[::3] - - start, end = datetime_frame.index[3], datetime_frame.index[6] - - start_missing = datetime_frame.index[2] - end_missing = datetime_frame.index[7] - - # neither specified - truncated = ts.truncate() - tm.assert_frame_equal(truncated, ts) - - # both specified - expected = ts[1:3] - - truncated = ts.truncate(start, end) - tm.assert_frame_equal(truncated, expected) - - truncated = ts.truncate(start_missing, end_missing) - tm.assert_frame_equal(truncated, expected) - - # start specified - expected = ts[1:] - - truncated = ts.truncate(before=start) - tm.assert_frame_equal(truncated, expected) - - truncated = ts.truncate(before=start_missing) - tm.assert_frame_equal(truncated, expected) - - # end specified - expected = ts[:3] - - truncated = ts.truncate(after=end) - tm.assert_frame_equal(truncated, expected) - - truncated = ts.truncate(after=end_missing) - tm.assert_frame_equal(truncated, expected) - - msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00" - with pytest.raises(ValueError, match=msg): - ts.truncate( - before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq - ) - - def 
test_truncate_copy(self, datetime_frame): - index = datetime_frame.index - truncated = datetime_frame.truncate(index[5], index[10]) - truncated.values[:] = 5.0 - assert not (datetime_frame.values[5:11] == 5).any() - - def test_truncate_nonsortedindex(self): - # GH 17935 - - df = pd.DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0]) - msg = "truncate requires a sorted index" - with pytest.raises(ValueError, match=msg): - df.truncate(before=3, after=9) - - rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") - ts = pd.DataFrame( - {"A": np.random.randn(len(rng)), "B": np.random.randn(len(rng))}, index=rng - ) - msg = "truncate requires a sorted index" - with pytest.raises(ValueError, match=msg): - ts.sort_values("A", ascending=False).truncate( - before="2011-11", after="2011-12" - ) - - df = pd.DataFrame( - { - 3: np.random.randn(5), - 20: np.random.randn(5), - 2: np.random.randn(5), - 0: np.random.randn(5), - }, - columns=[3, 20, 2, 0], - ) - msg = "truncate requires a sorted index" - with pytest.raises(ValueError, match=msg): - df.truncate(before=2, after=20, axis=1) - def test_asfreq(self, datetime_frame): offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd()) rule_monthly = datetime_frame.asfreq("BM") @@ -609,6 +179,14 @@ def test_first_last_valid( assert expected_first == df.first_valid_index() assert expected_last == df.last_valid_index() + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_first_valid_index_all_nan(self, klass): + # GH#9752 Series/DataFrame should both return None, not raise + obj = klass([np.nan]) + + assert obj.first_valid_index() is None + assert obj.iloc[:0].first_valid_index() is None + def test_first_subset(self): ts = tm.makeTimeDataFrame(freq="12h") result = ts.first("10d") diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index 26ab4ff0ded85..b60f2052a988f 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -11,8 +11,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm class TestDataFrameTimezones: diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index ad058faff96e7..aeff92971b42a 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -18,10 +18,10 @@ read_csv, to_datetime, ) +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm -from pandas.io.common import _get_handle +from pandas.io.common import get_handle MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] MIXED_INT_DTYPES = [ @@ -1065,7 +1065,7 @@ def test_to_csv_compression(self, df, encoding, compression): tm.assert_frame_equal(df, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = _get_handle( + f, _handles = get_handle( filename, "w", compression=compression, encoding=encoding ) with f: diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 270a7c70a2e81..7fe22e77c5bf3 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm from .test_generic import Generic @@ -196,7 +196,7 @@ def test_set_attribute(self): def test_to_xarray_index_types(self, index): from xarray import Dataset - 
index = getattr(tm, "make{}".format(index)) + index = getattr(tm, f"make{index}") df = DataFrame( { "a": list("abc"), @@ -222,11 +222,10 @@ def test_to_xarray_index_types(self, index): # idempotency # categoricals are not preserved - # datetimes w/tz are not preserved + # datetimes w/tz are preserved # column names are lost expected = df.copy() expected["f"] = expected["f"].astype(object) - expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None tm.assert_frame_equal( result.to_dataframe(), @@ -271,7 +270,6 @@ def test_to_xarray(self): result = result.to_dataframe() expected = df.copy() expected["f"] = expected["f"].astype(object) - expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index a7506f3d60b3c..10a1e09a09bf8 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm # ---------------------------------------------------------------------- # Generic types test cases @@ -33,6 +33,7 @@ def _construct(self, shape, value=None, dtype=None, **kwargs): if is_scalar(value): if value == "empty": arr = None + dtype = np.float64 # remove the info axis kwargs.pop(self._typ._info_axis_name, None) @@ -124,7 +125,7 @@ def test_nonzero(self): # GH 4633 # look at the boolean/nonzero behavior for objects obj = self._construct(shape=4) - msg = "The truth value of a {} is ambiguous".format(self._typ.__name__) + msg = f"The truth value of a {self._typ.__name__} is ambiguous" with pytest.raises(ValueError, match=msg): bool(obj == 0) with pytest.raises(ValueError, match=msg): @@ -202,9 +203,9 @@ def test_constructor_compound_dtypes(self): def f(dtype): return self._construct(shape=3, value=1, dtype=dtype) - msg = "compound dtypes are not implemented in the {} constructor".format( - self._typ.__name__ - ) + msg = "compound dtypes are not implemented" + f"in the {self._typ.__name__} constructor" + with pytest.raises(NotImplementedError, match=msg): f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) @@ -322,6 +323,7 @@ def test_sample(self): self._compare( o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed) ) + self._compare( o.sample(frac=0.7, random_state=seed), o.sample(frac=0.7, random_state=seed), @@ -337,6 +339,15 @@ def test_sample(self): o.sample(frac=0.7, random_state=np.random.RandomState(test)), ) + self._compare( + o.sample( + frac=2, replace=True, random_state=np.random.RandomState(test) + ), + o.sample( + frac=2, replace=True, random_state=np.random.RandomState(test) + ), + ) + os1, os2 = [], [] for _ in range(2): np.random.seed(test) @@ -424,6 +435,17 @@ def test_sample(self): weights_with_None[5] = 0.5 self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) + def test_sample_upsampling_without_replacement(self): + # GH27451 + + df = pd.DataFrame({"A": list("abc")}) + msg = ( + "Replace has to be set to `True` when " + "upsampling the population `frac` > 1." 
+ ) + with pytest.raises(ValueError, match=msg): + df.sample(frac=2, replace=False) + def test_size_compat(self): # GH8846 # size property should be defined @@ -711,13 +733,10 @@ def test_squeeze(self): tm.assert_series_equal(df.squeeze(), df["A"]) # don't fail with 0 length dimensions GH11229 & GH8999 - empty_series = Series([], name="five") + empty_series = Series([], name="five", dtype=np.float64) empty_frame = DataFrame([empty_series]) - - [ - tm.assert_series_equal(empty_series, higher_dim.squeeze()) - for higher_dim in [empty_series, empty_frame] - ] + tm.assert_series_equal(empty_series, empty_series.squeeze()) + tm.assert_series_equal(empty_series, empty_frame.squeeze()) # axis argument df = tm.makeTimeDataFrame(nper=1).iloc[:, :1] @@ -801,6 +820,18 @@ def test_take_invalid_kwargs(self): with pytest.raises(ValueError, match=msg): obj.take(indices, mode="clip") + def test_depr_take_kwarg_is_copy(self): + # GH 27357 + df = DataFrame({"A": [1, 2, 3]}) + msg = ( + "is_copy is deprecated and will be removed in a future version. " + "take will always return a copy in the future." + ) + with tm.assert_produces_warning(FutureWarning) as w: + df.take([0, 1], is_copy=True) + + assert w[0].message.args[0] == msg + def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) s2 = s1.copy() @@ -877,10 +908,10 @@ def test_equals(self): # GH 8437 a = pd.Series([False, np.nan]) b = pd.Series([False, np.nan]) - c = pd.Series(index=range(2)) - d = pd.Series(index=range(2)) - e = pd.Series(index=range(2)) - f = pd.Series(index=range(2)) + c = pd.Series(index=range(2), dtype=object) + d = c.copy() + e = c.copy() + f = c.copy() c[:-1] = d[:-1] = e[0] = f[0] = False assert a.equals(a) assert a.equals(b) @@ -919,7 +950,7 @@ def test_pipe_tuple_error(self): @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) def test_axis_classmethods(self, box): - obj = box() + obj = box(dtype=object) values = ( list(box._AXIS_NAMES.keys()) + list(box._AXIS_NUMBERS.keys()) @@ -929,23 +960,3 @@ def test_axis_classmethods(self, box): assert obj._get_axis_number(v) == box._get_axis_number(v) assert obj._get_axis_name(v) == box._get_axis_name(v) assert obj._get_block_manager_axis(v) == box._get_block_manager_axis(v) - - def test_deprecated_to_dense(self): - # GH 26557: DEPR - # Deprecated 0.25.0 - - df = pd.DataFrame({"A": [1, 2, 3]}) - with tm.assert_produces_warning(FutureWarning): - result = df.to_dense() - tm.assert_frame_equal(result, df) - - ser = pd.Series([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - result = ser.to_dense() - tm.assert_series_equal(result, ser) - - def test_deprecated_get_dtype_counts(self): - # GH 18262 - df = DataFrame([1]) - with tm.assert_produces_warning(FutureWarning): - df.get_dtype_counts() diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index ae452e6faef01..8ad8355f2d530 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm from .test_generic import Generic @@ -205,7 +205,7 @@ def finalize(self, other, method=None, **kwargs): def test_to_xarray_index_types(self, index): from xarray import DataArray - index = getattr(tm, "make{}".format(index)) + index = getattr(tm, f"make{index}") s = Series(range(6), index=index(6)) s.index.name = "foo" result = s.to_xarray() @@ -224,7 +224,7 @@ def test_to_xarray_index_types(self, index): def 
test_to_xarray(self): from xarray import DataArray - s = Series([]) + s = Series([], dtype=object) s.index.name = "foo" result = s.to_xarray() assert len(result) == 0 @@ -243,11 +243,6 @@ def test_to_xarray(self): assert isinstance(result, DataArray) tm.assert_series_equal(result.to_series(), s) - def test_valid_deprecated(self): - # GH18800 - with tm.assert_produces_warning(FutureWarning): - pd.Series([]).valid() - @pytest.mark.parametrize( "s", [ diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b56da5fba6f80..0b72a61ed84de 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1,18 +1,17 @@ """ test .agg behavior / note that .apply is tested generally in test_groupby.py """ -from collections import OrderedDict import functools import numpy as np import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +from pandas import DataFrame, Index, MultiIndex, Series, concat +import pandas._testing as tm from pandas.core.base import SpecificationError from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping -import pandas.util.testing as tm def test_agg_regression1(tsframe): @@ -92,6 +91,25 @@ def test_groupby_aggregation_mixed_dtype(): tm.assert_frame_equal(result, expected) +def test_groupby_aggregation_multi_level_column(): + # GH 29772 + lst = [ + [True, True, True, False], + [True, False, np.nan, False], + [True, True, np.nan, False], + [True, True, np.nan, False], + ] + df = pd.DataFrame( + data=lst, + columns=pd.MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]), + ) + + result = df.groupby(level=1, axis=1).sum() + expected = pd.DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]}) + + tm.assert_frame_equal(result, expected) + + def test_agg_apply_corner(ts, tsframe): # nothing to group, all NA grouped = ts.groupby(ts * np.nan) @@ -156,18 +174,14 @@ def test_aggregate_str_func(tsframe, groupbyfunc): tm.assert_frame_equal(result, expected) # group frame by function dict - result = grouped.agg( - OrderedDict([["A", "var"], ["B", "std"], ["C", "mean"], ["D", "sem"]]) - ) + result = grouped.agg({"A": "var", "B": "std", "C": "mean", "D": "sem"}) expected = DataFrame( - OrderedDict( - [ - ["A", grouped["A"].var()], - ["B", grouped["B"].std()], - ["C", grouped["C"].mean()], - ["D", grouped["D"].sem()], - ] - ) + { + "A": grouped["A"].var(), + "B": grouped["B"].std(), + "C": grouped["C"].mean(), + "D": grouped["D"].sem(), + } ) tm.assert_frame_equal(result, expected) @@ -242,22 +256,20 @@ def test_multiple_functions_tuples_and_non_tuples(df): def test_more_flexible_frame_multi_function(df): grouped = df.groupby("A") - exmean = grouped.agg(OrderedDict([["C", np.mean], ["D", np.mean]])) - exstd = grouped.agg(OrderedDict([["C", np.std], ["D", np.std]])) + exmean = grouped.agg({"C": np.mean, "D": np.mean}) + exstd = grouped.agg({"C": np.std, "D": np.std}) expected = concat([exmean, exstd], keys=["mean", "std"], axis=1) expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) - d = OrderedDict([["C", [np.mean, np.std]], ["D", [np.mean, np.std]]]) + d = {"C": [np.mean, np.std], "D": [np.mean, np.std]} result = grouped.aggregate(d) tm.assert_frame_equal(result, expected) # be careful - result = grouped.aggregate(OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]])) - expected = grouped.aggregate( - OrderedDict([["C", np.mean], 
["D", [np.mean, np.std]]]) - ) + result = grouped.aggregate({"C": np.mean, "D": [np.mean, np.std]}) + expected = grouped.aggregate({"C": np.mean, "D": [np.mean, np.std]}) tm.assert_frame_equal(result, expected) def foo(x): @@ -267,16 +279,14 @@ def bar(x): return np.std(x, ddof=1) # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - d = OrderedDict( - [["C", np.mean], ["D", OrderedDict([["foo", np.mean], ["bar", np.std]])]] - ) - result = grouped.aggregate(d) - - d = OrderedDict([["C", [np.mean]], ["D", [foo, bar]]]) - expected = grouped.aggregate(d) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + d = dict([["C", np.mean], ["D", dict([["foo", np.mean], ["bar", np.std]])]]) + grouped.aggregate(d) - tm.assert_frame_equal(result, expected) + # But without renaming, these functions are OK + d = {"C": [np.mean], "D": [foo, bar]} + grouped.aggregate(d) def test_multi_function_flexible_mix(df): @@ -284,30 +294,23 @@ def test_multi_function_flexible_mix(df): grouped = df.groupby("A") # Expected - d = OrderedDict( - [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", {"sum": "sum"}]] - ) + d = {"C": {"foo": "mean", "bar": "std"}, "D": {"sum": "sum"}} # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = grouped.aggregate(d) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) # Test 1 - d = OrderedDict( - [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", "sum"]] - ) + d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"} # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped.aggregate(d) - tm.assert_frame_equal(result, expected) + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) # Test 2 - d = OrderedDict( - [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", ["sum"]]] - ) + d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"} # this uses column selection & renaming - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped.aggregate(d) - tm.assert_frame_equal(result, expected) + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate(d) def test_groupby_agg_coercing_bools(): @@ -350,6 +353,14 @@ def test_uint64_type_handling(dtype, how): tm.assert_frame_equal(result, expected, check_exact=True) +def test_func_duplicates_raises(): + # GH28426 + msg = "Function names" + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + with pytest.raises(SpecificationError, match=msg): + df.groupby("A").agg(["min", "min"]) + + class TestNamedAggregationSeries: def test_series_named_agg(self): df = pd.Series([1, 2, 3, 4]) @@ -361,9 +372,7 @@ def test_series_named_agg(self): tm.assert_frame_equal(result, expected) result = gr.agg(b="min", a="sum") - # sort for 35 and earlier - if compat.PY36: - expected = expected[["b", "a"]] + expected = expected[["b", "a"]] tm.assert_frame_equal(result, expected) def test_no_args_raises(self): @@ -376,12 +385,12 @@ def test_no_args_raises(self): expected = pd.DataFrame() tm.assert_frame_equal(result, expected) - def test_series_named_agg_duplicates_raises(self): - # This is a limitation of the named agg implementation reusing - # aggregate_multiple_funcs. It could maybe be lifted in the future. 
+ def test_series_named_agg_duplicates_no_raises(self): + # GH28426 gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) - with pytest.raises(SpecificationError): - gr.agg(a="sum", b="sum") + grouped = gr.agg(a="sum", b="sum") + expected = pd.DataFrame({"a": [3, 3], "b": [3, 3]}) + tm.assert_frame_equal(expected, grouped) def test_mangled(self): gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) @@ -425,8 +434,6 @@ def test_agg_relabel(self): index=pd.Index(["a", "b"], name="group"), columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"], ) - if not compat.PY36: - expected = expected[["a_98", "a_max", "a_mean", "a_min", "b_max", "b_min"]] tm.assert_frame_equal(result, expected) def test_agg_relabel_non_identifier(self): @@ -440,12 +447,34 @@ def test_agg_relabel_non_identifier(self): ) tm.assert_frame_equal(result, expected) - def test_duplicate_raises(self): - # TODO: we currently raise on multiple lambdas. We could *maybe* - # update com.get_callable_name to append `_i` to each lambda. + def test_duplicate_no_raises(self): + # GH 28426, if use same input function on same column, + # no error should raise df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) - with pytest.raises(SpecificationError, match="Function names"): - df.groupby("A").agg(a=("A", "min"), b=("A", "min")) + + grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min")) + expected = pd.DataFrame( + {"a": [1, 3], "b": [1, 3]}, index=pd.Index([0, 1], name="A") + ) + tm.assert_frame_equal(grouped, expected) + + quant50 = functools.partial(np.percentile, q=50) + quant70 = functools.partial(np.percentile, q=70) + quant50.__name__ = "quant50" + quant70.__name__ = "quant70" + + test = pd.DataFrame( + {"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]} + ) + + grouped = test.groupby("col1").agg( + quantile_50=("col2", quant50), quantile_70=("col2", quant70) + ) + expected = pd.DataFrame( + {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]}, + index=pd.Index(["a", "b"], name="col1"), + ) + tm.assert_frame_equal(grouped, expected) def test_agg_relabel_with_level(self): df = pd.DataFrame( @@ -495,6 +524,86 @@ def test_mangled(self): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3", + [ + ( + (("y", "A"), "max"), + (("y", "A"), np.min), + (("y", "B"), "mean"), + [1, 3], + [0, 2], + [5.5, 7.5], + ), + ( + (("y", "A"), lambda x: max(x)), + (("y", "A"), lambda x: 1), + (("y", "B"), "mean"), + [1, 3], + [1, 1], + [5.5, 7.5], + ), + ( + pd.NamedAgg(("y", "A"), "max"), + pd.NamedAgg(("y", "B"), np.mean), + pd.NamedAgg(("y", "A"), lambda x: 1), + [1, 3], + [5.5, 7.5], + [1, 1], + ), + ], +) +def test_agg_relabel_multiindex_column( + agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3 +): + # GH 29422, add tests for multiindex column cases + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + idx = pd.Index(["a", "b"], name=("x", "group")) + + result = df.groupby(("x", "group")).agg(a_max=(("y", "A"), "max")) + expected = DataFrame({"a_max": [1, 3]}, index=idx) + tm.assert_frame_equal(result, expected) + + result = df.groupby(("x", "group")).agg( + col_1=agg_col1, col_2=agg_col2, col_3=agg_col3 + ) + expected = DataFrame( + {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3}, index=idx + ) + tm.assert_frame_equal(result, expected) + + +def 
test_agg_relabel_multiindex_raises_not_exist(): + # GH 29422, add test for raises senario when aggregate column does not exist + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + + with pytest.raises(KeyError, match="does not exist"): + df.groupby(("x", "group")).agg(a=(("Y", "a"), "max")) + + +def test_agg_relabel_multiindex_duplicates(): + # GH29422, add test for raises senario when getting duplicates + # GH28426, after this change, duplicates should also work if the relabelling is + # different + df = DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) + + result = df.groupby(("x", "group")).agg( + a=(("y", "A"), "min"), b=(("y", "A"), "min") + ) + idx = pd.Index(["a", "b"], name=("x", "group")) + expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx) + tm.assert_frame_equal(result, expected) + + def myfunc(s): return np.percentile(s, q=0.90) @@ -554,9 +663,7 @@ def test_maybe_mangle_lambdas_args(self): assert func["A"][0](0, 2, b=3) == (0, 2, 3) def test_maybe_mangle_lambdas_named(self): - func = OrderedDict( - [("C", np.mean), ("D", OrderedDict([("foo", np.mean), ("bar", np.mean)]))] - ) + func = {"C": np.mean, "D": {"foo": np.mean, "bar": np.mean}} result = _maybe_mangle_lambdas(func) assert result == func @@ -598,10 +705,7 @@ def test_agg_with_one_lambda(self): } ) - # sort for 35 and earlier columns = ["height_sqr_min", "height_max", "weight_max"] - if compat.PY35: - columns = ["height_max", "height_sqr_min", "weight_max"] expected = pd.DataFrame( { "height_sqr_min": [82.81, 36.00], @@ -640,7 +744,6 @@ def test_agg_multiple_lambda(self): "weight": [7.9, 7.5, 9.9, 198.0], } ) - # sort for 35 and earlier columns = [ "height_sqr_min", "height_max", @@ -648,14 +751,6 @@ def test_agg_multiple_lambda(self): "height_max_2", "weight_min", ] - if compat.PY35: - columns = [ - "height_max", - "height_max_2", - "height_sqr_min", - "weight_max", - "weight_min", - ] expected = pd.DataFrame( { "height_sqr_min": [82.81, 36.00], diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5d50c044cf9f5..5ddda264642de 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range +import pandas._testing as tm from pandas.core.groupby.groupby import DataError -import pandas.util.testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 5dad868c8c3aa..52ee3e652501c 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -2,7 +2,6 @@ test all other .agg behavior """ -from collections import OrderedDict import datetime as dt from functools import partial @@ -19,15 +18,15 @@ date_range, period_range, ) +import pandas._testing as tm from pandas.core.base import SpecificationError -import pandas.util.testing as tm from pandas.io.formats.printing import pprint_thing def test_agg_api(): # GH 6337 - # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error + # https://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api 
for agg when passed custom function with mixed frame df = DataFrame( @@ -96,8 +95,7 @@ def test_agg_period_index(): index = period_range(start="1999-01", periods=5, freq="M") s1 = Series(np.random.rand(len(index)), index=index) s2 = Series(np.random.rand(len(index)), index=index) - series = [("s1", s1), ("s2", s2)] - df = DataFrame.from_dict(OrderedDict(series)) + df = DataFrame.from_dict({"s1": s1, "s2": s2}) grouped = df.groupby(df.index.month) list(grouped) @@ -211,31 +209,26 @@ def test_aggregate_api_consistency(): expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) - expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) - expected.columns = MultiIndex.from_product([["r", "r2"], ["D", "C"]]) - tm.assert_frame_equal(result, expected, check_like=True) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) def test_agg_dict_renaming_deprecation(): # 15931 df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): df.groupby("A").agg( {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} ) - assert "using a dict with renaming" in str(w[0].message) - assert "named aggregation" in str(w[0].message) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(SpecificationError, match=msg): df.groupby("A")[["B", "C"]].agg({"ma": "max"}) - with tm.assert_produces_warning(FutureWarning) as w: + with pytest.raises(SpecificationError, match=msg): df.groupby("A").B.agg({"foo": "count"}) - assert "using a dict on a Series for aggregation" in str(w[0].message) - assert "named aggregation instead." 
in str(w[0].message) def test_agg_compat(): @@ -251,18 +244,12 @@ def test_agg_compat(): g = df.groupby(["A", "B"]) - expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) - expected.columns = MultiIndex.from_tuples([("C", "sum"), ("C", "std")]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g["D"].agg({"C": ["sum", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) - - expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) - expected.columns = ["C", "D"] + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": ["sum", "std"]}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g["D"].agg({"C": "sum", "D": "std"}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"C": "sum", "D": "std"}) def test_agg_nested_dicts(): @@ -278,29 +265,20 @@ def test_agg_nested_dicts(): g = df.groupby(["A", "B"]) - msg = r"cannot perform renaming for r[1-2] with a nested dictionary" + msg = r"nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) - expected = pd.concat( - [g["C"].mean(), g["C"].std(), g["D"].mean(), g["D"].std()], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] - ) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) # same name as the original column # GH9052 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = g["D"].agg({"result1": np.sum, "result2": np.mean}) - expected = expected.rename(columns={"result1": "D"}) + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"result1": np.sum, "result2": np.mean}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g["D"].agg({"D": np.sum, "result2": np.mean}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + g["D"].agg({"D": np.sum, "result2": np.mean}) def test_agg_item_by_item_raise_typeerror(): @@ -454,6 +432,31 @@ def test_agg_over_numpy_arrays(): tm.assert_frame_equal(result, expected) +def test_agg_tzaware_non_datetime_result(): + # discussed in GH#29589, fixed in GH#29641, operating on tzaware values + # with function that is not dtype-preserving + dti = pd.date_range("2012-01-01", periods=4, tz="UTC") + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti}) + gb = df.groupby("a") + + # Case that _does_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0]) + expected = pd.Series(dti[::2], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + # Cases that do _not_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0].year) + expected = pd.Series([2012, 2012], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0]) + expected = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + def 
test_agg_timezone_round_trip(): # GH 15426 ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") @@ -470,8 +473,7 @@ def test_agg_timezone_round_trip(): assert result3 == ts dates = [ - pd.Timestamp("2016-01-0{i:d} 12:00:00".format(i=i), tz="US/Pacific") - for i in range(1, 5) + pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5) ] df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates}) grouped = df.groupby("A") @@ -602,3 +604,41 @@ def test_agg_lambda_with_timezone(): columns=["date"], ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "err_cls", + [ + NotImplementedError, + RuntimeError, + KeyError, + IndexError, + OSError, + ValueError, + ArithmeticError, + AttributeError, + ], +) +def test_groupby_agg_err_catching(err_cls): + # make sure we suppress anything other than TypeError or AssertionError + # in _python_agg_general + + # Use a non-standard EA to make sure we don't go down ndarray paths + from pandas.tests.extension.decimal.array import DecimalArray, make_data, to_decimal + + data = make_data()[:5] + df = pd.DataFrame( + {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)} + ) + + expected = pd.Series(to_decimal([data[0], data[3]])) + + def weird_func(x): + # weird function that raise something other than TypeError or IndexError + # in _python_agg_general + if len(x) == 0: + raise err_cls + return x.iloc[0] + + result = df["decimals"].groupby(df["id1"]).agg(weird_func) + tm.assert_series_equal(result, expected, check_names=False) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index af98f9efe2af9..8901af7a90acc 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,8 +2,8 @@ import pytest from pandas import DataFrame, MultiIndex -from pandas.core.groupby.base import reduction_kernels -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.groupby.base import reduction_kernels, transformation_kernels @pytest.fixture @@ -110,3 +110,15 @@ def reduction_func(request): """yields the string names of all groupby reduction functions, one at a time. 
""" return request.param + + +@pytest.fixture(params=transformation_kernels) +def transformation_func(request): + """yields the string names of all groupby transformation functions.""" + return request.param + + +@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels)) +def groupby_func(request): + """yields both aggregation and transformation functions.""" + return request.param diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0e62569fffeb6..2f2f97f2cd993 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, bdate_range -import pandas.util.testing as tm +import pandas._testing as tm def test_apply_issues(): @@ -265,7 +265,7 @@ def desc3(group): result = group.describe() # names are different - result.index.name = "stat_{:d}".format(len(group)) + result.index.name = f"stat_{len(group):d}" result = result[: len(group)] # weirdo @@ -686,6 +686,17 @@ def test_apply_with_mixed_types(): tm.assert_frame_equal(result, expected) +def test_func_returns_object(): + # GH 28652 + df = DataFrame({"a": [1, 2]}, index=pd.Int64Index([1, 2])) + result = df.groupby("a").apply(lambda g: g.index) + expected = Series( + [pd.Int64Index([1]), pd.Int64Index([2])], index=pd.Int64Index([1, 2], name="a") + ) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], @@ -703,3 +714,41 @@ def test_apply_datetime_issue(group_column_dtlike): ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] ) tm.assert_frame_equal(result, expected) + + +def test_apply_series_return_dataframe_groups(): + # GH 10078 + tdf = DataFrame( + { + "day": { + 0: pd.Timestamp("2015-02-24 00:00:00"), + 1: pd.Timestamp("2015-02-24 00:00:00"), + 2: pd.Timestamp("2015-02-24 00:00:00"), + 3: pd.Timestamp("2015-02-24 00:00:00"), + 4: pd.Timestamp("2015-02-24 00:00:00"), + }, + "userAgent": { + 0: "some UA string", + 1: "some UA string", + 2: "some UA string", + 3: "another UA string", + 4: "some UA string", + }, + "userId": { + 0: "17661101", + 1: "17661101", + 2: "17661101", + 3: "17661101", + 4: "17661101", + }, + } + ) + + def most_common_values(df): + return Series({c: s.value_counts().index[0] for c, s in df.iteritems()}) + + result = tdf.groupby("day").apply(most_common_values)["userId"] + expected = pd.Series( + ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 0e7a66769d2d4..ad71f73e80e64 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.common import ensure_int64 from pandas import Index, Series, isna -import pandas.util.testing as tm +import pandas._testing as tm def test_series_grouper(): @@ -25,6 +25,16 @@ def test_series_grouper(): tm.assert_almost_equal(counts, exp_counts) +def test_series_grouper_requires_nonempty_raises(): + # GH#29500 + obj = Series(np.random.randn(10)) + dummy = obj[:0] + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + + with pytest.raises(ValueError, match="SeriesGrouper requires non-empty `series`"): + libreduction.SeriesGrouper(dummy, np.mean, labels, 2, dummy) + + def test_series_bin_grouper(): obj = 
Series(np.random.randn(10)) dummy = obj[:0] @@ -77,7 +87,7 @@ def _check(dtype): counts = np.zeros(len(out), dtype=np.int64) labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) - func = getattr(groupby, "group_ohlc_{dtype}".format(dtype=dtype)) + func = getattr(groupby, f"group_ohlc_{dtype}") func(out, counts, obj[:, None], labels) def _ohlc(group): @@ -106,15 +116,16 @@ class TestMoments: class TestReducer: def test_int_index(self): arr = np.random.randn(100, 4) - result = libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) - expected = arr.sum(0) - tm.assert_almost_equal(result, expected) - result = libreduction.compute_reduction( - arr, np.sum, axis=1, labels=Index(np.arange(100)) - ) - expected = arr.sum(1) - tm.assert_almost_equal(result, expected) + msg = "Must pass either dummy and labels, or neither" + # we must pass either both labels and dummy, or neither + with pytest.raises(ValueError, match=msg): + libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) + + with pytest.raises(ValueError, match=msg): + libreduction.compute_reduction( + arr, np.sum, axis=1, labels=Index(np.arange(100)) + ) dummy = Series(0.0, index=np.arange(100)) result = libreduction.compute_reduction( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index ad5d2315f7e33..9323946581a0d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import datetime import numpy as np @@ -16,7 +15,7 @@ Series, qcut, ) -import pandas.util.testing as tm +import pandas._testing as tm def cartesian_product_for_groupers(result, args, names): @@ -498,10 +497,10 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): aggr[aggr.isna()] = "missing" if not all(label == aggr): msg = ( - "Labels and aggregation results not consistently sorted\n" - + "for (ordered={}, observed={}, sort={})\n" - + "Result:\n{}" - ).format(ordered, observed, sort, result) + f"Labels and aggregation results not consistently sorted\n" + + "for (ordered={ordered}, observed={observed}, sort={sort})\n" + + "Result:\n{result}" + ) assert False, msg @@ -781,16 +780,32 @@ def test_categorical_no_compress(): tm.assert_numpy_array_equal(result, exp) +def test_groupby_empty_with_category(): + # GH-9614 + # test fix for when group by on None resulted in + # coercion of dtype categorical -> float + df = pd.DataFrame( + {"A": [None] * 3, "B": pd.Categorical(["train", "train", "test"])} + ) + result = df.groupby("A").first()["B"] + expected = pd.Series( + pd.Categorical([], categories=["test", "train"]), + index=pd.Series([], dtype="object", name="A"), + name="B", + ) + tm.assert_series_equal(result, expected) + + def test_sort(): - # http://stackoverflow.com/questions/23814368/sorting-pandas- + # https://stackoverflow.com/questions/23814368/sorting-pandas- # categorical-labels-after-groupby # This should result in a properly sorted Series so that the plot # has a sorted x axis # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') df = DataFrame({"value": np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=["value"], ascending=True) @@ -1111,7 +1126,7 @@ def test_seriesgroupby_observed_true(df_cat, operation, kwargs): index = 
MultiIndex.from_frame( DataFrame( {"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]}, - **kwargs + **kwargs, ) ) expected = Series(data=[1, 3, 2, 4], index=index, name="C") @@ -1188,11 +1203,18 @@ def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): # GH 24880 expected = Series(data=data, index=index, name="C") result = df_cat.groupby(["A", "B"], observed=observed)["C"].apply( - lambda x: OrderedDict([("min", x.min()), ("max", x.max())]) + lambda x: {"min": x.min(), "max": x.max()} ) tm.assert_series_equal(result, expected) +def test_groupby_categorical_series_dataframe_consistent(df_cat): + # GH 20416 + expected = df_cat.groupby(["A", "B"])["C"].mean() + result = df_cat.groupby(["A", "B"]).mean()["C"] + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])]) def test_groupby_categorical_axis_1(code): # GH 13420 @@ -1229,3 +1251,94 @@ def test_get_nonexistent_category(): {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) ) + + +def test_series_groupby_on_2_categoricals_unobserved( + reduction_func: str, observed: bool +): + # GH 17605 + + if reduction_func == "ngroup": + pytest.skip("ngroup is not truly a reduction") + + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), + "value": [0.1] * 4, + } + ) + args = {"nth": [0]}.get(reduction_func, []) + + expected_length = 4 if observed else 16 + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"] + agg = getattr(series_groupby, reduction_func) + result = agg(*args) + + assert len(result) == expected_length + + +@pytest.mark.parametrize( + "func, zero_or_nan", + [ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", np.NaN), + ("var", np.NaN), + ], +) +def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan): + # GH 17605 + # Tests whether the unobserved categories in the result contain 0 or NaN + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + } + ) + unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")] + args = {"nth": [0]}.get(func, []) + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] + agg = getattr(series_groupby, func) + result = agg(*args) + + for idx in unobserved: + val = result.loc[idx] + assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + + # If we expect unobserved values to be zero, we also expect the dtype to be int + if zero_or_nan == 0: + assert np.issubdtype(result.dtype, np.integer) + + +def test_series_groupby_categorical_aggregation_getitem(): + # GH 8870 + d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} + df = pd.DataFrame(d) + cat = pd.cut(df["foo"], np.linspace(0, 20, 5)) + df["range"] = cat + groups = df.groupby(["range", "baz"], as_index=True, sort=True) + result = groups["foo"].agg("mean") + expected = groups.agg("mean")["foo"] + tm.assert_series_equal(result, expected) diff 
--git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 9882f12714d2d..b4239d7d34a90 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestCounting: @@ -20,7 +20,7 @@ def test_cumcount(self): def test_cumcount_empty(self): ge = DataFrame().groupby(level=0) - se = Series().groupby(level=0) + se = Series(dtype=object).groupby(level=0) # edge case, as this is usually considered float e = Series(dtype="int64") @@ -95,7 +95,7 @@ def test_ngroup_one_group(self): def test_ngroup_empty(self): ge = DataFrame().groupby(level=0) - se = Series().groupby(level=0) + se = Series(dtype=object).groupby(level=0) # edge case, as this is usually considered float e = Series(dtype="int64") @@ -197,11 +197,8 @@ def test_ngroup_respects_groupby_order(self): @pytest.mark.parametrize( "datetimelike", [ - [ - Timestamp("2016-05-{i:02d} 20:09:25+00:00".format(i=i)) - for i in range(1, 4) - ], - [Timestamp("2016-05-{i:02d} 20:09:25".format(i=i)) for i in range(1, 4)], + [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)], + [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)], [Timedelta(x, unit="h") for x in range(1, 4)], [Period(freq="2W", year=2017, month=x) for x in range(1, 4)], ], diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 2ce04fc774083..c16ad812eb634 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm def test_filter_series(): @@ -593,5 +593,5 @@ def test_filter_dropna_with_empty_groups(): tm.assert_series_equal(result_false, expected_false) result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) - expected_true = pd.Series(index=pd.Index([], dtype=int)) + expected_true = pd.Series(index=pd.Index([], dtype=int), dtype=np.float64) tm.assert_series_equal(result_true, expected_true) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 2d7dfe49dc038..97cf1af1d2e9e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -20,8 +20,9 @@ date_range, isna, ) +import pandas._testing as tm import pandas.core.nanops as nanops -from pandas.util import _test_decorators as td, testing as tm +from pandas.util import _test_decorators as td @pytest.mark.parametrize("agg_func", ["any", "all"]) @@ -102,9 +103,7 @@ def test_builtins_apply(keys, f): result = df.groupby(keys).apply(f) ngroups = len(df.drop_duplicates(subset=keys)) - assert_msg = "invalid frame shape: {} (expected ({}, 3))".format( - result.shape, ngroups - ) + assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" assert result.shape == (ngroups, 3), assert_msg tm.assert_frame_equal( @@ -607,6 +606,51 @@ def test_nlargest(): tm.assert_series_equal(gb.nlargest(3, keep="last"), e) +def test_nlargest_mi_grouper(): + # see gh-21411 + npr = np.random.RandomState(123456789) + + dts = date_range("20180101", periods=10) + iterables = [dts, ["one", "two"]] + + idx = MultiIndex.from_product(iterables, names=["first", "second"]) + s = Series(npr.randn(20), index=idx) + + result = s.groupby("first").nlargest(1) + + exp_idx = 
MultiIndex.from_tuples( + [ + (dts[0], dts[0], "one"), + (dts[1], dts[1], "one"), + (dts[2], dts[2], "one"), + (dts[3], dts[3], "two"), + (dts[4], dts[4], "one"), + (dts[5], dts[5], "one"), + (dts[6], dts[6], "one"), + (dts[7], dts[7], "one"), + (dts[8], dts[8], "two"), + (dts[9], dts[9], "one"), + ], + names=["first", "first", "second"], + ) + + exp_values = [ + 2.2129019979039612, + 1.8417114045748335, + 0.858963679564603, + 1.3759151378258088, + 0.9430284594687134, + 0.5296914208183142, + 0.8318045593815487, + -0.8476703342910327, + 0.3804446884133735, + -0.8028845810770998, + ] + + expected = Series(exp_values, index=exp_idx) + tm.assert_series_equal(result, expected, check_exact=False, check_less_precise=True) + + def test_nsmallest(): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) b = Series(list("a" * 5 + "b" * 5)) @@ -1002,7 +1046,7 @@ def test_nunique_with_object(): def test_nunique_with_empty_series(): # GH 12553 - data = pd.Series(name="name") + data = pd.Series(name="name", dtype=object) result = data.groupby(level=0).nunique() expected = pd.Series(name="name", dtype="int64") tm.assert_series_equal(result, expected) @@ -1255,8 +1299,8 @@ def test_size_groupby_all_null(): ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), # Timestamps ( - [x for x in pd.date_range("1/1/18", freq="D", periods=5)], - [x for x in pd.date_range("1/1/18", freq="D", periods=5)][::-1], + list(pd.date_range("1/1/18", freq="D", periods=5)), + list(pd.date_range("1/1/18", freq="D", periods=5))[::-1], ), # All NA ([np.nan] * 5, [np.nan] * 5), @@ -1353,6 +1397,35 @@ def test_quantile_array_multiple_levels(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) +@pytest.mark.parametrize("groupby", [[0], [0, 1]]) +@pytest.mark.parametrize("q", [[0.5, 0.6]]) +def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): + # GH30289 + nrow, ncol = frame_size + df = pd.DataFrame( + np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol) + ) + + idx_levels = [list(range(min(nrow, 4)))] * len(groupby) + [q] + idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ + list(range(len(q))) * min(nrow, 4) + ] + expected_index = pd.MultiIndex( + levels=idx_levels, codes=idx_codes, names=groupby + [None] + ) + expected_values = [ + [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q + ] + expected_columns = [x for x in range(ncol) if x not in groupby] + expected = pd.DataFrame( + expected_values, index=expected_index, columns=expected_columns + ) + result = df.groupby(groupby).quantile(q) + + tm.assert_frame_equal(result, expected) + + def test_quantile_raises(): df = pd.DataFrame( [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e17181f55fdba..7e374811d1960 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import datetime from decimal import Decimal from io import StringIO @@ -10,8 +9,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv +import pandas._testing as tm +from pandas.core.base import SpecificationError import pandas.core.common as com -import pandas.util.testing as tm def test_repr(): @@ -55,8 +55,9 @@ def test_basic(dtype): # complex agg agged = grouped.aggregate([np.mean, np.std]) - 
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - agged = grouped.aggregate({"one": np.mean, "two": np.std}) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped.aggregate({"one": np.mean, "two": np.std}) group_constants = {0: 10, 1: 20, 2: 30} agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) @@ -452,9 +453,9 @@ def test_frame_set_name_single(df): result = grouped["C"].agg([np.mean, np.std]) assert result.index.name == "A" - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped["C"].agg({"foo": np.mean, "bar": np.std}) - assert result.index.name == "A" + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped["C"].agg({"foo": np.mean, "bar": np.std}) def test_multi_func(df): @@ -587,6 +588,20 @@ def test_groupby_multiple_columns(df, op): tm.assert_series_equal(result, expected) +def test_as_index_select_column(): + # GH 5764 + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + result = df.groupby("A", as_index=False)["B"].get_group(1) + expected = pd.Series([2, 4], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum()) + expected = pd.Series( + [2, 6, 6], name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) + ) + tm.assert_series_equal(result, expected) + + def test_groupby_as_index_agg(df): grouped = df.groupby("A", as_index=False) @@ -596,18 +611,16 @@ def test_groupby_as_index_agg(df): expected = grouped.mean() tm.assert_frame_equal(result, expected) - result2 = grouped.agg(OrderedDict([["C", np.mean], ["D", np.sum]])) + result2 = grouped.agg({"C": np.mean, "D": np.sum}) expected2 = grouped.mean() expected2["D"] = grouped.sum()["D"] tm.assert_frame_equal(result2, expected2) grouped = df.groupby("A", as_index=True) - expected3 = grouped["C"].sum() - expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result3 = grouped["C"].agg({"Q": np.sum}) - tm.assert_frame_equal(result3, expected3) + msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + grouped["C"].agg({"Q": np.sum}) # multi-key @@ -617,7 +630,7 @@ def test_groupby_as_index_agg(df): expected = grouped.mean() tm.assert_frame_equal(result, expected) - result2 = grouped.agg(OrderedDict([["C", np.mean], ["D", np.sum]])) + result2 = grouped.agg({"C": np.mean, "D": np.sum}) expected2 = grouped.mean() expected2["D"] = grouped.sum()["D"] tm.assert_frame_equal(result2, expected2) @@ -772,7 +785,7 @@ def test_omit_nuisance(df): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = r"unsupported operand type\(s\) for \+: 'Timestamp'" + msg = "reduction operation 'sum' not allowed for this dtype" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) @@ -922,7 +935,7 @@ def test_mutate_groups(): + ["c"] * 2 + ["d"] * 2 + ["e"] * 2, - "cat3": ["g{}".format(x) for x in range(1, 15)], + "cat3": [f"g{x}" for x in range(1, 15)], "val": np.random.randint(100, size=14), } ) @@ -1703,13 +1716,20 @@ def test_group_shift_with_fill_value(): tm.assert_frame_equal(result, expected) +def test_group_shift_lose_timezone(): + # GH 30134 + now_dt = pd.Timestamp.utcnow() + df = DataFrame({"a": [1, 1], "date": now_dt}) + result = df.groupby("a").shift(0).iloc[0] + expected = 
Series({"date": now_dt}, name=result.name) + tm.assert_series_equal(result, expected) + + def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = pd.DataFrame( { - "eventDate": pd.date_range( - pd.datetime.today(), periods=20, freq="M" - ).tolist(), + "eventDate": pd.date_range(datetime.today(), periods=20, freq="M").tolist(), "thename": range(0, 20), } ) @@ -1734,34 +1754,23 @@ def test_empty_dataframe_groupby(): tm.assert_frame_equal(result, expected) -def test_tuple_warns(): +def test_tuple_as_grouping(): # https://github.com/pandas-dev/pandas/issues/18314 df = pd.DataFrame( { - ("a", "b"): [1, 1, 2, 2], - "a": [1, 1, 1, 2], - "b": [1, 2, 2, 2], + ("a", "b"): [1, 1, 1, 1], + "a": [2, 2, 2, 2], + "b": [2, 2, 2, 2], "c": [1, 1, 1, 1], } ) - with tm.assert_produces_warning(FutureWarning) as w: - df[["a", "b", "c"]].groupby(("a", "b")).c.mean() - - assert "Interpreting tuple 'by' as a list" in str(w[0].message) - - with tm.assert_produces_warning(None): - df.groupby(("a", "b")).c.mean() - - -def test_tuple_warns_unhashable(): - # https://github.com/pandas-dev/pandas/issues/18314 - business_dates = date_range(start="4/1/2014", end="6/30/2014", freq="B") - df = DataFrame(1, index=business_dates, columns=["a", "b"]) - with tm.assert_produces_warning(FutureWarning) as w: - df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) + with pytest.raises(KeyError): + df[["a", "b", "c"]].groupby(("a", "b")) - assert "Interpreting tuple 'by' as a list" in str(w[0].message) + result = df.groupby(("a", "b"))["c"].sum() + expected = pd.Series([4], name="c", index=pd.Index([1], name=("a", "b"))) + tm.assert_series_equal(result, expected) def test_tuple_correct_keyerror(): @@ -1951,3 +1960,73 @@ def test_groupby_only_none_group(): expected = pd.Series([np.nan], name="x") tm.assert_series_equal(actual, expected) + + +def test_groupby_duplicate_index(): + # GH#29189 the groupby call here used to raise + ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) + gb = ser.groupby(level=0) + + result = gb.mean() + expected = pd.Series([2, 5.5, 8], index=[2.0, 4.0, 5.0]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("bool_agg_func", ["any", "all"]) +def test_bool_aggs_dup_column_labels(bool_agg_func): + # 21668 + df = pd.DataFrame([[True, True]], columns=["a", "a"]) + grp_by = df.groupby([0]) + result = getattr(grp_by, bool_agg_func)() + + expected = df + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "idx", [pd.Index(["a", "a"]), pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")))] +) +def test_dup_labels_output_shape(groupby_func, idx): + if groupby_func in {"size", "ngroup", "cumcount"}: + pytest.skip("Not applicable") + + df = pd.DataFrame([[1, 1]], columns=idx) + grp_by = df.groupby([0]) + + args = [] + if groupby_func in {"fillna", "nth"}: + args.append(0) + elif groupby_func == "corrwith": + args.append(df) + elif groupby_func == "tshift": + df.index = [pd.Timestamp("today")] + args.extend([1, "D"]) + + result = getattr(grp_by, groupby_func)(*args) + + assert result.shape == (1, 2) + tm.assert_index_equal(result.columns, idx) + + +def test_groupby_crash_on_nunique(axis): + # Fix following 30253 + df = pd.DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}) + + axis_number = df._get_axis_number(axis) + if not axis_number: + df = df.T + + result = df.groupby(axis=axis_number, level=0).nunique() + + expected = pd.DataFrame({"A": [1, 2], "D": [1, 1]}) + if not axis_number: + 
expected = expected.T + + tm.assert_frame_equal(result, expected) + + +def test_groupby_list_level(): + # GH 9790 + expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3)) + result = expected.groupby(level=[0]).mean() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e1fd8d7da6833..70ba21d89d22f 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -13,8 +13,8 @@ Timestamp, date_range, ) +import pandas._testing as tm from pandas.core.groupby.grouper import Grouping -import pandas.util.testing as tm # selection # -------------------------------- @@ -71,14 +71,12 @@ def test_getitem_list_of_columns(self): ) result = df.groupby("A")[["C", "D"]].mean() - result2 = df.groupby("A")["C", "D"].mean() - result3 = df.groupby("A")[df.columns[2:4]].mean() + result2 = df.groupby("A")[df.columns[2:4]].mean() expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean() tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) def test_getitem_numeric_column_names(self): # GH #13731 @@ -91,14 +89,40 @@ def test_getitem_numeric_column_names(self): } ) result = df.groupby(0)[df.columns[1:3]].mean() - result2 = df.groupby(0)[2, 4].mean() - result3 = df.groupby(0)[[2, 4]].mean() + result2 = df.groupby(0)[[2, 4]].mean() expected = df.loc[:, [0, 2, 4]].groupby(0).mean() tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) + + # per GH 23566 this should raise a FutureWarning + with tm.assert_produces_warning(FutureWarning): + df.groupby(0)[2, 4].mean() + + def test_getitem_single_list_of_columns(self, df): + # per GH 23566 this should raise a FutureWarning + with tm.assert_produces_warning(FutureWarning): + df.groupby("A")["C", "D"].mean() + + def test_getitem_single_column(self): + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + "E": np.random.randn(8), + } + ) + + result = df.groupby("A")["C"].mean() + + as_frame = df.loc[:, ["A", "C"]].groupby("A").mean() + as_series = as_frame.iloc[:, 0] + expected = as_series + + tm.assert_series_equal(result, expected) # grouping @@ -501,15 +525,17 @@ def test_groupby_level(self, sort, mframe, df): with pytest.raises(ValueError, match=msg): df.groupby(level=1) - def test_groupby_level_index_names(self): + def test_groupby_level_index_names(self, axis): # GH4014 this used to raise ValueError since 'exp'>1 (in py2) df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index( "exp" ) - df.groupby(level="exp") - msg = "level name foo is not the name of the index" + if axis in (1, "columns"): + df = df.T + df.groupby(level="exp", axis=axis) + msg = f"level name foo is not the name of the {df._get_axis_name(axis)}" with pytest.raises(ValueError, match=msg): - df.groupby(level="foo") + df.groupby(level="foo", axis=axis) @pytest.mark.parametrize("sort", [True, False]) def test_groupby_level_with_nas(self, sort): @@ -559,12 +585,12 @@ def test_level_preserve_order(self, sort, labels, mframe): # GH 17537 grouped = mframe.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) - tm.assert_almost_equal(grouped.grouper.labels[0], exp_labels) + tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) def test_grouping_labels(self, 
mframe): grouped = mframe.groupby(mframe.index.get_level_values(0)) exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) - tm.assert_almost_equal(grouped.grouper.labels[0], exp_labels) + tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) def test_list_grouper_with_nat(self): # GH 14715 @@ -585,9 +611,18 @@ def test_list_grouper_with_nat(self): @pytest.mark.parametrize( "func,expected", [ - ("transform", pd.Series(name=2, index=pd.RangeIndex(0, 0, 1))), - ("agg", pd.Series(name=2, index=pd.Float64Index([], name=1))), - ("apply", pd.Series(name=2, index=pd.Float64Index([], name=1))), + ( + "transform", + pd.Series(name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)), + ), + ( + "agg", + pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), + ), + ( + "apply", + pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), + ), ], ) def test_evaluate_with_empty_groups(self, func, expected): @@ -602,7 +637,7 @@ def test_evaluate_with_empty_groups(self, func, expected): def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 - s = pd.Series([], name="name") + s = pd.Series([], name="name", dtype="float64") gr = s.groupby([]) result = gr.mean() @@ -731,7 +766,7 @@ def test_get_group_grouped_by_tuple(self): def test_groupby_with_empty(self): index = pd.DatetimeIndex(()) data = () - series = pd.Series(data, index) + series = pd.Series(data, index, dtype=object) grouper = pd.Grouper(freq="D") grouped = series.groupby(grouper) assert next(iter(grouped), None) is None diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index f5c8873ff9417..971a447b84cae 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[["inner"], ["inner", "outer"]]) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index f83b284a35377..0f850f2e94581 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna -import pandas.util.testing as tm +import pandas._testing as tm def test_first_last_nth(df): @@ -89,6 +89,25 @@ def test_first_last_nth_dtypes(df_mixed_floats): assert f.dtype == "int64" +def test_first_strings_timestamps(): + # GH 11244 + test = pd.DataFrame( + { + pd.Timestamp("2012-01-01 00:00:00"): ["a", "b"], + pd.Timestamp("2012-01-02 00:00:00"): ["c", "d"], + "name": ["e", "e"], + "aaaa": ["f", "g"], + } + ) + result = test.groupby("name").first() + expected = DataFrame( + [["a", "c", "f"]], + columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), + index=Index(["e"], name="name"), + ) + tm.assert_frame_equal(result, expected) + + def test_nth(): df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) g = df.groupby("A") diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 8f0df9051fc73..3461bf6e10662 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Series, concat +import pandas._testing as tm from pandas.core.base import DataError -import pandas.util.testing as tm def test_rank_apply(): diff --git a/pandas/tests/groupby/test_timegrouper.py 
b/pandas/tests/groupby/test_timegrouper.py index 109382d97440e..6b8bd9e805a0c 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -9,9 +9,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range +import pandas._testing as tm from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper -import pandas.util.testing as tm class TestGroupBy: diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index db44a4a57230c..6c05c4038a829 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -18,8 +18,8 @@ concat, date_range, ) +import pandas._testing as tm from pandas.core.groupby.groupby import DataError -import pandas.util.testing as tm def assert_fp_equal(a, b): @@ -319,7 +319,7 @@ def test_dispatch_transform(tsframe): def test_transform_select_columns(df): f = lambda x: x.mean() - result = df.groupby("A")["C", "D"].transform(f) + result = df.groupby("A")[["C", "D"]].transform(f) selection = df[["C", "D"]] expected = selection.groupby(df["A"]).transform(f) @@ -765,9 +765,12 @@ def test_transform_with_non_scalar_group(): ], ) @pytest.mark.parametrize("agg_func", ["count", "rank", "size"]) -def test_transform_numeric_ret(cols, exp, comp_func, agg_func): +def test_transform_numeric_ret(cols, exp, comp_func, agg_func, request): if agg_func == "size" and isinstance(cols, list): - pytest.xfail("'size' transformation not supported with NDFrameGroupy") + # https://github.com/pytest-dev/pytest/issues/6300 + # workaround to xfail fixture/param permutations + reason = "'size' transformation not supported with NDFrameGroupy" + request.node.add_marker(pytest.mark.xfail(reason=reason)) # GH 19200 df = pd.DataFrame( @@ -874,27 +877,19 @@ def test_pad_stable_sorting(fill_method): ), ], ) -@pytest.mark.parametrize( - "periods,fill_method,limit", - [ - (1, "ffill", None), - (1, "ffill", 1), - (1, "bfill", None), - (1, "bfill", 1), - (-1, "ffill", None), - (-1, "ffill", 1), - (-1, "bfill", None), - (-1, "bfill", 1), - ], -) +@pytest.mark.parametrize("periods", [1, -1]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None]) +@pytest.mark.parametrize("limit", [None, 1]) def test_pct_change(test_series, freq, periods, fill_method, limit): - # GH 21200, 21621 + # GH 21200, 21621, 30463 vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ["a", "b"] key_v = np.repeat(keys, len(vals)) df = DataFrame({"key": key_v, "vals": vals * 2}) - df_g = getattr(df.groupby("key"), fill_method)(limit=limit) + df_g = df + if fill_method is not None: + df_g = getattr(df.groupby("key"), fill_method)(limit=limit) grp = df_g.groupby(df.key) expected = grp["vals"].obj / grp["vals"].shift(periods) - 1 @@ -911,6 +906,41 @@ def test_pct_change(test_series, freq, periods, fill_method, limit): tm.assert_frame_equal(result, expected.to_frame("vals")) +@pytest.mark.parametrize( + "func, expected_status", + [ + ("ffill", ["shrt", "shrt", "lng", np.nan, "shrt", "ntrl", "ntrl"]), + ("bfill", ["shrt", "lng", "lng", "shrt", "shrt", "ntrl", np.nan]), + ], +) +def test_ffill_bfill_non_unique_multilevel(func, expected_status): + # GH 19437 + date = pd.to_datetime( + [ + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-01", + "2018-01-02", + "2018-01-01", + "2018-01-02", + ] + ) + symbol = ["MSFT", "MSFT", "MSFT", "AAPL", "AAPL", "TSLA", "TSLA"] + status = ["shrt", np.nan, "lng", np.nan, "shrt", "ntrl", np.nan] + 
+ df = DataFrame({"date": date, "symbol": symbol, "status": status}) + df = df.set_index(["date", "symbol"]) + result = getattr(df.groupby("symbol")["status"], func)() + + index = MultiIndex.from_tuples( + tuples=list(zip(*[date, symbol])), names=["date", "symbol"] + ) + expected = Series(expected_status, index=index, name="status") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", [np.any, np.all]) def test_any_all_np_func(func): # GH 20653 @@ -932,9 +962,7 @@ def demean_rename(x): if isinstance(x, pd.Series): return result - result = result.rename( - columns={c: "{}_demeaned".format(c) for c in result.columns} - ) + result = result.rename(columns={c: f"{c}_demeaned" for c in result.columns}) return result @@ -1073,3 +1101,70 @@ def test_transform_lambda_with_datetimetz(): name="time", ) tm.assert_series_equal(result, expected) + + +def test_transform_fastpath_raises(): + # GH#29631 case where fastpath defined in groupby.generic _choose_path + # raises, but slow_path does not + + df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]}) + gb = df.groupby("A") + + def func(grp): + # we want a function such that func(frame) fails but func.apply(frame) + # works + if grp.ndim == 2: + # Ensure that fast_path fails + raise NotImplementedError("Don't cross the streams") + return grp * 2 + + # Check that the fastpath raises, see _transform_general + obj = gb._obj_with_exclusions + gen = gb.grouper.get_iterator(obj, axis=gb.axis) + fast_path, slow_path = gb._define_paths(func) + _, group = next(gen) + + with pytest.raises(NotImplementedError, match="Don't cross the streams"): + fast_path(group) + + result = gb.transform(func) + + expected = pd.DataFrame([2, -2, 2, 4], columns=["B"]) + tm.assert_frame_equal(result, expected) + + +def test_transform_lambda_indexing(): + # GH 7883 + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"], + "B": ["one", "one", "two", "three", "two", "six", "five", "three"], + "C": range(8), + "D": range(8), + "E": range(8), + } + ) + df = df.set_index(["A", "B"]) + df = df.sort_index() + result = df.groupby(level="A").transform(lambda x: x.iloc[-1]) + expected = DataFrame( + { + "C": [3, 3, 7, 7, 4, 4, 4, 4], + "D": [3, 3, 7, 7, 4, 4, 4, 4], + "E": [3, 3, 7, 7, 4, 4, 4, 4], + }, + index=MultiIndex.from_tuples( + [ + ("bar", "one"), + ("bar", "three"), + ("flux", "six"), + ("flux", "three"), + ("foo", "five"), + ("foo", "one"), + ("foo", "two"), + ("foo", "two"), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 363c5a9af0180..c86cb4532bc26 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,8 +9,8 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime +import pandas._testing as tm # our starting frame @@ -47,7 +47,7 @@ def seed_df(seed_nans, n, m): keys = "1st", "2nd", ["1st", "2nd"] for k, b in product(keys, bins): binned.append((df, k, b, n, m)) - ids.append("{}-{}-{}".format(k, n, m)) + ids.append(f"{k}-{n}-{m}") @pytest.mark.slow @@ -79,3 +79,31 @@ def rebuild_index(df): # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 tm.assert_series_equal(left.sort_index(), right.sort_index()) + +
+def test_series_groupby_value_counts_with_grouper(): + # GH28479 + df = DataFrame( + { + "Timestamp": [ + 1565083561, + 1565083561 + 86400, + 1565083561 + 86500, + 1565083561 + 86400 * 2, + 1565083561 + 86400 * 3, + 1565083561 + 86500 * 3, + 1565083561 + 86400 * 4, + ], + "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"], + } + ).drop([3]) + + df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s") + dfg = df.groupby(Grouper(freq="1D", key="Datetime")) + + # have to sort on index because of unstable sort on values xref GH9212 + result = dfg["Food"].value_counts().sort_index() + expected = dfg["Food"].apply(Series.value_counts).sort_index() + expected.index.names = result.index.names + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 58407d90a2cc8..8e387e9202ef6 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -9,12 +9,12 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series, date_range +import pandas._testing as tm from pandas.core.groupby.base import ( groupby_other_methods, reduction_kernels, transformation_kernels, ) -import pandas.util.testing as tm AGG_FUNCTIONS = [ "sum", @@ -236,16 +236,23 @@ def test_groupby_blacklist(df_letters): blacklist.extend(to_methods) - # e.g., to_csv - defined_but_not_allowed = "(?:^Cannot.+{0!r}.+{1!r}.+try using the 'apply' method$)" - - # e.g., query, eval - not_defined = "(?:^{1!r} object has no attribute {0!r}$)" - fmt = defined_but_not_allowed + "|" + not_defined for bl in blacklist: for obj in (df, s): gb = obj.groupby(df.letters) - msg = fmt.format(bl, type(gb).__name__) + + # e.g., to_csv + defined_but_not_allowed = ( + f"(?:^Cannot.+{repr(bl)}.+'{type(gb).__name__}'.+try " + f"using the 'apply' method$)" + ) + + # e.g., query, eval + not_defined = ( + f"(?:^'{type(gb).__name__}' object has no attribute {repr(bl)}$)" + ) + + msg = f"{defined_but_not_allowed}|{not_defined}" + with pytest.raises(AttributeError, match=msg): getattr(gb, bl) @@ -397,7 +404,7 @@ def test_all_methods_categorized(mframe): # new public method? if new_names: - msg = """ + msg = f""" There are uncatgeorized methods defined on the Grouper class: {names}. @@ -411,19 +418,19 @@ def test_all_methods_categorized(mframe): see the comments in pandas/core/groupby/base.py for guidance on how to fix this test. """ - raise AssertionError(msg.format(names=names)) + raise AssertionError(msg) # removed a public method? all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods print(names) print(all_categorized) if not (names == all_categorized): - msg = """ + msg = f""" Some methods which are supposed to be on the Grouper class are missing: -{names}. +{all_categorized - names}. They're still defined in one of the lists that live in pandas/core/groupby/base.py. 
If you removed a method, you should update them """ - raise AssertionError(msg.format(names=all_categorized - names)) + raise AssertionError(msg) diff --git a/pandas/tests/indexes/categorical/__init__.py b/pandas/tests/indexes/categorical/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/categorical/test_category.py similarity index 84% rename from pandas/tests/indexes/test_category.py rename to pandas/tests/indexes/categorical/test_category.py index 61d9d1d70c360..e027641288bb9 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -5,14 +5,14 @@ from pandas._libs import index as libindex -from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import Categorical, IntervalIndex +import pandas._testing as tm from pandas.core.indexes.api import CategoricalIndex, Index -import pandas.util.testing as tm -from .common import Base +from ..common import Base class TestCategoricalIndex(Base): @@ -32,147 +32,6 @@ def test_can_hold_identifiers(self): key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is True - def test_construction(self): - - ci = self.create_index(categories=list("abcd")) - categories = ci.categories - - result = Index(ci) - tm.assert_index_equal(result, ci, exact=True) - assert not result.ordered - - result = Index(ci.values) - tm.assert_index_equal(result, ci, exact=True) - assert not result.ordered - - # empty - result = CategoricalIndex(categories=categories) - tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) - assert not result.ordered - - # passing categories - result = CategoricalIndex(list("aabbca"), categories=categories) - tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") - ) - - c = pd.Categorical(list("aabbca")) - result = CategoricalIndex(c) - tm.assert_index_equal(result.categories, Index(list("abc"))) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") - ) - assert not result.ordered - - result = CategoricalIndex(c, categories=categories) - tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") - ) - assert not result.ordered - - ci = CategoricalIndex(c, categories=list("abcd")) - result = CategoricalIndex(ci) - tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") - ) - assert not result.ordered - - result = CategoricalIndex(ci, categories=list("ab")) - tm.assert_index_equal(result.categories, Index(list("ab"))) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") - ) - assert not result.ordered - - result = CategoricalIndex(ci, categories=list("ab"), ordered=True) - tm.assert_index_equal(result.categories, Index(list("ab"))) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") - ) - assert result.ordered - - result = pd.CategoricalIndex(ci, categories=list("ab"), ordered=True) - expected = pd.CategoricalIndex( - ci, categories=list("ab"), ordered=True, dtype="category" - ) - tm.assert_index_equal(result, expected, 
exact=True) - - # turn me to an Index - result = Index(np.array(ci)) - assert isinstance(result, Index) - assert not isinstance(result, CategoricalIndex) - - def test_construction_with_dtype(self): - - # specify dtype - ci = self.create_index(categories=list("abc")) - - result = Index(np.array(ci), dtype="category") - tm.assert_index_equal(result, ci, exact=True) - - result = Index(np.array(ci).tolist(), dtype="category") - tm.assert_index_equal(result, ci, exact=True) - - # these are generally only equal when the categories are reordered - ci = self.create_index() - - result = Index(np.array(ci), dtype="category").reorder_categories(ci.categories) - tm.assert_index_equal(result, ci, exact=True) - - # make sure indexes are handled - expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], ordered=True) - idx = Index(range(3)) - result = CategoricalIndex(idx, categories=idx, ordered=True) - tm.assert_index_equal(result, expected, exact=True) - - def test_construction_empty_with_bool_categories(self): - # see gh-22702 - cat = pd.CategoricalIndex([], categories=[True, False]) - categories = sorted(cat.categories.tolist()) - assert categories == [False, True] - - def test_construction_with_categorical_dtype(self): - # construction with CategoricalDtype - # GH18109 - data, cats, ordered = "a a b b".split(), "c b a".split(), True - dtype = CategoricalDtype(categories=cats, ordered=ordered) - - result = CategoricalIndex(data, dtype=dtype) - expected = CategoricalIndex(data, categories=cats, ordered=ordered) - tm.assert_index_equal(result, expected, exact=True) - - # GH 19032 - result = Index(data, dtype=dtype) - tm.assert_index_equal(result, expected, exact=True) - - # error when combining categories/ordered and dtype kwargs - msg = "Cannot specify `categories` or `ordered` together with `dtype`." - with pytest.raises(ValueError, match=msg): - CategoricalIndex(data, categories=cats, dtype=dtype) - - with pytest.raises(ValueError, match=msg): - Index(data, categories=cats, dtype=dtype) - - with pytest.raises(ValueError, match=msg): - CategoricalIndex(data, ordered=ordered, dtype=dtype) - - with pytest.raises(ValueError, match=msg): - Index(data, ordered=ordered, dtype=dtype) - - def test_create_categorical(self): - # https://github.com/pandas-dev/pandas/pull/17513 - # The public CI constructor doesn't hit this code path with - # instances of CategoricalIndex, but we still want to test the code - ci = CategoricalIndex(["a", "b", "c"]) - # First ci is self, second ci is data. 
- result = CategoricalIndex._create_categorical(ci, ci) - expected = Categorical(["a", "b", "c"]) - tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize( "func,op_name", [ @@ -184,12 +43,12 @@ def test_create_categorical(self): (lambda idx: ["a", "b"] + idx, "__radd__"), ], ) - def test_disallow_set_ops(self, func, op_name): + def test_disallow_addsub_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(["a", "b"])) - msg = "cannot perform {} with this index type: CategoricalIndex" - with pytest.raises(TypeError, match=msg.format(op_name)): + msg = f"cannot perform {op_name} with this index type: CategoricalIndex" + with pytest.raises(TypeError, match=msg): func(idx) def test_method_delegation(self): @@ -439,8 +298,8 @@ def test_insert(self): # invalid msg = ( - "cannot insert an item into a CategoricalIndex that is not" - " already an existing category" + "cannot insert an item into a CategoricalIndex that is not " + "already an existing category" ) with pytest.raises(TypeError, match=msg): ci.insert(0, "d") @@ -525,17 +384,6 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): expected = index tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( - "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] - ) - def test_astype_category_ordered_none_deprecated(self, none, warning): - # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) - cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) - idx = CategoricalIndex(list("abcdaba"), dtype=cdt1) - with tm.assert_produces_warning(warning): - idx.astype(cdt2) - def test_reindex_base(self): # Determined by cat ordering. idx = CategoricalIndex(list("cab"), categories=list("cab")) @@ -680,8 +528,8 @@ def test_get_indexer(self): tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) msg = ( - "method='pad' and method='backfill' not implemented yet for" - " CategoricalIndex" + "method='pad' and method='backfill' not implemented yet for " + "CategoricalIndex" ) with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="pad") @@ -788,8 +636,10 @@ def test_ensure_copied_data(self, indices): # Index.__new__ is honored. # # Must be tested separately from other indexes because - # self.value is not an ndarray. - _base = lambda ar: ar if ar.base is None else ar.base + # self.values is not an ndarray. + # GH#29918 Index.base has been removed + # FIXME: is this test still meaningful? + _base = lambda ar: ar if getattr(ar, "base", None) is None else ar.base result = CategoricalIndex(indices.values, copy=True) tm.assert_index_equal(indices, result) @@ -823,8 +673,8 @@ def test_equals_categorical(self): ci1 == Index(["a", "b", "c"]) msg = ( - "categorical index comparisons must have the same categories" - " and ordered attributes" + "categorical index comparisons must have the same categories " + "and ordered attributes" "|" "Categoricals can only be compared if 'categories' are the same. 
" "Categories are different lengths" @@ -1125,3 +975,9 @@ def test_engine_type(self, dtype, engine_type): ci.values._codes = ci.values._codes.astype("int64") assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) + + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = self.create_index() + with pytest.raises(ValueError, match="cannot mask with array containing NA"): + idx[:, None] diff --git a/pandas/tests/indexes/categorical/test_constructors.py b/pandas/tests/indexes/categorical/test_constructors.py new file mode 100644 index 0000000000000..1df0874e2f947 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_constructors.py @@ -0,0 +1,147 @@ +import numpy as np +import pytest + +from pandas import Categorical, CategoricalDtype, CategoricalIndex, Index +import pandas._testing as tm + + +class TestCategoricalIndexConstructors: + def test_construction(self): + + ci = CategoricalIndex(list("aabbca"), categories=list("abcd"), ordered=False) + categories = ci.categories + + result = Index(ci) + tm.assert_index_equal(result, ci, exact=True) + assert not result.ordered + + result = Index(ci.values) + tm.assert_index_equal(result, ci, exact=True) + assert not result.ordered + + # empty + result = CategoricalIndex(categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) + assert not result.ordered + + # passing categories + result = CategoricalIndex(list("aabbca"), categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + + c = Categorical(list("aabbca")) + result = CategoricalIndex(c) + tm.assert_index_equal(result.categories, Index(list("abc"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(c, categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + ci = CategoricalIndex(c, categories=list("abcd")) + result = CategoricalIndex(ci) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(ci, categories=list("ab")) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) + assert result.ordered + + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + expected = CategoricalIndex( + ci, categories=list("ab"), ordered=True, dtype="category" + ) + tm.assert_index_equal(result, expected, exact=True) + + # turn me to an Index + result = Index(np.array(ci)) + assert isinstance(result, Index) + assert not isinstance(result, CategoricalIndex) + + def test_construction_with_dtype(self): + + # specify dtype + ci = CategoricalIndex(list("aabbca"), categories=list("abc"), ordered=False) + + result = 
Index(np.array(ci), dtype="category") + tm.assert_index_equal(result, ci, exact=True) + + result = Index(np.array(ci).tolist(), dtype="category") + tm.assert_index_equal(result, ci, exact=True) + + # these are generally only equal when the categories are reordered + ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + + result = Index(np.array(ci), dtype="category").reorder_categories(ci.categories) + tm.assert_index_equal(result, ci, exact=True) + + # make sure indexes are handled + expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], ordered=True) + idx = Index(range(3)) + result = CategoricalIndex(idx, categories=idx, ordered=True) + tm.assert_index_equal(result, expected, exact=True) + + def test_construction_empty_with_bool_categories(self): + # see GH#22702 + cat = CategoricalIndex([], categories=[True, False]) + categories = sorted(cat.categories.tolist()) + assert categories == [False, True] + + def test_construction_with_categorical_dtype(self): + # construction with CategoricalDtype + # GH#18109 + data, cats, ordered = "a a b b".split(), "c b a".split(), True + dtype = CategoricalDtype(categories=cats, ordered=ordered) + + result = CategoricalIndex(data, dtype=dtype) + expected = CategoricalIndex(data, categories=cats, ordered=ordered) + tm.assert_index_equal(result, expected, exact=True) + + # GH#19032 + result = Index(data, dtype=dtype) + tm.assert_index_equal(result, expected, exact=True) + + # error when combining categories/ordered and dtype kwargs + msg = "Cannot specify `categories` or `ordered` together with `dtype`." + with pytest.raises(ValueError, match=msg): + CategoricalIndex(data, categories=cats, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Index(data, categories=cats, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + CategoricalIndex(data, ordered=ordered, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Index(data, ordered=ordered, dtype=dtype) + + def test_create_categorical(self): + # GH#17513 The public CI constructor doesn't hit this code path with + # instances of CategoricalIndex, but we still want to test the code + ci = CategoricalIndex(["a", "b", "c"]) + # First ci is self, second ci is data. 
+ result = CategoricalIndex._create_categorical(ci, ci) + expected = Categorical(["a", "b", "c"]) + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 1ac6370860ba6..a16017b0e12c0 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -23,15 +23,15 @@ UInt64Index, isna, ) +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm class Base: """ base class for index sub-class tests """ - _holder = None # type: Optional[Type[Index]] + _holder: Optional[Type[Index]] = None _compat_props = ["shape", "ndim", "size", "nbytes"] def test_pickle_compat_construction(self): @@ -103,6 +103,13 @@ def test_shift(self): with pytest.raises(NotImplementedError, match=msg): idx.shift(1, 2) + def test_constructor_name_unhashable(self): + # GH#29069 check that name is hashable + # See also same-named test in tests.series.test_constructors + idx = self.create_index() + with pytest.raises(TypeError, match="Index.name must be a hashable type"): + type(idx)(idx, name=[]) + def test_create_index_existing_name(self): # GH11193, when an existing index is passed, and a new name is not @@ -244,7 +251,7 @@ def test_str(self): idx = self.create_index() idx.name = "foo" assert "'foo'" in str(idx) - assert idx.__class__.__name__ in str(idx) + assert type(idx).__name__ in str(idx) def test_repr_max_seq_item_setting(self): # GH10182 @@ -260,8 +267,8 @@ def test_copy_name(self, indices): if isinstance(indices, MultiIndex): return - first = indices.__class__(indices, copy=True, name="mario") - second = first.__class__(first, copy=False) + first = type(indices)(indices, copy=True, name="mario") + second = type(first)(first, copy=False) # Even though "copy=False", we want a new object. 
assert first is not second @@ -292,7 +299,7 @@ def test_ensure_copied_data(self, indices): # MultiIndex and CategoricalIndex are tested separately return - index_type = indices.__class__ + index_type = type(indices) result = index_type(indices.values, copy=True, **init_kwargs) tm.assert_index_equal(indices, result) tm.assert_numpy_array_equal( @@ -502,7 +509,7 @@ def test_difference_base(self, sort, indices): cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: if isinstance(indices, (DatetimeIndex, TimedeltaIndex)): - assert result.__class__ == answer.__class__ + assert type(result) == type(answer) tm.assert_numpy_array_equal( result.sort_values().asi8, answer.sort_values().asi8 ) @@ -677,9 +684,9 @@ def test_hasnans_isnans(self, indices): values[1] = np.nan if isinstance(indices, PeriodIndex): - idx = indices.__class__(values, freq=indices.freq) + idx = type(indices)(values, freq=indices.freq) else: - idx = indices.__class__(values) + idx = type(indices)(values) expected = np.array([False] * len(idx), dtype=bool) expected[1] = True @@ -716,9 +723,9 @@ def test_fillna(self, indices): values[1] = np.nan if isinstance(indices, PeriodIndex): - idx = indices.__class__(values, freq=indices.freq) + idx = type(indices)(values, freq=indices.freq) else: - idx = indices.__class__(values) + idx = type(indices)(values) expected = np.array([False] * len(idx), dtype=bool) expected[1] = True @@ -868,3 +875,11 @@ def test_engine_reference_cycle(self): nrefs_pre = len(gc.get_referrers(index)) index._engine assert len(gc.get_referrers(index)) == nrefs_pre + + def test_getitem_2d_deprecated(self): + # GH#30588 + idx = self.create_index() + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + res = idx[:, None] + + assert isinstance(res, np.ndarray), type(res) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 2a9a8bf8d824f..e3e7ff4093b76 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -2,8 +2,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.indexes.api import Index, MultiIndex -import pandas.util.testing as tm indices_dict = { "unicode": tm.makeUnicodeIndex(100), diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index f7cded9f44918..3c72d34d84b28 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .common import Base @@ -38,7 +38,7 @@ def test_str(self): idx.name = "foo" assert not "length={}".format(len(idx)) in str(idx) assert "'foo'" in str(idx) - assert idx.__class__.__name__ in str(idx) + assert type(idx).__name__ in str(idx) if hasattr(idx, "tz"): if idx.tz is not None: @@ -72,7 +72,7 @@ def test_map_callable(self): "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index), + lambda values, index: pd.Series(values, index, dtype=object), ], ) def test_map_dictlike(self, mapper): @@ -81,7 +81,7 @@ def test_map_dictlike(self, mapper): # don't compare the freqs if isinstance(expected, pd.DatetimeIndex): - expected.freq = None + expected._data.freq = None result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) @@ -95,10 +95,3 @@ def test_map_dictlike(self, mapper): expected = pd.Index([np.nan] * len(index)) result = index.map(mapper([], [])) 
tm.assert_index_equal(result, expected) - - def test_asobject_deprecated(self): - # GH18572 - d = self.create_index() - with tm.assert_produces_warning(FutureWarning): - i = d.asobject - assert isinstance(i, pd.Index) diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index eabf293ae915f..6139726dc34e4 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -17,7 +17,7 @@ Timestamp, date_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndex: diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_constructors.py similarity index 90% rename from pandas/tests/indexes/datetimes/test_construction.py rename to pandas/tests/indexes/datetimes/test_constructors.py index 88bc11c588673..ffe51dd1fb9f5 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1,4 +1,4 @@ -from datetime import timedelta +from datetime import datetime, timedelta from functools import partial from operator import attrgetter @@ -10,17 +10,9 @@ from pandas._libs.tslibs import OutOfBoundsDatetime, conversion import pandas as pd -from pandas import ( - DatetimeIndex, - Index, - Timestamp, - date_range, - datetime, - offsets, - to_datetime, -) +from pandas import DatetimeIndex, Index, Timestamp, date_range, offsets, to_datetime +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, period_array -import pandas.util.testing as tm class TestDatetimeIndex: @@ -37,6 +29,25 @@ def test_freq_validation_with_nat(self, dt_cls): with pytest.raises(ValueError, match=msg): dt_cls([pd.NaT, pd.Timestamp("2011-01-01").value], freq="D") + # TODO: better place for tests shared by DTI/TDI? + @pytest.mark.parametrize( + "index", + [ + pd.date_range("2016-01-01", periods=5, tz="US/Pacific"), + pd.timedelta_range("1 Day", periods=5), + ], + ) + def test_shallow_copy_inherits_array_freq(self, index): + # If we pass a DTA/TDA to shallow_copy and dont specify a freq, + # we should inherit the array's freq, not our own. 
+ array = index._data + + arr = array[[0, 3, 2, 4, 1]] + assert arr.freq is None + + result = index._shallow_copy(arr) + assert result.freq is None + def test_categorical_preserves_tz(self): # GH#18664 retain tz when going DTI-->Categorical-->DTI # TODO: parametrize over DatetimeIndex/DatetimeArray @@ -70,28 +81,21 @@ def test_dti_with_period_data_raises(self): with pytest.raises(TypeError, match="PeriodDtype data is invalid"): to_datetime(period_array(data)) - def test_dti_with_timedelta64_data_deprecation(self): - # GH#23675 + def test_dti_with_timedelta64_data_raises(self): + # GH#23675 deprecated, enforced in GH#29794 data = np.array([0], dtype="m8[ns]") - with tm.assert_produces_warning(FutureWarning): - result = DatetimeIndex(data) - - assert result[0] == Timestamp("1970-01-01") - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = to_datetime(data) - - assert result[0] == Timestamp("1970-01-01") - - with tm.assert_produces_warning(FutureWarning): - result = DatetimeIndex(pd.TimedeltaIndex(data)) + msg = r"timedelta64\[ns\] cannot be converted to datetime64" + with pytest.raises(TypeError, match=msg): + DatetimeIndex(data) - assert result[0] == Timestamp("1970-01-01") + with pytest.raises(TypeError, match=msg): + to_datetime(data) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = to_datetime(pd.TimedeltaIndex(data)) + with pytest.raises(TypeError, match=msg): + DatetimeIndex(pd.TimedeltaIndex(data)) - assert result[0] == Timestamp("1970-01-01") + with pytest.raises(TypeError, match=msg): + to_datetime(pd.TimedeltaIndex(data)) def test_construction_caching(self): @@ -129,15 +133,11 @@ def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): i = pd.date_range("20130101", periods=5, freq="H", tz=tz) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} - if str(tz) in ("UTC", "tzutc()", "UTC+00:00"): - warn = None - else: - warn = FutureWarning + if "tz" in kwargs: + result = DatetimeIndex(i.asi8, tz="UTC").tz_convert(kwargs["tz"]) - with tm.assert_produces_warning(warn, check_stacklevel=False): - result = DatetimeIndex(i.tz_localize(None).asi8, **kwargs) - expected = DatetimeIndex(i, **kwargs) - tm.assert_index_equal(result, expected) + expected = DatetimeIndex(i, **kwargs) + tm.assert_index_equal(result, expected) # localize into the provided tz i2 = DatetimeIndex(i.tz_localize(None).asi8, tz="UTC") @@ -492,21 +492,13 @@ def test_construction_with_ndarray(self): expected = DatetimeIndex(["2013-10-07", "2013-10-08", "2013-10-09"], freq="B") tm.assert_index_equal(result, expected) - def test_verify_integrity_deprecated(self): - # GH#23919 - with tm.assert_produces_warning(FutureWarning): - DatetimeIndex(["1/1/2000"], verify_integrity=False) + def test_integer_values_and_tz_interpreted_as_utc(self): + # GH-24559 + val = np.datetime64("2000-01-01 00:00:00", "ns") + values = np.array([val.view("i8")]) - def test_range_kwargs_deprecated(self): - # GH#23919 - with tm.assert_produces_warning(FutureWarning): - DatetimeIndex(start="1/1/2000", end="1/10/2000", freq="D") + result = DatetimeIndex(values).tz_localize("US/Central") - def test_integer_values_and_tz_deprecated(self): - # GH-24559 - values = np.array([946684800000000000]) - with tm.assert_produces_warning(FutureWarning): - result = DatetimeIndex(values, tz="US/Central") expected = pd.DatetimeIndex(["2000-01-01T00:00:00"], tz="US/Central") tm.assert_index_equal(result, expected) @@ -524,10 +516,6 @@ def test_constructor_coverage(self): 
with pytest.raises(TypeError, match=msg): date_range(start="1/1/2000", periods="foo", freq="D") - with pytest.raises(ValueError): - with tm.assert_produces_warning(FutureWarning): - DatetimeIndex(start="1/1/2000", end="1/10/2000") - with pytest.raises(TypeError): DatetimeIndex("1/1/2000") @@ -559,15 +547,15 @@ def test_constructor_coverage(self): # non-conforming msg = ( - "Inferred frequency None from passed values does not conform" - " to passed frequency D" + "Inferred frequency None from passed values does not conform " + "to passed frequency D" ) with pytest.raises(ValueError, match=msg): DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"], freq="D") msg = ( - "Of the four parameters: start, end, periods, and freq, exactly" - " three must be specified" + "Of the four parameters: start, end, periods, and freq, exactly " + "three must be specified" ) with pytest.raises(ValueError, match=msg): date_range(start="2011-01-01", freq="b") @@ -734,22 +722,11 @@ def test_constructor_timestamp_near_dst(self): expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) tm.assert_index_equal(result, expected) - # TODO(GH-24559): Remove the xfail for the tz-aware case. @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) @pytest.mark.parametrize("box", [np.array, partial(np.array, dtype=object), list]) @pytest.mark.parametrize( "tz, dtype", - [ - pytest.param( - "US/Pacific", - "datetime64[ns, US/Pacific]", - marks=[ - pytest.mark.xfail(), - pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning"), - ], - ), - [None, "datetime64[ns]"], - ], + [("US/Pacific", "datetime64[ns, US/Pacific]"), (None, "datetime64[ns]")], ) def test_constructor_with_int_tz(self, klass, box, tz, dtype): # GH 20997, 20964 @@ -758,15 +735,10 @@ def test_constructor_with_int_tz(self, klass, box, tz, dtype): expected = klass([ts]) assert result == expected - # This is the desired future behavior - # Note: this xfail is not strict because the test passes with - # None or any of the UTC variants for tz_naive_fixture - @pytest.mark.xfail(reason="Future behavior", strict=False) - @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") def test_construction_int_rountrip(self, tz_naive_fixture): - # GH 12619 - # TODO(GH-24559): Remove xfail + # GH 12619, GH#24559 tz = tz_naive_fixture + result = 1293858000000000000 expected = DatetimeIndex([result], tz=tz).asi8[0] assert result == expected @@ -807,18 +779,15 @@ def test_construction_with_nat_and_tzlocal(self): expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) tm.assert_index_equal(result, expected) - def test_constructor_no_precision_warns(self): + def test_constructor_no_precision_raises(self): # GH-24753, GH-24739 - expected = pd.DatetimeIndex(["2000"], dtype="datetime64[ns]") - # we set the stacklevel for DatetimeIndex - with tm.assert_produces_warning(FutureWarning): - result = pd.DatetimeIndex(["2000"], dtype="datetime64") - tm.assert_index_equal(result, expected) + msg = "with no precision is not allowed" + with pytest.raises(ValueError, match=msg): + pd.DatetimeIndex(["2000"], dtype="datetime64") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = pd.Index(["2000"], dtype="datetime64") - tm.assert_index_equal(result, expected) + with pytest.raises(ValueError, match=msg): + pd.Index(["2000"], dtype="datetime64") def test_constructor_wrong_precision_raises(self): with pytest.raises(ValueError): diff --git a/pandas/tests/indexes/datetimes/test_date_range.py 
b/pandas/tests/indexes/datetimes/test_date_range.py index ba7e3c9d38861..4d0beecbbf5d3 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import DatetimeIndex, Timestamp, bdate_range, date_range, offsets -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import ( BDay, @@ -798,7 +798,7 @@ def test_daterange_bug_456(self): # GH #456 rng1 = bdate_range("12/5/2011", "12/5/2011") rng2 = bdate_range("12/2/2011", "12/5/2011") - rng2.freq = BDay() + rng2._data.freq = BDay() # TODO: shouldn't this already be set? result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) @@ -855,7 +855,7 @@ def test_daterange_bug_456(self): # GH #456 rng1 = bdate_range("12/5/2011", "12/5/2011", freq="C") rng2 = bdate_range("12/2/2011", "12/5/2011", freq="C") - rng2.freq = CDay() + rng2._data.freq = CDay() # TODO: shouldn't this already be set? result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) @@ -945,3 +945,19 @@ def test_range_with_millisecond_resolution(self, start_end): result = pd.date_range(start=start, end=end, periods=2, closed="left") expected = DatetimeIndex([start]) tm.assert_index_equal(result, expected) + + +def test_date_range_with_custom_holidays(): + # GH 30593 + freq = pd.offsets.CustomBusinessHour(start="15:00", holidays=["2020-11-26"]) + result = pd.date_range(start="2020-11-25 15:00", periods=4, freq=freq) + expected = pd.DatetimeIndex( + [ + "2020-11-25 15:00:00", + "2020-11-25 16:00:00", + "2020-11-27 15:00:00", + "2020-11-27 16:00:00", + ], + freq=freq, + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 1776538a15fc2..ca18d6fbea11a 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets -import pandas.util.testing as tm +import pandas._testing as tm randn = np.random.randn @@ -89,7 +89,7 @@ def test_week_of_month_frequency(self): def test_hash_error(self): index = date_range("20010101", periods=10) with pytest.raises( - TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) + TypeError, match=f"unhashable type: '{type(index).__name__}'" ): hash(index) @@ -188,25 +188,6 @@ def test_string_index_series_name_converted(self): result = df.T["1/3/2000"] assert result.name == df.index[2] - def test_get_duplicates(self): - idx = DatetimeIndex( - [ - "2000-01-01", - "2000-01-02", - "2000-01-02", - "2000-01-03", - "2000-01-03", - "2000-01-04", - ] - ) - - with tm.assert_produces_warning(FutureWarning): - # Deprecated - see GH20239 - result = idx.get_duplicates() - - ex = DatetimeIndex(["2000-01-02", "2000-01-03"]) - tm.assert_index_equal(result, ex) - def test_argmin_argmax(self): idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"]) assert idx.argmin() == 1 @@ -412,15 +393,13 @@ def test_asarray_tz_naive(self): # This shouldn't produce a warning. 
idx = pd.date_range("2000", periods=2) # M8[ns] by default - with tm.assert_produces_warning(None): - result = np.asarray(idx) + result = np.asarray(idx) expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) # optionally, object - with tm.assert_produces_warning(None): - result = np.asarray(idx, dtype=object) + result = np.asarray(idx, dtype=object) expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) tm.assert_numpy_array_equal(result, expected) @@ -429,15 +408,12 @@ def test_asarray_tz_aware(self): tz = "US/Central" idx = pd.date_range("2000", periods=2, tz=tz) expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") - # We warn by default and return an ndarray[M8[ns]] - with tm.assert_produces_warning(FutureWarning): - result = np.asarray(idx) + result = np.asarray(idx, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) # Old behavior with no warning - with tm.assert_produces_warning(None): - result = np.asarray(idx, dtype="M8[ns]") + result = np.asarray(idx, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -445,8 +421,7 @@ def test_asarray_tz_aware(self): expected = np.array( [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] ) - with tm.assert_produces_warning(None): - result = np.asarray(idx, dtype=object) + result = np.asarray(idx, dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -456,3 +431,15 @@ def test_to_frame_datetime_tz(self): result = idx.to_frame() expected = DataFrame(idx, index=idx) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("name", [None, "name"]) + def test_index_map(self, name): + # see GH20990 + count = 6 + index = pd.date_range("2018-01-01", periods=count, freq="M", name=name).map( + lambda x: (x.year, x.month) + ) + exp_index = pd.MultiIndex.from_product( + ((2018,), range(1, 7)), names=[name, name] + ) + tm.assert_index_equal(index, exp_index) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 2ff6853b98929..da1bd6f091d1a 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -2,7 +2,7 @@ import pytest from pandas import DatetimeIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm from ..datetimelike import DatetimeLike diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 33a744cc25ca1..f34019e06fd5f 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DatetimeIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm def test_to_native_types(): diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index cd5efc86320c2..4c600e510790a 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DatetimeIndex, Index, Timestamp, date_range, notna -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay, CDay @@ -86,7 +86,9 @@ def test_dti_business_getitem(self): def test_dti_business_getitem_matplotlib_hackaround(self): rng = pd.bdate_range(START, END) - values = rng[:, None] + with 
tm.assert_produces_warning(DeprecationWarning): + # GH#30588 multi-dimensional indexing deprecated + values = rng[:, None] expected = rng.values[:, None] tm.assert_numpy_array_equal(values, expected) @@ -110,7 +112,9 @@ def test_dti_custom_getitem(self): def test_dti_custom_getitem_matplotlib_hackaround(self): rng = pd.bdate_range(START, END, freq="C") - values = rng[:, None] + with tm.assert_produces_warning(DeprecationWarning): + # GH#30588 multi-dimensional indexing deprecated + values = rng[:, None] expected = rng.values[:, None] tm.assert_numpy_array_equal(values, expected) @@ -132,9 +136,32 @@ def test_where_other(self): i2 = i.copy() i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) - result = i.where(notna(i2), i2.values) + result = i.where(notna(i2), i2._values) tm.assert_index_equal(result, i2) + def test_where_invalid_dtypes(self): + dti = pd.date_range("20130101", periods=3, tz="US/Eastern") + + i2 = dti.copy() + i2 = Index([pd.NaT, pd.NaT] + dti[2:].tolist()) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + # passing tz-naive ndarray to tzaware DTI + dti.where(notna(i2), i2.values) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + # passing tz-aware DTI to tznaive DTI + dti.tz_localize(None).where(notna(i2), i2) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + dti.where(notna(i2), i2.tz_localize(None).to_period("D")) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + dti.where(notna(i2), i2.asi8.view("timedelta64[ns]")) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + dti.where(notna(i2), i2.asi8) + def test_where_tz(self): i = pd.date_range("20130101", periods=3, tz="US/Eastern") result = i.where(notna(i)) @@ -317,7 +344,9 @@ def test_take_fill_value_with_timezone(self): class TestDatetimeIndex: - @pytest.mark.parametrize("null", [None, np.nan, pd.NaT]) + @pytest.mark.parametrize( + "null", [None, np.nan, np.datetime64("NaT"), pd.NaT, pd.NA] + ) @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) def test_insert_nat(self, tz, null): # GH#16537, GH#18295 (test missing) @@ -326,6 +355,12 @@ def test_insert_nat(self, tz, null): res = idx.insert(0, null) tm.assert_index_equal(res, expected) + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) + def test_insert_invalid_na(self, tz): + idx = pd.DatetimeIndex(["2017-01-01"], tz=tz) + with pytest.raises(TypeError, match="incompatible label"): + idx.insert(0, np.timedelta64("NaT")) + def test_insert(self): idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"], name="idx") @@ -403,9 +438,9 @@ def test_insert(self): # see gh-7299 idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") - with pytest.raises(ValueError): + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): idx.insert(3, pd.Timestamp("2000-01-04")) - with pytest.raises(ValueError): + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): idx.insert(3, datetime(2000, 1, 4)) with pytest.raises(ValueError): idx.insert(3, pd.Timestamp("2000-01-04", tz="US/Eastern")) @@ -457,7 +492,7 @@ def test_insert(self): def test_delete(self): idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") - # prserve freq + # preserve freq expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") expected_4 = date_range(start="2000-01-01", periods=4, freq="M", name="idx") @@ -511,7 +546,7 @@ def test_delete(self): def test_delete_slice(self): 
idx = date_range(start="2000-01-01", periods=10, freq="D", name="idx") - # prserve freq + # preserve freq expected_0_2 = date_range(start="2000-01-04", periods=7, freq="D", name="idx") expected_7_9 = date_range(start="2000-01-01", periods=7, freq="D", name="idx") @@ -582,6 +617,23 @@ def test_delete_slice(self): assert result.freq == expected.freq assert result.tz == expected.tz + def test_get_value(self): + # specifically make sure we have test for np.datetime64 key + dti = pd.date_range("2016-01-01", periods=3) + + arr = np.arange(6, 8) + + key = dti[1] + + result = dti.get_value(arr, key) + assert result == 7 + + result = dti.get_value(arr, key.to_pydatetime()) + assert result == 7 + + result = dti.get_value(arr, key.to_datetime64()) + assert result == 7 + def test_get_loc(self): idx = pd.date_range("2000-01-01", periods=3) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index ab3107a0798e5..340f53b2868bd 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -1,4 +1,5 @@ import calendar +from datetime import datetime import locale import unicodedata @@ -6,8 +7,8 @@ import pytest import pandas as pd -from pandas import DatetimeIndex, Index, Timestamp, date_range, datetime, offsets -import pandas.util.testing as tm +from pandas import DatetimeIndex, Index, Timestamp, date_range, offsets +import pandas._testing as tm class TestTimeSeries: @@ -200,7 +201,6 @@ def test_datetimeindex_accessors(self): assert len(dti.is_quarter_end) == 365 assert len(dti.is_year_start) == 365 assert len(dti.is_year_end) == 365 - assert len(dti.weekday_name) == 365 dti.name = "name" @@ -339,11 +339,8 @@ def test_datetime_name_accessors(self, time_locale): ] for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() - assert dti.weekday_name[day] == eng_name assert dti.day_name(locale=time_locale)[day] == name ts = Timestamp(datetime(2016, 4, day)) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert ts.weekday_name == eng_name assert ts.day_name(locale=time_locale) == name dti = dti.append(DatetimeIndex([pd.NaT])) assert np.isnan(dti.day_name(locale=time_locale)[-1]) diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_missing.py index 6d94319b33b02..3399c8eaf6750 100644 --- a/pandas/tests/indexes/datetimes/test_missing.py +++ b/pandas/tests/indexes/datetimes/test_missing.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndex: diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 2ec267c66091b..ecd4ace705e9e 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -16,8 +16,8 @@ bdate_range, date_range, ) -from pandas.tests.test_base import Ops -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.tests.base.test_ops import Ops from pandas.tseries.offsets import BDay, BMonthEnd, CDay, Day, Hour @@ -41,9 +41,9 @@ def test_ops_properties_basic(self): # sanity check that the behavior didn't change # GH#7206 - msg = "'Series' object has no attribute '{}'" for op in ["year", "day", "second", "weekday"]: - with pytest.raises(AttributeError, match=msg.format(op)): + msg = f"'Series' object has no attribute '{op}'" + with pytest.raises(AttributeError, match=msg): getattr(self.dt_series, 
op) # attribute access should still work! @@ -413,12 +413,12 @@ def test_freq_setter(self, values, freq, tz): idx = DatetimeIndex(values, tz=tz) # can set to an offset, converting from string if necessary - idx.freq = freq + idx._data.freq = freq assert idx.freq == freq assert isinstance(idx.freq, ABCDateOffset) # can reset to None - idx.freq = None + idx._data.freq = None assert idx.freq is None def test_freq_setter_errors(self): @@ -431,23 +431,11 @@ def test_freq_setter_errors(self): "passed frequency 5D" ) with pytest.raises(ValueError, match=msg): - idx.freq = "5D" + idx._data.freq = "5D" # setting with non-freq string with pytest.raises(ValueError, match="Invalid frequency"): - idx.freq = "foo" - - def test_offset_deprecated(self): - # GH 20716 - idx = pd.DatetimeIndex(["20180101", "20180102"]) - - # getter deprecated - with tm.assert_produces_warning(FutureWarning): - idx.offset - - # setter deprecated - with tm.assert_produces_warning(FutureWarning): - idx.offset = BDay() + idx._data.freq = "foo" class TestBusinessDatetimeIndex: @@ -549,8 +537,6 @@ def test_shift_periods(self): idx = pd.date_range(start=START, end=END, periods=3) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - tm.assert_index_equal(idx.shift(n=0), idx) def test_pickle_unpickle(self): unpickled = tm.round_trip_pickle(self.rng) diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 51aeb40744c3a..e30cc4449e01e 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -16,8 +16,8 @@ Timestamp, date_range, ) +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm class TestSlicing: @@ -274,7 +274,7 @@ def test_partial_slicing_dataframe(self): result = df["a"][ts_string] assert isinstance(result, np.int64) assert result == expected - msg = r"^'{}'$".format(ts_string) + msg = fr"^'{ts_string}'$" with pytest.raises(KeyError, match=msg): df[ts_string] @@ -302,7 +302,7 @@ def test_partial_slicing_dataframe(self): result = df["a"][ts_string] assert isinstance(result, np.int64) assert result == 2 - msg = r"^'{}'$".format(ts_string) + msg = fr"^'{ts_string}'$" with pytest.raises(KeyError, match=msg): df[ts_string] @@ -311,7 +311,7 @@ def test_partial_slicing_dataframe(self): for fmt, res in list(zip(formats, resolutions))[rnum + 1 :]: ts = index[1] + Timedelta("1 " + res) ts_string = ts.strftime(fmt) - msg = r"^'{}'$".format(ts_string) + msg = fr"^'{ts_string}'$" with pytest.raises(KeyError, match=msg): df["a"][ts_string] with pytest.raises(KeyError, match=msg): diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 00310f4fba7c7..84eee2419f0b8 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DatetimeIndex, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.frequencies import to_offset @@ -50,18 +50,13 @@ def test_dti_date_out_of_range(self, data): "is_quarter_end", "is_year_start", "is_year_end", - "weekday_name", ], ) def test_dti_timestamp_fields(self, field): # extra fields from DatetimeIndex like quarter and week idx = tm.makeDateIndex(100) expected = 
getattr(idx, field)[-1] - if field == "weekday_name": - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = getattr(Timestamp(idx[-1]), field) - else: - result = getattr(Timestamp(idx[-1]), field) + result = getattr(Timestamp(idx[-1]), field) assert result == expected def test_dti_timestamp_freq_fields(self): diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 67fc70c17d7bc..78188c54b1d85 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -16,7 +16,7 @@ date_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BMonthEnd, Minute, MonthEnd @@ -157,12 +157,27 @@ def test_union_bug_4564(self, sort): def test_union_freq_both_none(self, sort): # GH11086 expected = bdate_range("20150101", periods=10) - expected.freq = None + expected._data.freq = None result = expected.union(expected, sort=sort) tm.assert_index_equal(result, expected) assert result.freq is None + def test_union_freq_infer(self): + # When taking the union of two DatetimeIndexes, we infer + # a freq even if the arguments don't have freq. This matches + # TimedeltaIndex behavior. + dti = pd.date_range("2016-01-01", periods=5) + left = dti[[0, 1, 3, 4]] + right = dti[[2, 3, 1]] + + assert left.freq is None + assert right.freq is None + + result = left.union(right) + tm.assert_index_equal(result, dti) + assert result.freq == "D" + def test_union_dataframe_index(self): rng1 = date_range("1/1/1999", "1/1/2012", freq="MS") s1 = Series(np.random.randn(len(rng1)), rng1) diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_shift.py similarity index 90% rename from pandas/tests/indexes/datetimes/test_arithmetic.py rename to pandas/tests/indexes/datetimes/test_shift.py index 4851dd5a55c1e..1c87995931c62 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_shift.py @@ -7,10 +7,10 @@ import pandas as pd from pandas import DatetimeIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm -class TestDatetimeIndexArithmetic: +class TestDatetimeIndexShift: # ------------------------------------------------------------- # DatetimeIndex.shift is used in integer addition @@ -69,17 +69,11 @@ def test_dti_shift_freqs(self): def test_dti_shift_int(self): rng = date_range("1/1/2000", periods=20) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - result = rng + 5 - + result = rng + 5 * rng.freq expected = rng.shift(5) tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - result = rng - 5 - + result = rng - 5 * rng.freq expected = rng.shift(-5) tm.assert_index_equal(result, expected) @@ -100,9 +94,9 @@ def test_dti_shift_localized(self, tzstr): def test_dti_shift_across_dst(self): # GH 8616 idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="H") - s = Series(index=idx[:-1]) + s = Series(index=idx[:-1], dtype=object) result = s.shift(freq="H") - expected = Series(index=idx[1:]) + expected = Series(index=idx[1:], dtype=object) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 059dbb00019d8..1505ac1dff29c 100644 --- 
a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -22,7 +22,7 @@ isna, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm class FixedOffset(tzinfo): @@ -323,13 +323,9 @@ def test_dti_tz_localize_nonexistent_raise_coerce(self): index.tz_localize(tz=tz) with pytest.raises(pytz.NonExistentTimeError): - with tm.assert_produces_warning(FutureWarning): - index.tz_localize(tz=tz, errors="raise") + index.tz_localize(tz=tz, nonexistent="raise") - with tm.assert_produces_warning( - FutureWarning, clear=FutureWarning, check_stacklevel=False - ): - result = index.tz_localize(tz=tz, errors="coerce") + result = index.tz_localize(tz=tz, nonexistent="NaT") test_times = ["2015-03-08 01:00-05:00", "NaT", "2015-03-08 03:00-04:00"] dti = to_datetime(test_times, utc=True) expected = dti.tz_convert("US/Eastern") @@ -704,20 +700,6 @@ def test_dti_tz_localize_nonexistent_shift_invalid(self, offset, tz_type): with pytest.raises(ValueError, match=msg): dti.tz_localize(tz, nonexistent=timedelta(seconds=offset)) - @pytest.mark.filterwarnings("ignore::FutureWarning") - def test_dti_tz_localize_errors_deprecation(self): - # GH 22644 - tz = "Europe/Warsaw" - n = 60 - dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - with pytest.raises(ValueError): - dti.tz_localize(tz, errors="foo") - # make sure errors='coerce' gets mapped correctly to nonexistent - result = dti.tz_localize(tz, errors="coerce") - expected = dti.tz_localize(tz, nonexistent="NaT") - tm.assert_index_equal(result, expected) - # ------------------------------------------------------------- # DatetimeIndex.normalize diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 4e5d624eba844..fe65653ba6545 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1,6 +1,7 @@ """ test to_datetime """ import calendar +from collections import deque from datetime import datetime, time import locale @@ -29,9 +30,9 @@ isna, to_datetime, ) +import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.core.tools import datetimes as tools -import pandas.util.testing as tm class TestTimeConversionFormats: @@ -101,6 +102,29 @@ def test_to_datetime_format_YYYYMMDD(self, cache): expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "input_s", + [ + # Null values with Strings + ["19801222", "20010112", None], + ["19801222", "20010112", np.nan], + ["19801222", "20010112", pd.NaT], + ["19801222", "20010112", "NaT"], + # Null values with Integers + [19801222, 20010112, None], + [19801222, 20010112, np.nan], + [19801222, 20010112, pd.NaT], + [19801222, 20010112, "NaT"], + ], + ) + def test_to_datetime_format_YYYYMMDD_with_none(self, input_s): + # GH 30011 + # format='%Y%m%d' + # with None + expected = Series([Timestamp("19801222"), Timestamp("20010112"), pd.NaT]) + result = Series(pd.to_datetime(input_s, format="%Y%m%d")) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "input_s, expected", [ @@ -592,8 +616,8 @@ def test_to_datetime_tz(self, cache): pd.Timestamp("2013-01-02 14:00:00", tz="US/Eastern"), ] msg = ( - "Tz-aware datetime.datetime cannot be converted to datetime64" - " unless utc=True" + "Tz-aware datetime.datetime cannot be " + "converted to datetime64 
unless utc=True" ) with pytest.raises(ValueError, match=msg): pd.to_datetime(arr, cache=cache) @@ -696,13 +720,11 @@ def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, dtype): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("cache", [True, False]) + @td.skip_if_no("psycopg2") def test_to_datetime_tz_psycopg2(self, cache): # xref 8260 - try: - import psycopg2 - except ImportError: - pytest.skip("no psycopg2 installed") + import psycopg2 # misc cases tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None) @@ -838,7 +860,7 @@ def test_datetime_invalid_index(self, values, format, infer): @pytest.mark.parametrize("utc", [True, None]) @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) - @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index]) + @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index, deque]) def test_to_datetime_cache(self, utc, format, constructor): date = "20130101 00:00:00" test_dates = [date] * 10 ** 5 @@ -849,6 +871,24 @@ def test_to_datetime_cache(self, utc, format, constructor): tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "listlike", + [ + (deque([pd.Timestamp("2010-06-02 09:30:00")] * 51)), + ([pd.Timestamp("2010-06-02 09:30:00")] * 51), + (tuple([pd.Timestamp("2010-06-02 09:30:00")] * 51)), + ], + ) + def test_no_slicing_errors_in_should_cache(self, listlike): + # GH 29403 + assert tools.should_cache(listlike) is True + + def test_to_datetime_from_deque(self): + # GH 29403 + result = pd.to_datetime(deque([pd.Timestamp("2010-06-02 09:30:00")] * 51)) + expected = pd.to_datetime([pd.Timestamp("2010-06-02 09:30:00")] * 51) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("utc", [True, None]) @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) def test_to_datetime_cache_series(self, utc, format): @@ -921,22 +961,6 @@ def test_iso_8601_strings_with_same_offset(self): result = DatetimeIndex([ts_str] * 2) tm.assert_index_equal(result, expected) - def test_iso_8601_strings_same_offset_no_box(self): - # GH 22446 - data = ["2018-01-04 09:01:00+09:00", "2018-01-04 09:02:00+09:00"] - - with tm.assert_produces_warning(FutureWarning): - result = pd.to_datetime(data, box=False) - - expected = np.array( - [ - datetime(2018, 1, 4, 9, 1, tzinfo=pytz.FixedOffset(540)), - datetime(2018, 1, 4, 9, 2, tzinfo=pytz.FixedOffset(540)), - ], - dtype=object, - ) - tm.assert_numpy_array_equal(result, expected) - def test_iso_8601_strings_with_different_offsets(self): # GH 17697, 11736 ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT] @@ -1024,16 +1048,6 @@ def test_timestamp_utc_true(self, ts, expected): result = to_datetime(ts, utc=True) assert result == expected - def test_to_datetime_box_deprecated(self): - expected = np.datetime64("2018-09-09") - - # Deprecated - see GH24416 - with tm.assert_produces_warning(FutureWarning): - pd.to_datetime(expected, box=False) - - result = pd.to_datetime(expected).to_datetime64() - assert result == expected - @pytest.mark.parametrize("dt_str", ["00010101", "13000101", "30000101", "99990101"]) def test_to_datetime_with_format_out_of_bounds(self, dt_str): # GH 9107 @@ -1045,7 +1059,7 @@ class TestToDatetimeUnit: @pytest.mark.parametrize("cache", [True, False]) def test_unit(self, cache): # GH 11758 - # test proper behavior with erros + # test proper behavior with errors with pytest.raises(ValueError): to_datetime([1], unit="D", format="%Y%m%d", cache=cache) @@ -1284,7 +1298,7 @@ def 
test_dataframe(self, cache): tm.assert_series_equal(result, expected) # extra columns - msg = "extra keys have been passed to the datetime assemblage: " r"\[foo\]" + msg = r"extra keys have been passed to the datetime assemblage: \[foo\]" with pytest.raises(ValueError, match=msg): df2 = df.copy() df2["foo"] = 1 @@ -1345,16 +1359,6 @@ def test_dataframe_dtypes(self, cache): with pytest.raises(ValueError): to_datetime(df, cache=cache) - def test_dataframe_box_false(self): - # GH 23760 - df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) - - with tm.assert_produces_warning(FutureWarning): - result = pd.to_datetime(df, box=False) - - expected = np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") - tm.assert_numpy_array_equal(result, expected) - def test_dataframe_utc_true(self): # GH 23760 df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) @@ -2287,3 +2291,25 @@ def test_should_cache_errors(unique_share, check_count, err_message): with pytest.raises(AssertionError, match=err_message): tools.should_cache(arg, unique_share, check_count) + + +def test_nullable_integer_to_datetime(): + # Test for #30050 + ser = pd.Series([1, 2, None, 2 ** 61, None]) + ser = ser.astype("Int64") + ser_copy = ser.copy() + + res = pd.to_datetime(ser, unit="ns") + + expected = pd.Series( + [ + np.datetime64("1970-01-01 00:00:00.000000001"), + np.datetime64("1970-01-01 00:00:00.000000002"), + np.datetime64("NaT"), + np.datetime64("2043-01-25 23:56:49.213693952"), + np.datetime64("NaT"), + ] + ) + tm.assert_series_equal(res, expected) + # Check that ser isn't mutated + tm.assert_series_equal(ser, ser_copy) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 708cd8a4579e8..2b1742d58b77e 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -12,7 +12,7 @@ Timestamp, interval_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class Base: diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index 339bdaf79c690..d8c2ba8413cfb 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -2,8 +2,8 @@ import pytest from pandas import IntervalIndex, Series, date_range +import pandas._testing as tm from pandas.tests.indexes.common import Base -import pandas.util.testing as tm class TestBase(Base): @@ -79,3 +79,10 @@ def test_where(self, closed, klass): expected = IntervalIndex([np.nan] + idx[1:].tolist()) result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) + + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = self.create_index() + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + idx[:, None] diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_constructors.py similarity index 99% rename from pandas/tests/indexes/interval/test_construction.py rename to pandas/tests/indexes/interval/test_constructors.py index 98c1f7c6c2a8a..13a45df743cf5 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -19,9 +19,9 @@ period_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import IntervalArray import 
pandas.core.common as com -import pandas.util.testing as tm @pytest.fixture(params=[None, "foo"]) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index dcc0c818182ab..7acf5c1e0906c 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, IntervalIndex, Series, Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestIntervalIndexRendering: diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 05d8aee2a8fb7..1bfc58733a110 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -3,9 +3,16 @@ import numpy as np import pytest -from pandas import Interval, IntervalIndex, Timedelta, date_range, timedelta_range +from pandas import ( + CategoricalIndex, + Interval, + IntervalIndex, + Timedelta, + date_range, + timedelta_range, +) +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError -import pandas.util.testing as tm class TestGetLoc: @@ -231,6 +238,25 @@ def test_get_indexer_length_one_interval(self, size, closed): expected = np.array([0] * size, dtype="intp") tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( + "target", + [ + IntervalIndex.from_tuples([(7, 8), (1, 2), (3, 4), (0, 1)]), + IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4), np.nan]), + IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)], closed="both"), + [-1, 0, 0.5, 1, 2, 2.5, np.nan], + ["foo", "foo", "bar", "baz"], + ], + ) + def test_get_indexer_categorical(self, target, ordered_fixture): + # GH 30063: categorical and non-categorical results should be consistent + index = IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]) + categorical_target = CategoricalIndex(target, ordered=ordered_fixture) + + result = index.get_indexer(categorical_target) + expected = index.get_indexer(target) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "tuples, closed", [ @@ -323,8 +349,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get left slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get left slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) @@ -332,8 +358,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get left slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get left slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): index.slice_locs(start=Interval(0, 2)) @@ -343,8 +369,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get right slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get right slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): index.slice_locs(end=Interval(0, 2)) @@ -352,8 +378,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get right slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get right slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) 
@@ -405,8 +431,8 @@ def test_slice_locs_with_ints_and_floats_errors(self, tuples, query): with pytest.raises( KeyError, match=( - "'can only get slices from an IntervalIndex if bounds are" - " non-overlapping and all monotonic increasing or decreasing'" + "'can only get slices from an IntervalIndex if bounds are " + "non-overlapping and all monotonic increasing or decreasing'" ), ): index.slice_locs(start, stop) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 73eacd8c4856e..47a0ba7fe0f21 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -17,8 +17,8 @@ notna, timedelta_range, ) +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm @pytest.fixture(scope="class", params=[None, "foo"]) @@ -105,11 +105,11 @@ def test_with_nans(self, closed): assert index.hasnans is False result = index.isna() - expected = np.repeat(False, len(index)) + expected = np.zeros(len(index), dtype=bool) tm.assert_numpy_array_equal(result, expected) result = index.notna() - expected = np.repeat(True, len(index)) + expected = np.ones(len(index), dtype=bool) tm.assert_numpy_array_equal(result, expected) index = self.create_index_with_nan(closed=closed) @@ -586,8 +586,8 @@ def test_missing_values(self, closed): assert idx.equals(idx2) msg = ( - "missing values must be missing in the same location both left" - " and right sides" + "missing values must be missing in the same location both left " + "and right sides" ) with pytest.raises(ValueError, match=msg): IntervalIndex.from_arrays( @@ -836,17 +836,6 @@ def test_nbytes(self): expected = 64 # 4 * 8 * 2 assert result == expected - def test_itemsize(self): - # GH 19209 - left = np.arange(0, 4, dtype="i8") - right = np.arange(1, 5, dtype="i8") - expected = 16 # 8 * 2 - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = IntervalIndex.from_arrays(left, right).itemsize - - assert result == expected - @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) def test_set_closed(self, name, closed, new_closed): # GH 21670 diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index b102444b4ec9c..2f28c33a3bbc6 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -15,7 +15,7 @@ interval_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Day @@ -84,7 +84,7 @@ def test_constructor_timestamp(self, closed, name, freq, periods, tz): tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods - if not breaks.freq.isAnchored() and tz is None: + if not breaks.freq.is_anchored() and tz is None: # matches expected only for non-anchored offsets and tz naive # (anchored/DST transitions cause unequal spacing in expected) result = interval_range( diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 87f9eaa209277..476ec1dd10b4b 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -6,7 +6,7 @@ from pandas._libs.interval import IntervalTree from pandas import compat -import pandas.util.testing as tm +import pandas._testing as tm def skipif_32bit(param): @@ -20,9 +20,7 @@ def 
skipif_32bit(param): return pytest.param(param, marks=marks) -@pytest.fixture( - scope="class", params=["int32", "int64", "float32", "float64", "uint64"] -) +@pytest.fixture(scope="class", params=["int64", "float64", "uint64"]) def dtype(request): return request.param @@ -39,12 +37,9 @@ def leaf_size(request): @pytest.fixture( params=[ np.arange(5, dtype="int64"), - np.arange(5, dtype="int32"), np.arange(5, dtype="uint64"), np.arange(5, dtype="float64"), - np.arange(5, dtype="float32"), np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"), - np.array([0, 1, 2, 3, 4, np.nan], dtype="float32"), ] ) def tree(request, leaf_size): @@ -53,18 +48,6 @@ def tree(request, leaf_size): class TestIntervalTree: - def test_get_loc(self, tree): - result = tree.get_loc(1) - expected = np.array([0], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - - result = np.sort(tree.get_loc(2)) - expected = np.array([0, 1], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - - with pytest.raises(KeyError, match="-1"): - tree.get_loc(-1) - def test_get_indexer(self, tree): result = tree.get_indexer(np.array([1.0, 5.5, 6.5])) expected = np.array([0, 4, -1], dtype="intp") @@ -75,6 +58,18 @@ def test_get_indexer(self, tree): ): tree.get_indexer(np.array([3.0])) + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 1], dtype=dtype), np.array([1, 2], dtype=dtype) + tree = IntervalTree(left, right) + + result = tree.get_indexer(np.array([target_value], dtype=target_dtype)) + expected = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_non_unique(self, tree): indexer, missing = tree.get_indexer_non_unique(np.array([1.0, 2.0, 6.5])) @@ -94,14 +89,26 @@ def test_get_indexer_non_unique(self, tree): expected = np.array([2], dtype="intp") tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_non_unique_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 2], dtype=dtype), np.array([1, 3], dtype=dtype) + tree = IntervalTree(left, right) + target = np.array([target_value], dtype=target_dtype) + + result_indexer, result_missing = tree.get_indexer_non_unique(target) + expected_indexer = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + expected_missing = np.array([0], dtype="intp") + tm.assert_numpy_array_equal(result_missing, expected_missing) + def test_duplicates(self, dtype): left = np.array([0, 0, 0], dtype=dtype) tree = IntervalTree(left, left + 1) - result = np.sort(tree.get_loc(0.5)) - expected = np.array([0, 1, 2], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - with pytest.raises( KeyError, match="'indexer does not intersect a unique set of intervals'" ): @@ -116,17 +123,6 @@ def test_duplicates(self, dtype): expected = np.array([], dtype="intp") tm.assert_numpy_array_equal(result, expected) - def test_get_loc_closed(self, closed): - tree = IntervalTree([0], [1], closed=closed) - for p, errors in [(0, tree.open_left), (1, tree.open_right)]: - if errors: - with pytest.raises(KeyError, match=str(p)): - tree.get_loc(p) - else: - result = tree.get_loc(p) - expected = np.array([0], dtype="intp") - tm.assert_numpy_array_equal(result, expected) 
- @pytest.mark.parametrize( "leaf_size", [skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000] ) @@ -147,25 +143,25 @@ def test_get_indexer_closed(self, closed, leaf_size): @pytest.mark.parametrize( "left, right, expected", [ - (np.array([0, 1, 4]), np.array([2, 3, 5]), True), - (np.array([0, 1, 2]), np.array([5, 4, 3]), True), + (np.array([0, 1, 4], dtype="int64"), np.array([2, 3, 5]), True), + (np.array([0, 1, 2], dtype="int64"), np.array([5, 4, 3]), True), (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), - (np.array([0, 2, 4]), np.array([1, 3, 5]), False), + (np.array([0, 2, 4], dtype="int64"), np.array([1, 3, 5]), False), (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False), ], ) - @pytest.mark.parametrize("order", map(list, permutations(range(3)))) + @pytest.mark.parametrize("order", (list(x) for x in permutations(range(3)))) def test_is_overlapping(self, closed, order, left, right, expected): # GH 23309 tree = IntervalTree(left[order], right[order], closed=closed) result = tree.is_overlapping assert result is expected - @pytest.mark.parametrize("order", map(list, permutations(range(3)))) + @pytest.mark.parametrize("order", (list(x) for x in permutations(range(3)))) def test_is_overlapping_endpoints(self, closed, order): """shared endpoints are marked as overlapping""" # GH 23309 - left, right = np.arange(3), np.arange(1, 4) + left, right = np.arange(3, dtype="int64"), np.arange(1, 4) tree = IntervalTree(left[order], right[order], closed=closed) result = tree.is_overlapping expected = closed == "both" @@ -188,7 +184,7 @@ def test_is_overlapping_trivial(self, closed, left, right): @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440") def test_construction_overflow(self): # GH 25485 - left, right = np.arange(101), [np.iinfo(np.int64).max] * 101 + left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101 tree = IntervalTree(left, right) # pivot should be average of left/right medians diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index 89e733c30b1e3..3246ac6bafde9 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -2,7 +2,7 @@ import pytest from pandas import Index, IntervalIndex, Timestamp, interval_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(scope="class", params=[None, "foo"]) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 36152bc4b60cd..ac1e0893683d1 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Index, MultiIndex, date_range, period_range -import pandas.util.testing as tm +import pandas._testing as tm def test_shift(idx): @@ -277,7 +277,7 @@ def test_map(idx): def test_map_dictlike(idx, mapper): if isinstance(idx, (pd.CategoricalIndex, pd.IntervalIndex)): - pytest.skip("skipping tests for {}".format(type(idx))) + pytest.skip(f"skipping tests for {type(idx)}") identity = mapper(idx.values, idx) @@ -330,13 +330,13 @@ def test_numpy_ufuncs(idx, func): if _np_version_under1p17: expected_exception = AttributeError - msg = "'tuple' object has no attribute '{}'".format(func.__name__) + msg = f"'tuple' object has no attribute '{func.__name__}'" else: expected_exception = TypeError msg = ( "loop of ufunc does not support argument 0 of type tuple which" - " has no callable {} method" - 
).format(func.__name__) + f" has no callable {func.__name__} method" + ) with pytest.raises(expected_exception, match=msg): func(idx) @@ -348,9 +348,9 @@ def test_numpy_ufuncs(idx, func): ) def test_numpy_type_funcs(idx, func): msg = ( - "ufunc '{}' not supported for the input types, and the inputs" - " could not be safely coerced to any supported types according to" - " the casting rule ''safe''" - ).format(func.__name__) + f"ufunc '{func.__name__}' not supported for the input types, and the inputs " + "could not be safely coerced to any supported types according to " + "the casting rule ''safe''" + ) with pytest.raises(TypeError, match=msg): func(idx) diff --git a/pandas/tests/indexes/multi/test_astype.py b/pandas/tests/indexes/multi/test_astype.py index 93fdeb10b849a..29908537fbe59 100644 --- a/pandas/tests/indexes/multi/test_astype.py +++ b/pandas/tests/indexes/multi/test_astype.py @@ -3,7 +3,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype -import pandas.util.testing as tm +import pandas._testing as tm def test_astype(idx): diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index b02f87dc4aacb..d92cff1e10496 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -2,7 +2,7 @@ import pytest from pandas import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_numeric_compat(idx): diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructors.py similarity index 96% rename from pandas/tests/indexes/multi/test_constructor.py rename to pandas/tests/indexes/multi/test_constructors.py index c32adf275ac98..2c4b3ce04f96d 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import numpy as np import pytest @@ -9,7 +7,7 @@ import pandas as pd from pandas import Index, MultiIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm def test_constructor_single_level(): @@ -67,8 +65,8 @@ def test_constructor_mismatched_codes_levels(idx): MultiIndex(levels=levels, codes=codes) length_error = ( - r"On level 0, code max \(3\) >= length of level \(1\)\." - " NOTE: this index is in an inconsistent state" + r"On level 0, code max \(3\) >= length of level \(1\)\. 
" + "NOTE: this index is in an inconsistent state" ) label_error = r"Unequal code lengths: \[4, 2\]" code_value_error = r"On level 0, code value \(-2\) < -1" @@ -128,18 +126,6 @@ def test_na_levels(): tm.assert_index_equal(result, expected) -def test_labels_deprecated(idx): - # GH23752 - with tm.assert_produces_warning(FutureWarning): - MultiIndex( - levels=[["foo", "bar", "baz", "qux"]], - labels=[[0, 1, 2, 3]], - names=["first"], - ) - with tm.assert_produces_warning(FutureWarning): - idx.labels - - def test_copy_in_constructor(): levels = np.array(["a", "b", "c"]) codes = np.array([1, 1, 2, 0, 0, 1, 1]) @@ -591,6 +577,17 @@ def test_from_product_respects_none_names(): tm.assert_index_equal(result, expected) +def test_from_product_readonly(): + # GH#15286 passing read-only array to from_product + a = np.array(range(3)) + b = ["a", "b"] + expected = MultiIndex.from_product([a, b]) + + a.setflags(write=False) + result = MultiIndex.from_product([a, b]) + tm.assert_index_equal(result, expected) + + def test_create_index_existing_name(idx): # GH11193, when an existing index is passed, and a new name is not @@ -609,12 +606,11 @@ def test_create_index_existing_name(idx): ("qux", "two"), ], dtype="object", - ), - names=["foo", "bar"], + ) ) tm.assert_index_equal(result, expected) - result = pd.Index(index, names=["A", "B"]) + result = pd.Index(index, name="A") expected = Index( Index( [ @@ -627,7 +623,7 @@ def test_create_index_existing_name(idx): ], dtype="object", ), - names=["A", "B"], + name="A", ) tm.assert_index_equal(result, expected) @@ -667,14 +663,12 @@ def test_from_frame_error(non_frame): def test_from_frame_dtype_fidelity(): # GH 22420 df = pd.DataFrame( - OrderedDict( - [ - ("dates", pd.date_range("19910905", periods=6, tz="US/Eastern")), - ("a", [1, 1, 1, 2, 2, 2]), - ("b", pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True)), - ("c", ["x", "x", "y", "z", "x", "y"]), - ] - ) + { + "dates": pd.date_range("19910905", periods=6, tz="US/Eastern"), + "a": [1, 1, 1, 2, 2, 2], + "b": pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + "c": ["x", "x", "y", "z", "x", "y"], + } ) original_dtypes = df.dtypes.to_dict() diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_contains.py index 64d2859cd13db..49aa63210cd5e 100644 --- a/pandas/tests/indexes/multi/test_contains.py +++ b/pandas/tests/indexes/multi/test_contains.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_contains_top_level(): @@ -98,3 +98,27 @@ def test_isin_level_kwarg(): with pytest.raises(KeyError, match="'Level C not found'"): idx.isin(vals_1, level="C") + + +def test_contains_with_missing_value(): + # issue 19132 + idx = MultiIndex.from_arrays([[1, np.nan, 2]]) + assert np.nan in idx + + idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]]) + assert np.nan not in idx + assert (1, np.nan) in idx + + +@pytest.mark.parametrize( + "labels,expected,level", + [ + ([("b", np.nan)], np.array([False, False, True]), None,), + ([np.nan, "a"], np.array([True, True, False]), 0), + (["d", np.nan], np.array([False, True, True]), 1), + ], +) +def test_isin_multi_index_with_missing_value(labels, expected, level): + # GH 19132 + midx = MultiIndex.from_arrays([[np.nan, "a", "b"], ["c", "d", np.nan]]) + tm.assert_numpy_array_equal(midx.isin(labels, level=level), expected) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 
3fc73dd05bc72..8956e6ed4996f 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -1,11 +1,9 @@ -from collections import OrderedDict - import numpy as np import pytest import pandas as pd from pandas import DataFrame, MultiIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm def test_tolist(idx): @@ -107,14 +105,12 @@ def test_to_frame_dtype_fidelity(): original_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} expected_df = pd.DataFrame( - OrderedDict( - [ - ("dates", pd.date_range("19910905", periods=6, tz="US/Eastern")), - ("a", [1, 1, 1, 2, 2, 2]), - ("b", pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True)), - ("c", ["x", "x", "y", "z", "x", "y"]), - ] - ) + { + "dates": pd.date_range("19910905", periods=6, tz="US/Eastern"), + "a": [1, 1, 1, 2, 2, 2], + "b": pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + "c": ["x", "x", "y", "z", "x", "y"], + } ) df = mi.to_frame(index=False) df_dtypes = df.dtypes.to_dict() @@ -133,59 +129,8 @@ def test_to_frame_resulting_column_order(): assert result == expected -def test_to_hierarchical(): - index = MultiIndex.from_tuples([(1, "one"), (1, "two"), (2, "one"), (2, "two")]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = index.to_hierarchical(3) - expected = MultiIndex( - levels=[[1, 2], ["one", "two"]], - codes=[ - [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1], - ], - ) - tm.assert_index_equal(result, expected) - assert result.names == index.names - - # K > 1 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = index.to_hierarchical(3, 2) - expected = MultiIndex( - levels=[[1, 2], ["one", "two"]], - codes=[ - [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], - ], - ) - tm.assert_index_equal(result, expected) - assert result.names == index.names - - # non-sorted - index = MultiIndex.from_tuples( - [(2, "c"), (1, "b"), (2, "a"), (2, "b")], names=["N1", "N2"] - ) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = index.to_hierarchical(2) - expected = MultiIndex.from_tuples( - [ - (2, "c"), - (2, "c"), - (1, "b"), - (1, "b"), - (2, "a"), - (2, "a"), - (2, "b"), - (2, "b"), - ], - names=["N1", "N2"], - ) - tm.assert_index_equal(result, expected) - assert result.names == index.names - - def test_roundtrip_pickle_with_tz(): - return + return # FIXME: this can't be right? # GH 8367 # round-trip of timezone @@ -198,7 +143,7 @@ def test_roundtrip_pickle_with_tz(): def test_pickle(indices): - return + return # FIXME: this can't be right? 
unpickled = tm.round_trip_pickle(indices) assert indices.equals(unpickled) diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 2668197535fcc..1acc65aef8b8a 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -3,7 +3,7 @@ import pytest from pandas import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def assert_multiindex_copied(copy, original): @@ -35,12 +35,6 @@ def test_shallow_copy(idx): assert_multiindex_copied(i_copy, idx) -def test_labels_deprecated(idx): - # GH23752 - with tm.assert_produces_warning(FutureWarning): - idx.copy(labels=idx.codes) - - def test_view(idx): i_view = idx.view() assert_multiindex_copied(i_view, idx) diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index 2c24c5bd57085..b909025b3f2f9 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_drop(idx): @@ -108,8 +108,8 @@ def test_droplevel_list(): assert dropped.equals(expected) msg = ( - "Cannot remove 3 levels from an index with 3 levels: at least one" - " level must be left" + "Cannot remove 3 levels from an index with 3 levels: " + "at least one level must be left" ) with pytest.raises(ValueError, match=msg): index[:2].droplevel(["one", "two", "three"]) @@ -139,3 +139,52 @@ def test_drop_not_lexsorted(): tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi) with tm.assert_produces_warning(PerformanceWarning): tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a")) + + +@pytest.mark.parametrize( + "msg,labels,level", + [ + (r"labels \[4\] not found in level", 4, "a"), + (r"labels \[7\] not found in level", 7, "b"), + ], +) +def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + with pytest.raises(KeyError, match=msg): + s.drop(labels, level=level) + with pytest.raises(KeyError, match=msg): + df.drop(labels, level=level) + + +@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")]) +def test_drop_errors_ignore(labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + expected_s = s.drop(labels, level=level, errors="ignore") + tm.assert_series_equal(s, expected_s) + + expected_df = df.drop(labels, level=level, errors="ignore") + tm.assert_frame_equal(df, expected_df) + + +def test_drop_with_non_unique_datetime_index_and_invalid_keys(): + # GH 30399 + + # define dataframe with unique datetime index + df = pd.DataFrame( + np.random.randn(5, 3), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="H", periods=5), + ) + # create dataframe with non-unique datetime index + df = df.iloc[[0, 2, 2, 3]].copy() + + with pytest.raises(KeyError, match="not found in axis"): + df.drop(["a", "b"]) # Dropping with labels not exist in the index diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 518bd093b23b1..93e1de535835f 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -6,7 +6,7 @@ from pandas._libs 
import hashtable from pandas import DatetimeIndex, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("names", [None, ["first", "second"]]) @@ -251,16 +251,13 @@ def test_duplicated_large(keep): tm.assert_numpy_array_equal(result, expected) -def test_get_duplicates(): +def test_duplicated2(): + # TODO: more informative test name # GH5873 for a in [101, 102]: mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]]) assert not mi.has_duplicates - with tm.assert_produces_warning(FutureWarning): - # Deprecated - see GH20239 - assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []])) - tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool")) for n in range(1, 6): # 1st level shape @@ -274,10 +271,6 @@ def test_get_duplicates(): assert len(mi) == (n + 1) * (m + 1) assert not mi.has_duplicates - with tm.assert_produces_warning(FutureWarning): - # Deprecated - see GH20239 - assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []])) - tm.assert_numpy_array_equal( mi.duplicated(), np.zeros(len(mi), dtype="bool") ) diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index f61ba0132ab97..063ede028add7 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm def test_equals(idx): @@ -146,7 +146,10 @@ def test_identical(idx): assert mi.identical(mi2) mi3 = Index(mi.tolist(), names=mi.names) - mi4 = Index(mi.tolist(), names=mi.names, tupleize_cols=False) + msg = r"Unexpected keyword arguments {'names'}" + with pytest.raises(TypeError, match=msg): + Index(mi.tolist(), names=mi.names, tupleize_cols=False) + mi4 = Index(mi.tolist(), tupleize_cols=False) assert mi.identical(mi3) assert not mi.identical(mi4) assert mi.equals(mi4) diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py index a7f58b9ea78bd..75f23fb2f32ba 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_format.py @@ -4,14 +4,7 @@ import pandas as pd from pandas import MultiIndex -import pandas.util.testing as tm - - -def test_dtype_str(indices): - with tm.assert_produces_warning(FutureWarning): - dtype = indices.dtype_str - assert isinstance(dtype, str) - assert dtype == str(indices.dtype) +import pandas._testing as tm def test_format(idx): diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 5ab817d8468c3..074072ae581b2 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import CategoricalIndex, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def assert_matching(actual, expected, check_dtype=False): @@ -306,27 +306,6 @@ def test_set_codes(idx): result.set_codes(codes=new_codes, level=1, inplace=True) assert result.equals(expected) - with tm.assert_produces_warning(FutureWarning): - ind.set_codes(labels=new_codes, level=1) - - -def test_set_labels_deprecated(): - # GH23752 - ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)]) - new_labels = range(129, -1, -1) - expected = pd.MultiIndex.from_tuples([(0, i) for i in new_labels]) - - # [w/o mutation] - with tm.assert_produces_warning(FutureWarning): - result = 
ind.set_labels(labels=new_labels, level=1) - assert result.equals(expected) - - # [w/ mutation] - result = ind.copy() - with tm.assert_produces_warning(FutureWarning): - result.set_labels(labels=new_labels, level=1, inplace=True) - assert result.equals(expected) - def test_set_levels_codes_names_bad_input(idx): levels, codes = idx.levels, idx.codes diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 9ef2a77205acc..ad6f06d065150 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -12,8 +12,8 @@ MultiIndex, date_range, ) +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError -import pandas.util.testing as tm def test_slice_locs_partial(idx): @@ -437,3 +437,91 @@ def test_timestamp_multiindex_indexer(): ) should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo") tm.assert_series_equal(result, should_be) + + +def test_get_loc_with_values_including_missing_values(): + # issue 19132 + idx = MultiIndex.from_product([[np.nan, 1]] * 2) + expected = slice(0, 2, None) + assert idx.get_loc(np.nan) == expected + + idx = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]]) + expected = np.array([True, False, False, True]) + tm.assert_numpy_array_equal(idx.get_loc(np.nan), expected) + + idx = MultiIndex.from_product([[np.nan, 1]] * 3) + expected = slice(2, 4, None) + assert idx.get_loc((np.nan, 1)) == expected + + +@pytest.mark.parametrize( + "index_arr,labels,expected", + [ + ( + [[1, np.nan, 2], [3, 4, 5]], + [1, np.nan, 2], + np.array([-1, -1, -1], dtype=np.intp), + ), + ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)), + ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)), + ( + [[1, 2, 3], [np.nan, 4, 5]], + [np.nan, 4, 5], + np.array([-1, -1, -1], dtype=np.intp), + ), + ], +) +def test_get_indexer_with_missing_value(index_arr, labels, expected): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.get_indexer(labels) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "index_arr,expected,target,algo", + [ + ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"), + ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"), + ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"), + ], +) +def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.get_slice_bound(target, side=algo, kind="loc") + assert result == expected + + +@pytest.mark.parametrize( + "index_arr,expected,start_idx,end_idx", + [ + ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2, None), np.nan, 1), + ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3, None), np.nan, (2, 5)), + ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), 3), + ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), (3, 5)), + ], +) +def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.slice_indexer(start=start_idx, end=end_idx) + assert result == expected + + +@pytest.mark.parametrize( + "index_arr,expected,start_idx,end_idx", + [ + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None), + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"), + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", 
np.nan), None), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")), + ], +) +def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.slice_locs(start=start_idx, end=end_idx) + assert result == expected diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index 472a404c2a8ef..f2ec15e0af88c 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import IntervalIndex, MultiIndex, RangeIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_labels_dtypes(): @@ -49,9 +49,8 @@ def test_values_multiindex_datetimeindex(): # Test to ensure we hit the boxing / nobox part of MI.values ints = np.arange(10 ** 18, 10 ** 18 + 5) naive = pd.DatetimeIndex(ints) - # TODO(GH-24559): Remove the FutureWarning - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - aware = pd.DatetimeIndex(ints, tz="US/Central") + + aware = pd.DatetimeIndex(ints, tz="US/Central") idx = pd.MultiIndex.from_arrays([naive, aware]) result = idx.values @@ -210,7 +209,7 @@ def test_metadata_immutable(idx): # ditto for labels with pytest.raises(TypeError, match=mutable_regex): codes[0] = codes[0] - with pytest.raises(TypeError, match=mutable_regex): + with pytest.raises(ValueError, match="assignment destination is read-only"): codes[0][0] = codes[0][0] # and for names names = idx.names @@ -253,9 +252,7 @@ def test_rangeindex_fallback_coercion_bug(): def test_hash_error(indices): index = indices - with pytest.raises( - TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) - ): + with pytest.raises(TypeError, match=f"unhashable type: '{type(index).__name__}'"): hash(indices) diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 42d8cf761842e..062fb92c44552 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -87,3 +87,19 @@ def test_join_self_unique(idx, join_type): if idx.is_unique: joined = idx.join(idx, how=join_type) assert (idx == joined).all() + + +def test_join_multi_wrong_order(): + # GH 25760 + # GH 28956 + + midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"]) + midx2 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["b", "a"]) + + join_idx, lidx, ridx = midx1.join(midx2, return_indexers=False) + + exp_ridx = np.array([-1, -1, -1, -1], dtype=np.intp) + + tm.assert_index_equal(midx1, join_idx) + assert lidx is None + tm.assert_numpy_array_equal(ridx, exp_ridx) diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 15bbd2ce97c3c..a17e1e9928bff 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import Int64Index, MultiIndex, PeriodIndex, UInt64Index +import pandas._testing as tm from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm def test_fillna(idx): @@ -42,9 +42,9 @@ def test_fillna(idx): values[1] = np.nan if isinstance(index, PeriodIndex): - idx = 
index.__class__(values, freq=index.freq) + idx = type(index)(values, freq=index.freq) else: - idx = index.__class__(values) + idx = type(index)(values) expected = np.array([False] * len(idx), dtype=bool) expected[1] = True @@ -101,7 +101,7 @@ def test_nulls(idx): idx.isna() -@pytest.mark.xfail +@pytest.mark.xfail(reason="isna is not defined for MultiIndex") def test_hasnans_isnans(idx): # GH 11343, added tests for hasnans / isnans index = idx.copy() @@ -115,7 +115,7 @@ def test_hasnans_isnans(idx): values = index.values values[1] = np.nan - index = idx.__class__(values) + index = type(idx)(values) expected = np.array([False] * len(index), dtype=bool) expected[1] = True diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 5c3a48c9dd481..479b5ef0211a0 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def check_level_names(index, names): @@ -124,3 +124,20 @@ def test_get_names_from_levels(): assert idx.levels[0].name == "a" assert idx.levels[1].name == "b" + + +def test_setting_names_from_levels_raises(): + idx = pd.MultiIndex.from_product([["a"], [1, 2]], names=["a", "b"]) + with pytest.raises(RuntimeError, match="set_names"): + idx.levels[0].name = "foo" + + with pytest.raises(RuntimeError, match="set_names"): + idx.levels[1].name = "foo" + + new = pd.Series(1, index=idx.levels[0]) + with pytest.raises(RuntimeError, match="set_names"): + new.index.name = "bar" + + assert pd.Index._no_setting_name is False + assert pd.Int64Index._no_setting_name is False + assert pd.RangeIndex._no_setting_name is False diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index 5db1296d828ca..b00018d2ceb69 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm def test_partial_string_timestamp_multiindex(): diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 513efa8941de8..ceb14aa82a76c 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_reindex(idx): diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 37df420e9ea2e..2e39c714ca7af 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_insert(idx): diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_setops.py similarity index 99% rename from pandas/tests/indexes/multi/test_set_ops.py rename to pandas/tests/indexes/multi/test_setops.py index 835784054261e..841e3b3f17b38 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm 
@pytest.mark.parametrize("case", [0.5, "xxx"]) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 3dee1dbecf3ba..277bd79cfe953 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, RangeIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_sortlevel(idx): @@ -120,7 +120,7 @@ def test_unsortedindex(): def test_unsortedindex_doc_examples(): - # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa + # https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa dfm = DataFrame( {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)} ) diff --git a/pandas/tests/indexes/period/test_asfreq.py b/pandas/tests/indexes/period/test_asfreq.py index fd6013ab5ae08..88e800d66f3ad 100644 --- a/pandas/tests/indexes/period/test_asfreq.py +++ b/pandas/tests/indexes/period/test_asfreq.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, PeriodIndex, Series, period_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodIndex: diff --git a/pandas/tests/indexes/period/test_astype.py b/pandas/tests/indexes/period/test_astype.py index fa57ec2b1f7ca..ec386dd9dd11c 100644 --- a/pandas/tests/indexes/period/test_astype.py +++ b/pandas/tests/indexes/period/test_astype.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, Int64Index, NaT, Period, PeriodIndex, period_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodIndexAsType: diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_constructors.py similarity index 85% rename from pandas/tests/indexes/period/test_construction.py rename to pandas/tests/indexes/period/test_constructors.py index 8c75fbbae7de3..27ee915e48e5c 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -7,14 +7,11 @@ import pandas as pd from pandas import Index, Period, PeriodIndex, Series, date_range, offsets, period_range -import pandas.core.indexes.period as period -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays import PeriodArray class TestPeriodIndex: - def setup_method(self, method): - pass - def test_construction_base_constructor(self): # GH 13664 arr = [pd.Period("2011-01", freq="M"), pd.NaT, pd.Period("2011-03", freq="M")] @@ -32,13 +29,34 @@ def test_construction_base_constructor(self): pd.Index(np.array(arr)), pd.Index(np.array(arr), dtype=object) ) + def test_base_constructor_with_period_dtype(self): + dtype = PeriodDtype("D") + values = ["2011-01-01", "2012-03-04", "2014-05-01"] + result = pd.Index(values, dtype=dtype) + + expected = pd.PeriodIndex(values, dtype=dtype) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "values_constructor", [list, np.array, PeriodIndex, PeriodArray._from_sequence] + ) + def test_index_object_dtype(self, values_constructor): + # Index(periods, dtype=object) is an Index (not an PeriodIndex) + periods = [ + pd.Period("2011-01", freq="M"), + pd.NaT, + pd.Period("2011-03", freq="M"), + ] + values = values_constructor(periods) + result = Index(values, dtype=object) + + assert type(result) is Index + tm.assert_numpy_array_equal(result.values, np.array(values)) + def 
test_constructor_use_start_freq(self): # GH #1118 p = Period("4/2/2012", freq="B") - with tm.assert_produces_warning(FutureWarning): - index = PeriodIndex(start=p, periods=10) expected = period_range(start="4/2/2012", periods=10, freq="B") - tm.assert_index_equal(index, expected) index = period_range(start=p, periods=10) tm.assert_index_equal(index, expected) @@ -68,12 +86,6 @@ def test_constructor_field_arrays(self): with pytest.raises(ValueError, match=msg): PeriodIndex(year=years, month=months, freq="2M") - msg = "Can either instantiate from fields or endpoints, but not both" - with pytest.raises(ValueError, match=msg): - PeriodIndex( - year=years, month=months, freq="M", start=Period("2007-01", freq="M") - ) - years = [2007, 2007, 2007] months = [1, 2, 3] idx = PeriodIndex(year=years, month=months, freq="M") @@ -115,26 +127,6 @@ def test_constructor_invalid_quarters(self): PeriodIndex(year=range(2000, 2004), quarter=list(range(4)), freq="Q-DEC") def test_constructor_corner(self): - msg = "Not enough parameters to construct Period range" - with pytest.raises(ValueError, match=msg): - PeriodIndex(periods=10, freq="A") - - start = Period("2007", freq="A-JUN") - end = Period("2010", freq="A-DEC") - - msg = "start and end must have same freq" - with pytest.raises(ValueError, match=msg): - PeriodIndex(start=start, end=end) - - msg = ( - "Of the three parameters: start, end, and periods, exactly two" - " must be specified" - ) - with pytest.raises(ValueError, match=msg): - PeriodIndex(start=start) - with pytest.raises(ValueError, match=msg): - PeriodIndex(end=end) - result = period_range("2007-01", periods=10.5, freq="M") exp = period_range("2007-01", periods=10, freq="M") tm.assert_index_equal(result, exp) @@ -230,7 +222,7 @@ def test_constructor_dtype(self): assert res.dtype == "period[M]" msg = "specified freq and dtype are different" - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex(["2011-01"], freq="M", dtype="period[D]") def test_constructor_empty(self): @@ -290,12 +282,12 @@ def test_constructor_pi_nat(self): def test_constructor_incompat_freq(self): msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex( [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] ) - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex( np.array( [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] @@ -303,12 +295,12 @@ def test_constructor_incompat_freq(self): ) # first element is pd.NaT - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex( [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] ) - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex( np.array( [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] @@ -368,27 +360,20 @@ def test_constructor_year_and_quarter(self): p = PeriodIndex(lops) tm.assert_index_equal(p, idx) - @pytest.mark.parametrize( - "func, warning", [(PeriodIndex, FutureWarning), (period_range, None)] - ) - def test_constructor_freq_mult(self, func, warning): + def test_constructor_freq_mult(self): # GH #7811 - with tm.assert_produces_warning(warning): - # must be the same, but for 
sure... - pidx = func(start="2014-01", freq="2M", periods=4) + pidx = period_range(start="2014-01", freq="2M", periods=4) expected = PeriodIndex(["2014-01", "2014-03", "2014-05", "2014-07"], freq="2M") tm.assert_index_equal(pidx, expected) - with tm.assert_produces_warning(warning): - pidx = func(start="2014-01-02", end="2014-01-15", freq="3D") + pidx = period_range(start="2014-01-02", end="2014-01-15", freq="3D") expected = PeriodIndex( ["2014-01-02", "2014-01-05", "2014-01-08", "2014-01-11", "2014-01-14"], freq="3D", ) tm.assert_index_equal(pidx, expected) - with tm.assert_produces_warning(warning): - pidx = func(end="2014-01-01 17:00", freq="4H", periods=3) + pidx = period_range(end="2014-01-01 17:00", freq="4H", periods=3) expected = PeriodIndex( ["2014-01-01 09:00", "2014-01-01 13:00", "2014-01-01 17:00"], freq="4H" ) @@ -425,18 +410,6 @@ def test_constructor_freq_combined(self): expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 01:00"], freq="25H") tm.assert_index_equal(pidx, expected) - def test_constructor_range_based_deprecated(self): - with tm.assert_produces_warning(FutureWarning): - pi = PeriodIndex(freq="A", start="1/1/2001", end="12/1/2009") - assert len(pi) == 9 - - def test_constructor_range_based_deprecated_different_freq(self): - with tm.assert_produces_warning(FutureWarning) as m: - PeriodIndex(start="2000", periods=2) - - warning, = m - assert 'freq="A-DEC"' in str(warning.message) - def test_constructor(self): pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 @@ -507,21 +480,6 @@ def test_constructor(self): with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex(vals) - def test_constructor_error(self): - start = Period("02-Apr-2005", "B") - end_intv = Period("2006-12-31", ("w", 1)) - - msg = "start and end must have same freq" - with pytest.raises(ValueError, match=msg): - PeriodIndex(start=start, end=end_intv) - - msg = ( - "Of the three parameters: start, end, and periods, " - "exactly two must be specified" - ) - with pytest.raises(ValueError, match=msg): - PeriodIndex(start=start) - @pytest.mark.parametrize( "freq", ["M", "Q", "A", "D", "B", "T", "S", "L", "U", "N", "H"] ) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 2a88b79f381c4..5db373a9f07ae 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import PeriodIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_to_native_types(): diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index c8f0d3b3fe553..7dbefbdaff98e 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DatetimeIndex, Period, PeriodIndex, Series, notna, period_range -import pandas.util.testing as tm +import pandas._testing as tm class TestGetItem: @@ -235,6 +235,21 @@ def test_where_other(self): result = i.where(notna(i2), i2.values) tm.assert_index_equal(result, i2) + def test_where_invalid_dtypes(self): + pi = period_range("20130101", periods=5, freq="D") + + i2 = pi.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + pi[2:].tolist(), freq="D") + + with pytest.raises(TypeError, match="Where requires matching dtype"): + pi.where(notna(i2), i2.asi8) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + pi.where(notna(i2), 
i2.asi8.view("timedelta64[ns]")) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + pi.where(notna(i2), i2.to_timestamp("S")) + class TestTake: def test_take(self): @@ -550,6 +565,35 @@ def test_get_indexer(self): res = idx.get_indexer(target, "nearest", tolerance=pd.Timedelta("1 day")) tm.assert_numpy_array_equal(res, np.array([0, 0, 1, -1], dtype=np.intp)) + def test_get_indexer_mismatched_dtype(self): + # Check that we return all -1s and do not raise or cast incorrectly + + dti = pd.date_range("2016-01-01", periods=3) + pi = dti.to_period("D") + pi2 = dti.to_period("W") + + expected = np.array([-1, -1, -1], dtype=np.intp) + + result = pi.get_indexer(dti) + tm.assert_numpy_array_equal(result, expected) + + # This should work in both directions + result = dti.get_indexer(pi) + tm.assert_numpy_array_equal(result, expected) + + result = pi.get_indexer(pi2) + tm.assert_numpy_array_equal(result, expected) + + # We expect the same from get_indexer_non_unique + result = pi.get_indexer_non_unique(dti)[0] + tm.assert_numpy_array_equal(result, expected) + + result = dti.get_indexer_non_unique(pi)[0] + tm.assert_numpy_array_equal(result, expected) + + result = pi.get_indexer_non_unique(pi2)[0] + tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_non_unique(self): # GH 17717 p1 = pd.Period("2017-09-02") @@ -679,7 +723,7 @@ def test_indexing(self): def test_period_index_indexer(self): # GH4125 idx = pd.period_range("2002-01", "2003-12", freq="M") - df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) + df = pd.DataFrame(np.random.randn(24, 10), index=idx) tm.assert_frame_equal(df, df.loc[idx]) tm.assert_frame_equal(df, df.loc[list(idx)]) tm.assert_frame_equal(df, df.loc[list(idx)]) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 96042f4dbaba2..427d9ab712320 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -3,9 +3,9 @@ import pandas as pd from pandas import DatetimeIndex, Index, NaT, PeriodIndex, Series +import pandas._testing as tm from pandas.core.arrays import PeriodArray -from pandas.tests.test_base import Ops -import pandas.util.testing as tm +from pandas.tests.base.test_ops import Ops class TestPeriodIndexOps(Ops): @@ -343,5 +343,5 @@ def test_freq_setter_deprecated(self): idx.freq # warning for setter - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(AttributeError, match="can't set attribute"): idx.freq = pd.offsets.Day() diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 50a12baf352d9..9ca2dd169416f 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Period, Series, period_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodIndex: @@ -123,7 +123,7 @@ def test_range_slice_outofbounds(self): for idx in [didx, pidx]: df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) - empty = DataFrame(index=idx.__class__([], freq="D"), columns=["units"]) + empty = DataFrame(index=type(idx)([], freq="D"), columns=["units"]) empty["units"] = empty["units"].astype("int64") tm.assert_frame_equal(df["2013/09/01":"2013/09/30"], empty) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index d75bd7bb21827..16fa0b0c25925 100644 --- 
a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -17,7 +17,7 @@ offsets, period_range, ) -import pandas.util.testing as tm +import pandas._testing as tm from ..datetimelike import DatetimeLike @@ -105,28 +105,9 @@ def test_no_millisecond_field(self): with pytest.raises(AttributeError, match=msg): DatetimeIndex([]).millisecond - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_freq(self, sort): - # GH14323: difference of Period MUST preserve frequency - # but the ability to union results must be preserved - - index = period_range("20160920", "20160925", freq="D") - - other = period_range("20160921", "20160924", freq="D") - expected = PeriodIndex(["20160920", "20160925"], freq="D") - idx_diff = index.difference(other, sort) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = period_range("20160922", "20160925", freq="D") - idx_diff = index.difference(other, sort) - expected = PeriodIndex(["20160920", "20160921"], freq="D") - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - def test_hash_error(self): index = period_range("20010101", periods=10) - msg = "unhashable type: '{}'".format(type(index).__name__) + msg = f"unhashable type: '{type(index).__name__}'" with pytest.raises(TypeError, match=msg): hash(index) @@ -156,17 +137,6 @@ def test_shallow_copy_changing_freq_raises(self): with pytest.raises(IncompatibleFrequency, match=msg): pi._shallow_copy(pi, freq="H") - def test_dtype_str(self): - pi = pd.PeriodIndex([], freq="M") - with tm.assert_produces_warning(FutureWarning): - assert pi.dtype_str == "period[M]" - assert pi.dtype_str == str(pi.dtype) - - with tm.assert_produces_warning(FutureWarning): - pi = pd.PeriodIndex([], freq="3M") - assert pi.dtype_str == "period[3M]" - assert pi.dtype_str == str(pi.dtype) - def test_view_asi8(self): idx = pd.PeriodIndex([], freq="M") @@ -191,8 +161,7 @@ def test_values(self): exp = np.array([], dtype=np.object) tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) - with tm.assert_produces_warning(FutureWarning): - tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) @@ -254,8 +223,8 @@ def test_period_index_length(self): i1 = period_range(start=start, end=end_intv) msg = ( - "Of the three parameters: start, end, and periods, exactly two" - " must be specified" + "Of the three parameters: start, end, and periods, exactly two " + "must be specified" ) with pytest.raises(ValueError, match=msg): period_range(start=start) @@ -458,8 +427,8 @@ def test_contains_nat(self): def test_periods_number_check(self): msg = ( - "Of the three parameters: start, end, and periods, exactly two" - " must be specified" + "Of the three parameters: start, end, and periods, exactly two " + "must be specified" ) with pytest.raises(ValueError, match=msg): period_range("2011-1-1", "2012-1-1", "B") @@ -482,7 +451,7 @@ def test_index_duplicate_periods(self): idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) - result = ts[2007] + result = ts["2007"] expected = ts[1:3] tm.assert_series_equal(result, expected) result[:] = 1 @@ -492,8 +461,8 @@ def test_index_duplicate_periods(self): idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) - result = ts[2007] - expected = 
ts[idx == 2007] + result = ts["2007"] + expected = ts[idx == "2007"] tm.assert_series_equal(result, expected) def test_index_unique(self): @@ -540,15 +509,10 @@ def test_pindex_qaccess(self): assert s["05Q4"] == s[2] def test_pindex_multiples(self): - with tm.assert_produces_warning(FutureWarning): - pi = PeriodIndex(start="1/1/11", end="12/31/11", freq="2M") expected = PeriodIndex( ["2011-01", "2011-03", "2011-05", "2011-07", "2011-09", "2011-11"], freq="2M", ) - tm.assert_index_equal(pi, expected) - assert pi.freq == offsets.MonthEnd(2) - assert pi.freqstr == "2M" pi = period_range(start="1/1/11", end="12/31/11", freq="2M") tm.assert_index_equal(pi, expected) diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index 828fab08daceb..2c3d22198df9f 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -1,7 +1,7 @@ import pytest from pandas import NaT, Period, PeriodIndex, date_range, period_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodRange: diff --git a/pandas/tests/indexes/period/test_scalar_compat.py b/pandas/tests/indexes/period/test_scalar_compat.py index 7956b9f26e6ef..d9809f0f75611 100644 --- a/pandas/tests/indexes/period/test_scalar_compat.py +++ b/pandas/tests/indexes/period/test_scalar_compat.py @@ -1,7 +1,7 @@ """Tests for PeriodIndex behaving like a vectorized Period scalar""" from pandas import Timedelta, date_range, period_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodIndexOps: diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 03e4bd5834166..dc7805880784f 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -1,10 +1,11 @@ import numpy as np import pytest +from pandas._libs.tslibs import IncompatibleFrequency + import pandas as pd from pandas import Index, PeriodIndex, date_range, period_range -import pandas.core.indexes.period as period -import pandas.util.testing as tm +import pandas._testing as tm def _permute(obj): @@ -177,11 +178,11 @@ def test_union_misc(self, sort): # raise if different frequencies index = period_range("1/1/2000", "1/20/2000", freq="D") index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") - with pytest.raises(period.IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency): index.union(index2, sort=sort) index3 = period_range("1/1/2000", "1/20/2000", freq="2D") - with pytest.raises(period.IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency): index.join(index3) def test_union_dataframe_index(self): @@ -213,11 +214,11 @@ def test_intersection(self, sort): # raise if different frequencies index = period_range("1/1/2000", "1/20/2000", freq="D") index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") - with pytest.raises(period.IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency): index.intersection(index2, sort=sort) index3 = period_range("1/1/2000", "1/20/2000", freq="2D") - with pytest.raises(period.IncompatibleFrequency): + with pytest.raises(IncompatibleFrequency): index.intersection(index3, sort=sort) @pytest.mark.parametrize("sort", [None, False]) @@ -353,3 +354,22 @@ def test_difference(self, sort): if sort is None: expected = expected.sort_values() tm.assert_index_equal(result_difference, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_freq(self, 
sort): + # GH14323: difference of Period MUST preserve frequency + # but the ability to union results must be preserved + + index = period_range("20160920", "20160925", freq="D") + + other = period_range("20160921", "20160924", freq="D") + expected = PeriodIndex(["20160920", "20160925"], freq="D") + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = period_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other, sort) + expected = PeriodIndex(["20160920", "20160921"], freq="D") + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_shift.py similarity index 95% rename from pandas/tests/indexes/period/test_arithmetic.py rename to pandas/tests/indexes/period/test_shift.py index 80e4b1fe1e430..5689e98c33455 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_shift.py @@ -3,10 +3,10 @@ import pandas as pd from pandas import PeriodIndex, period_range -import pandas.util.testing as tm +import pandas._testing as tm -class TestPeriodIndexArithmetic: +class TestPeriodIndexShift: # --------------------------------------------------------------- # PeriodIndex.shift is used by __add__ and __sub__ @@ -117,5 +117,3 @@ def test_shift_periods(self): idx = period_range(freq="A", start="1/1/2001", end="12/1/2009") tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - tm.assert_index_equal(idx.shift(n=0), idx) diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index a9c0ecd1a3041..28ab14af71362 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas._libs.tslibs import IncompatibleFrequency from pandas._libs.tslibs.ccalendar import MONTHS import pandas as pd @@ -17,8 +18,7 @@ period_range, to_datetime, ) -import pandas.core.indexes.period as period -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodRepresentation: @@ -231,14 +231,43 @@ def test_searchsorted(self, freq): p2 = pd.Period("2014-01-04", freq=freq) assert pidx.searchsorted(p2) == 3 - msg = "Input has different freq=H from PeriodIndex" - with pytest.raises(period.IncompatibleFrequency, match=msg): + assert pidx.searchsorted(pd.NaT) == 0 + + msg = "Input has different freq=H from PeriodArray" + with pytest.raises(IncompatibleFrequency, match=msg): pidx.searchsorted(pd.Period("2014-01-01", freq="H")) - msg = "Input has different freq=5D from PeriodIndex" - with pytest.raises(period.IncompatibleFrequency, match=msg): + msg = "Input has different freq=5D from PeriodArray" + with pytest.raises(IncompatibleFrequency, match=msg): pidx.searchsorted(pd.Period("2014-01-01", freq="5D")) + def test_searchsorted_invalid(self): + pidx = pd.PeriodIndex( + ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"], + freq="D", + ) + + other = np.array([0, 1], dtype=np.int64) + + msg = "requires either a Period or PeriodArray" + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(other) + + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(other.astype("timedelta64[ns]")) + + with pytest.raises(TypeError, match=msg): + 
pidx.searchsorted(np.timedelta64(4)) + + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(np.timedelta64("NaT", "ms")) + + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(np.datetime64(4, "ns")) + + with pytest.raises(TypeError, match=msg): + pidx.searchsorted(np.datetime64("NaT", "ns")) + class TestPeriodIndexConversion: def test_tolist(self): diff --git a/pandas/tests/indexes/ranges/__init__.py b/pandas/tests/indexes/ranges/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/ranges/test_constructors.py b/pandas/tests/indexes/ranges/test_constructors.py new file mode 100644 index 0000000000000..ba1de6d551d6b --- /dev/null +++ b/pandas/tests/indexes/ranges/test_constructors.py @@ -0,0 +1,154 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas import Index, RangeIndex, Series +import pandas._testing as tm + + +class TestRangeIndexConstructors: + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize( + "args, kwargs, start, stop, step", + [ + ((5,), dict(), 0, 5, 1), + ((1, 5), dict(), 1, 5, 1), + ((1, 5, 2), dict(), 1, 5, 2), + ((0,), dict(), 0, 0, 1), + ((0, 0), dict(), 0, 0, 1), + (tuple(), dict(start=0), 0, 0, 1), + (tuple(), dict(stop=0), 0, 0, 1), + ], + ) + def test_constructor(self, args, kwargs, start, stop, step, name): + result = RangeIndex(*args, name=name, **kwargs) + expected = Index(np.arange(start, stop, step, dtype=np.int64), name=name) + assert isinstance(result, RangeIndex) + assert result.name is name + assert result._range == range(start, stop, step) + tm.assert_index_equal(result, expected) + + def test_constructor_invalid_args(self): + msg = "RangeIndex\\(\\.\\.\\.\\) must be called with integers" + with pytest.raises(TypeError, match=msg): + RangeIndex() + + with pytest.raises(TypeError, match=msg): + RangeIndex(name="Foo") + + # invalid args + for i in [ + Index(["a", "b"]), + Series(["a", "b"]), + np.array(["a", "b"]), + [], + "foo", + datetime(2000, 1, 1, 0, 0), + np.arange(0, 10), + np.array([1]), + [1], + ]: + with pytest.raises(TypeError): + RangeIndex(i) + + # we don't allow on a bare Index + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, 0 was passed" + ) + with pytest.raises(TypeError, match=msg): + Index(0, 1000) + + def test_constructor_same(self): + + # pass thru w and w/o copy + index = RangeIndex(1, 5, 2) + result = RangeIndex(index, copy=False) + assert result.identical(index) + + result = RangeIndex(index, copy=True) + tm.assert_index_equal(result, index, exact=True) + + result = RangeIndex(index) + tm.assert_index_equal(result, index, exact=True) + + with pytest.raises( + ValueError, + match="Incorrect `dtype` passed: expected signed integer, received float64", + ): + RangeIndex(index, dtype="float64") + + def test_constructor_range(self): + + msg = "Value needs to be a scalar value, was type " + with pytest.raises(TypeError, match=msg): + result = RangeIndex(range(1, 5, 2)) + + result = RangeIndex.from_range(range(1, 5, 2)) + expected = RangeIndex(1, 5, 2) + tm.assert_index_equal(result, expected, exact=True) + + result = RangeIndex.from_range(range(5, 6)) + expected = RangeIndex(5, 6, 1) + tm.assert_index_equal(result, expected, exact=True) + + # an invalid range + result = RangeIndex.from_range(range(5, 1)) + expected = RangeIndex(0, 0, 1) + tm.assert_index_equal(result, expected, exact=True) + + result = RangeIndex.from_range(range(5)) + expected = RangeIndex(0, 5, 1) + 
tm.assert_index_equal(result, expected, exact=True) + + result = Index(range(1, 5, 2)) + expected = RangeIndex(1, 5, 2) + tm.assert_index_equal(result, expected, exact=True) + + with pytest.raises( + ValueError, + match="Incorrect `dtype` passed: expected signed integer, received float64", + ): + Index(range(1, 5, 2), dtype="float64") + msg = r"^from_range\(\) got an unexpected keyword argument" + with pytest.raises(TypeError, match=msg): + RangeIndex.from_range(range(10), copy=True) + + def test_constructor_name(self): + # GH#12288 + orig = RangeIndex(10) + orig.name = "original" + + copy = RangeIndex(orig) + copy.name = "copy" + + assert orig.name == "original" + assert copy.name == "copy" + + new = Index(copy) + assert new.name == "copy" + + new.name = "new" + assert orig.name == "original" + assert copy.name == "copy" + assert new.name == "new" + + def test_constructor_corner(self): + arr = np.array([1, 2, 3, 4], dtype=object) + index = RangeIndex(1, 5) + assert index.values.dtype == np.int64 + tm.assert_index_equal(index, Index(arr)) + + # non-int raise Exception + with pytest.raises(TypeError): + RangeIndex("1", "10", "1") + with pytest.raises(TypeError): + RangeIndex(1.1, 10.2, 1.3) + + # invalid passed type + with pytest.raises( + ValueError, + match="Incorrect `dtype` passed: expected signed integer, received float64", + ): + RangeIndex(1, 5, dtype="float64") diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/ranges/test_range.py similarity index 68% rename from pandas/tests/indexes/test_range.py rename to pandas/tests/indexes/ranges/test_range.py index fa64e1bacb2e5..8d98ab18963b6 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -1,15 +1,13 @@ -from datetime import datetime, timedelta - import numpy as np import pytest from pandas.core.dtypes.common import ensure_platform_int import pandas as pd -from pandas import Float64Index, Index, Int64Index, RangeIndex, Series -import pandas.util.testing as tm +from pandas import Float64Index, Index, Int64Index, RangeIndex +import pandas._testing as tm -from .test_numeric import Numeric +from ..test_numeric import Numeric # aliases to make some tests easier to read RI = RangeIndex @@ -45,142 +43,6 @@ def test_too_many_names(self): with pytest.raises(ValueError, match="^Length"): index.names = ["roger", "harold"] - @pytest.mark.parametrize("name", [None, "foo"]) - @pytest.mark.parametrize( - "args, kwargs, start, stop, step", - [ - ((5,), dict(), 0, 5, 1), - ((1, 5), dict(), 1, 5, 1), - ((1, 5, 2), dict(), 1, 5, 2), - ((0,), dict(), 0, 0, 1), - ((0, 0), dict(), 0, 0, 1), - (tuple(), dict(start=0), 0, 0, 1), - (tuple(), dict(stop=0), 0, 0, 1), - ], - ) - def test_constructor(self, args, kwargs, start, stop, step, name): - result = RangeIndex(*args, name=name, **kwargs) - expected = Index(np.arange(start, stop, step, dtype=np.int64), name=name) - assert isinstance(result, RangeIndex) - assert result.name is name - assert result._range == range(start, stop, step) - tm.assert_index_equal(result, expected) - - def test_constructor_invalid_args(self): - msg = "RangeIndex\\(\\.\\.\\.\\) must be called with integers" - with pytest.raises(TypeError, match=msg): - RangeIndex() - - with pytest.raises(TypeError, match=msg): - RangeIndex(name="Foo") - - # invalid args - for i in [ - Index(["a", "b"]), - Series(["a", "b"]), - np.array(["a", "b"]), - [], - "foo", - datetime(2000, 1, 1, 0, 0), - np.arange(0, 10), - np.array([1]), - [1], - ]: - with pytest.raises(TypeError): - 
RangeIndex(i) - - # we don't allow on a bare Index - msg = ( - r"Index\(\.\.\.\) must be called with a collection of some " - r"kind, 0 was passed" - ) - with pytest.raises(TypeError, match=msg): - Index(0, 1000) - - def test_constructor_same(self): - - # pass thru w and w/o copy - index = RangeIndex(1, 5, 2) - result = RangeIndex(index, copy=False) - assert result.identical(index) - - result = RangeIndex(index, copy=True) - tm.assert_index_equal(result, index, exact=True) - - result = RangeIndex(index) - tm.assert_index_equal(result, index, exact=True) - - with pytest.raises(TypeError): - RangeIndex(index, dtype="float64") - - def test_constructor_range(self): - - msg = "Value needs to be a scalar value, was type " - with pytest.raises(TypeError, match=msg): - result = RangeIndex(range(1, 5, 2)) - - result = RangeIndex.from_range(range(1, 5, 2)) - expected = RangeIndex(1, 5, 2) - tm.assert_index_equal(result, expected, exact=True) - - result = RangeIndex.from_range(range(5, 6)) - expected = RangeIndex(5, 6, 1) - tm.assert_index_equal(result, expected, exact=True) - - # an invalid range - result = RangeIndex.from_range(range(5, 1)) - expected = RangeIndex(0, 0, 1) - tm.assert_index_equal(result, expected, exact=True) - - result = RangeIndex.from_range(range(5)) - expected = RangeIndex(0, 5, 1) - tm.assert_index_equal(result, expected, exact=True) - - result = Index(range(1, 5, 2)) - expected = RangeIndex(1, 5, 2) - tm.assert_index_equal(result, expected, exact=True) - - with pytest.raises(TypeError): - Index(range(1, 5, 2), dtype="float64") - msg = r"^from_range\(\) got an unexpected keyword argument" - with pytest.raises(TypeError, match=msg): - pd.RangeIndex.from_range(range(10), copy=True) - - def test_constructor_name(self): - # GH12288 - orig = RangeIndex(10) - orig.name = "original" - - copy = RangeIndex(orig) - copy.name = "copy" - - assert orig.name == "original" - assert copy.name == "copy" - - new = Index(copy) - assert new.name == "copy" - - new.name = "new" - assert orig.name == "original" - assert copy.name == "copy" - assert new.name == "new" - - def test_constructor_corner(self): - arr = np.array([1, 2, 3, 4], dtype=object) - index = RangeIndex(1, 5) - assert index.values.dtype == np.int64 - tm.assert_index_equal(index, Index(arr)) - - # non-int raise Exception - with pytest.raises(TypeError): - RangeIndex("1", "10", "1") - with pytest.raises(TypeError): - RangeIndex(1.1, 10.2, 1.3) - - # invalid passed type - with pytest.raises(TypeError): - RangeIndex(1, 5, dtype="float64") - @pytest.mark.parametrize( "index, start, stop, step", [ @@ -200,7 +62,7 @@ def test_start_stop_step_attrs(self, index, start, stop, step): def test_deprecated_start_stop_step_attrs(self, attr_name): # GH 26581 idx = self.create_index() - with tm.assert_produces_warning(DeprecationWarning): + with tm.assert_produces_warning(FutureWarning): getattr(idx, attr_name) def test_copy(self): @@ -297,14 +159,6 @@ def test_cached_data(self): 91 in idx assert idx._cached_data is None - with tm.assert_produces_warning(FutureWarning): - idx.contains(90) - assert idx._cached_data is None - - with tm.assert_produces_warning(FutureWarning): - idx.contains(91) - assert idx._cached_data is None - idx.all() assert idx._cached_data is None @@ -608,176 +462,6 @@ def test_join_self(self, join_type): joined = index.join(index, how=join_type) assert index is joined - @pytest.mark.parametrize("sort", [None, False]) - def test_intersection(self, sort): - # intersect with Int64Index - index = self.create_index() - other = 
Index(np.arange(1, 6)) - result = index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(index.values, other.values))) - tm.assert_index_equal(result, expected) - - result = other.intersection(index, sort=sort) - expected = Index( - np.sort(np.asarray(np.intersect1d(index.values, other.values))) - ) - tm.assert_index_equal(result, expected) - - # intersect with increasing RangeIndex - other = RangeIndex(1, 6) - result = index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(index.values, other.values))) - tm.assert_index_equal(result, expected) - - # intersect with decreasing RangeIndex - other = RangeIndex(5, 0, -1) - result = index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(index.values, other.values))) - tm.assert_index_equal(result, expected) - - # reversed (GH 17296) - result = other.intersection(index, sort=sort) - tm.assert_index_equal(result, expected) - - # GH 17296: intersect two decreasing RangeIndexes - first = RangeIndex(10, -2, -2) - other = RangeIndex(5, -4, -1) - expected = first.astype(int).intersection(other.astype(int), sort=sort) - result = first.intersection(other, sort=sort).astype(int) - tm.assert_index_equal(result, expected) - - # reversed - result = other.intersection(first, sort=sort).astype(int) - tm.assert_index_equal(result, expected) - - index = RangeIndex(5) - - # intersect of non-overlapping indices - other = RangeIndex(5, 10, 1) - result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1) - tm.assert_index_equal(result, expected) - - other = RangeIndex(-1, -5, -1) - result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1) - tm.assert_index_equal(result, expected) - - # intersection of empty indices - other = RangeIndex(0, 0, 1) - result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1) - tm.assert_index_equal(result, expected) - - result = other.intersection(index, sort=sort) - tm.assert_index_equal(result, expected) - - # intersection of non-overlapping values based on start value and gcd - index = RangeIndex(1, 10, 2) - other = RangeIndex(0, 10, 4) - result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("sort", [False, None]) - def test_union_noncomparable(self, sort): - # corner case, non-Int64Index - index = self.create_index() - other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object) - result = index.union(other, sort=sort) - expected = Index(np.concatenate((index, other))) - tm.assert_index_equal(result, expected) - - result = other.union(index, sort=sort) - expected = Index(np.concatenate((other, index))) - tm.assert_index_equal(result, expected) - - @pytest.fixture( - params=[ - (RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1)), - (RI(0, 10, 1), RI(5, 20, 1), RI(0, 20, 1), I64(range(20))), - (RI(0, 10, 1), RI(10, 20, 1), RI(0, 20, 1), I64(range(20))), - (RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1)), - (RI(0, -10, -1), RI(-10, -20, -1), RI(-19, 1, 1), I64(range(0, -20, -1))), - ( - RI(0, 10, 2), - RI(1, 10, 2), - RI(0, 10, 1), - I64(list(range(0, 10, 2)) + list(range(1, 10, 2))), - ), - ( - RI(0, 11, 2), - RI(1, 12, 2), - RI(0, 12, 1), - I64(list(range(0, 11, 2)) + list(range(1, 12, 2))), - ), - ( - RI(0, 21, 4), - RI(-2, 24, 4), - RI(-2, 24, 2), - I64(list(range(0, 21, 4)) + list(range(-2, 24, 4))), - ), - ( - RI(0, -20, -2), - RI(-1, -21, -2), - RI(-19, 1, 1), - 
I64(list(range(0, -20, -2)) + list(range(-1, -21, -2))), - ), - (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5), I64(range(0, 100, 5))), - ( - RI(0, -100, -5), - RI(5, -100, -20), - RI(-95, 10, 5), - I64(list(range(0, -100, -5)) + [5]), - ), - ( - RI(0, -11, -1), - RI(1, -12, -4), - RI(-11, 2, 1), - I64(list(range(0, -11, -1)) + [1, -11]), - ), - (RI(0), RI(0), RI(0), RI(0)), - (RI(0, -10, -2), RI(0), RI(0, -10, -2), RI(0, -10, -2)), - (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2), I64(range(0, 102, 2))), - ( - RI(0, -100, -2), - RI(-100, 50, 102), - RI(-100, 4, 2), - I64(list(range(0, -100, -2)) + [-100, 2]), - ), - ( - RI(0, -100, -1), - RI(0, -50, -3), - RI(-99, 1, 1), - I64(list(range(0, -100, -1))), - ), - (RI(0, 1, 1), RI(5, 6, 10), RI(0, 6, 5), I64([0, 5])), - (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5), I64([0, 5, -5])), - (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4]), I64([0, 1, 2, 4])), - (RI(0, 10, 1), I64([]), RI(0, 10, 1), RI(0, 10, 1)), - (RI(0), I64([1, 5, 6]), I64([1, 5, 6]), I64([1, 5, 6])), - ] - ) - def unions(self, request): - """Inputs and expected outputs for RangeIndex.union tests""" - - return request.param - - def test_union_sorted(self, unions): - - idx1, idx2, expected_sorted, expected_notsorted = unions - - res1 = idx1.union(idx2, sort=None) - tm.assert_index_equal(res1, expected_sorted, exact=True) - - res1 = idx1.union(idx2, sort=False) - tm.assert_index_equal(res1, expected_notsorted, exact=True) - - res2 = idx2.union(idx1, sort=None) - res3 = idx1._int64index.union(idx2, sort=None) - tm.assert_index_equal(res2, expected_sorted, exact=True) - tm.assert_index_equal(res3, expected_sorted) - def test_nbytes(self): # memory savings vs int index diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py new file mode 100644 index 0000000000000..5bedc4089feba --- /dev/null +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -0,0 +1,244 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +from pandas import Index, Int64Index, RangeIndex +import pandas._testing as tm + + +class TestRangeIndexSetOps: + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection(self, sort): + # intersect with Int64Index + index = RangeIndex(start=0, stop=20, step=2) + other = Index(np.arange(1, 6)) + result = index.intersection(other, sort=sort) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) + tm.assert_index_equal(result, expected) + + result = other.intersection(index, sort=sort) + expected = Index( + np.sort(np.asarray(np.intersect1d(index.values, other.values))) + ) + tm.assert_index_equal(result, expected) + + # intersect with increasing RangeIndex + other = RangeIndex(1, 6) + result = index.intersection(other, sort=sort) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) + tm.assert_index_equal(result, expected) + + # intersect with decreasing RangeIndex + other = RangeIndex(5, 0, -1) + result = index.intersection(other, sort=sort) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) + tm.assert_index_equal(result, expected) + + # reversed (GH 17296) + result = other.intersection(index, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 17296: intersect two decreasing RangeIndexes + first = RangeIndex(10, -2, -2) + other = RangeIndex(5, -4, -1) + expected = first.astype(int).intersection(other.astype(int), sort=sort) + result = first.intersection(other, sort=sort).astype(int) + tm.assert_index_equal(result, expected) 
+ + # reversed + result = other.intersection(first, sort=sort).astype(int) + tm.assert_index_equal(result, expected) + + index = RangeIndex(5) + + # intersect of non-overlapping indices + other = RangeIndex(5, 10, 1) + result = index.intersection(other, sort=sort) + expected = RangeIndex(0, 0, 1) + tm.assert_index_equal(result, expected) + + other = RangeIndex(-1, -5, -1) + result = index.intersection(other, sort=sort) + expected = RangeIndex(0, 0, 1) + tm.assert_index_equal(result, expected) + + # intersection of empty indices + other = RangeIndex(0, 0, 1) + result = index.intersection(other, sort=sort) + expected = RangeIndex(0, 0, 1) + tm.assert_index_equal(result, expected) + + result = other.intersection(index, sort=sort) + tm.assert_index_equal(result, expected) + + # intersection of non-overlapping values based on start value and gcd + index = RangeIndex(1, 10, 2) + other = RangeIndex(0, 10, 4) + result = index.intersection(other, sort=sort) + expected = RangeIndex(0, 0, 1) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [False, None]) + def test_union_noncomparable(self, sort): + # corner case, non-Int64Index + index = RangeIndex(start=0, stop=20, step=2) + other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object) + result = index.union(other, sort=sort) + expected = Index(np.concatenate((index, other))) + tm.assert_index_equal(result, expected) + + result = other.union(index, sort=sort) + expected = Index(np.concatenate((other, index))) + tm.assert_index_equal(result, expected) + + @pytest.fixture( + params=[ + ( + RangeIndex(0, 10, 1), + RangeIndex(0, 10, 1), + RangeIndex(0, 10, 1), + RangeIndex(0, 10, 1), + ), + ( + RangeIndex(0, 10, 1), + RangeIndex(5, 20, 1), + RangeIndex(0, 20, 1), + Int64Index(range(20)), + ), + ( + RangeIndex(0, 10, 1), + RangeIndex(10, 20, 1), + RangeIndex(0, 20, 1), + Int64Index(range(20)), + ), + ( + RangeIndex(0, -10, -1), + RangeIndex(0, -10, -1), + RangeIndex(0, -10, -1), + RangeIndex(0, -10, -1), + ), + ( + RangeIndex(0, -10, -1), + RangeIndex(-10, -20, -1), + RangeIndex(-19, 1, 1), + Int64Index(range(0, -20, -1)), + ), + ( + RangeIndex(0, 10, 2), + RangeIndex(1, 10, 2), + RangeIndex(0, 10, 1), + Int64Index(list(range(0, 10, 2)) + list(range(1, 10, 2))), + ), + ( + RangeIndex(0, 11, 2), + RangeIndex(1, 12, 2), + RangeIndex(0, 12, 1), + Int64Index(list(range(0, 11, 2)) + list(range(1, 12, 2))), + ), + ( + RangeIndex(0, 21, 4), + RangeIndex(-2, 24, 4), + RangeIndex(-2, 24, 2), + Int64Index(list(range(0, 21, 4)) + list(range(-2, 24, 4))), + ), + ( + RangeIndex(0, -20, -2), + RangeIndex(-1, -21, -2), + RangeIndex(-19, 1, 1), + Int64Index(list(range(0, -20, -2)) + list(range(-1, -21, -2))), + ), + ( + RangeIndex(0, 100, 5), + RangeIndex(0, 100, 20), + RangeIndex(0, 100, 5), + Int64Index(range(0, 100, 5)), + ), + ( + RangeIndex(0, -100, -5), + RangeIndex(5, -100, -20), + RangeIndex(-95, 10, 5), + Int64Index(list(range(0, -100, -5)) + [5]), + ), + ( + RangeIndex(0, -11, -1), + RangeIndex(1, -12, -4), + RangeIndex(-11, 2, 1), + Int64Index(list(range(0, -11, -1)) + [1, -11]), + ), + (RangeIndex(0), RangeIndex(0), RangeIndex(0), RangeIndex(0)), + ( + RangeIndex(0, -10, -2), + RangeIndex(0), + RangeIndex(0, -10, -2), + RangeIndex(0, -10, -2), + ), + ( + RangeIndex(0, 100, 2), + RangeIndex(100, 150, 200), + RangeIndex(0, 102, 2), + Int64Index(range(0, 102, 2)), + ), + ( + RangeIndex(0, -100, -2), + RangeIndex(-100, 50, 102), + RangeIndex(-100, 4, 2), + Int64Index(list(range(0, -100, -2)) + [-100, 2]), + ), + ( 
+ RangeIndex(0, -100, -1), + RangeIndex(0, -50, -3), + RangeIndex(-99, 1, 1), + Int64Index(list(range(0, -100, -1))), + ), + ( + RangeIndex(0, 1, 1), + RangeIndex(5, 6, 10), + RangeIndex(0, 6, 5), + Int64Index([0, 5]), + ), + ( + RangeIndex(0, 10, 5), + RangeIndex(-5, -6, -20), + RangeIndex(-5, 10, 5), + Int64Index([0, 5, -5]), + ), + ( + RangeIndex(0, 3, 1), + RangeIndex(4, 5, 1), + Int64Index([0, 1, 2, 4]), + Int64Index([0, 1, 2, 4]), + ), + ( + RangeIndex(0, 10, 1), + Int64Index([]), + RangeIndex(0, 10, 1), + RangeIndex(0, 10, 1), + ), + ( + RangeIndex(0), + Int64Index([1, 5, 6]), + Int64Index([1, 5, 6]), + Int64Index([1, 5, 6]), + ), + ] + ) + def unions(self, request): + """Inputs and expected outputs for RangeIndex.union tests""" + + return request.param + + def test_union_sorted(self, unions): + + idx1, idx2, expected_sorted, expected_notsorted = unions + + res1 = idx1.union(idx2, sort=None) + tm.assert_index_equal(res1, expected_sorted, exact=True) + + res1 = idx1.union(idx2, sort=False) + tm.assert_index_equal(res1, expected_notsorted, exact=True) + + res2 = idx2.union(idx1, sort=None) + res3 = idx1._int64index.union(idx2, sort=None) + tm.assert_index_equal(res2, expected_sorted, exact=True) + tm.assert_index_equal(res3, expected_sorted) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8d0cb0edf51df..1047c457d6b82 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -11,8 +11,8 @@ import pandas._config.config as cf from pandas._libs.tslib import Timestamp -from pandas.compat import PY36 from pandas.compat.numpy import np_datetime64_compat +from pandas.util._test_decorators import async_mark from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.dtypes.generic import ABCIndex @@ -33,16 +33,17 @@ isna, period_range, ) -from pandas.core.index import ( +import pandas._testing as tm +from pandas.core.algorithms import safe_sort +from pandas.core.indexes.api import ( + Index, + MultiIndex, _get_combined_index, ensure_index, ensure_index_from_sequences, ) -from pandas.core.indexes.api import Index, MultiIndex -from pandas.core.sorting import safe_sort from pandas.tests.indexes.common import Base from pandas.tests.indexes.conftest import indices_dict -import pandas.util.testing as tm class TestIndex(Base): @@ -70,7 +71,9 @@ def test_can_hold_identifiers(self): @pytest.mark.parametrize("index", ["datetime"], indirect=True) def test_new_axis(self, index): - new_index = index[None, :] + with tm.assert_produces_warning(DeprecationWarning): + # GH#30588 multi-dimensional indexing deprecated + new_index = index[None, :] assert new_index.ndim == 2 assert isinstance(new_index, np.ndarray) @@ -100,6 +103,7 @@ def test_constructor_copy(self, index): arr[0] = "SOMEBIGLONGSTRING" assert new_index[0] != "SOMEBIGLONGSTRING" + # FIXME: dont leave commented-out # what to do here? # arr = np.array(5.) 
# pytest.raises(Exception, arr.view, Index) @@ -107,8 +111,8 @@ def test_constructor_copy(self, index): def test_constructor_corner(self): # corner case msg = ( - r"Index\(\.\.\.\) must be called with a collection of some" - " kind, 0 was passed" + r"Index\(\.\.\.\) must be called with a collection of some " + "kind, 0 was passed" ) with pytest.raises(TypeError, match=msg): Index(0) @@ -243,7 +247,7 @@ class ArrayLike: def __init__(self, array): self.array = array - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: return self.array expected = pd.Index(array) @@ -355,6 +359,11 @@ def test_constructor_simple_new(self, vals, dtype): result = index._simple_new(index.values, dtype) tm.assert_index_equal(result, index) + def test_constructor_wrong_kwargs(self): + # GH #19348 + with pytest.raises(TypeError, match="Unexpected keyword arguments {'foo'}"): + Index([], foo="bar") + @pytest.mark.parametrize( "vals", [ @@ -453,9 +462,9 @@ def test_constructor_dtypes_to_timedelta(self, cast_index, vals): index = Index(vals) assert isinstance(index, TimedeltaIndex) - @pytest.mark.parametrize("attr, utc", [["values", False], ["asi8", True]]) + @pytest.mark.parametrize("attr", ["values", "asi8"]) @pytest.mark.parametrize("klass", [pd.Index, pd.DatetimeIndex]) - def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc, klass): + def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, klass): # Test constructing with a datetimetz dtype # .values produces numpy datetimes, so these are considered naive # .asi8 produces integers, so these are considered epoch timestamps @@ -466,30 +475,27 @@ def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc, klass): index = index.tz_localize(tz_naive_fixture) dtype = index.dtype - if ( - tz_naive_fixture - and attr == "asi8" - and str(tz_naive_fixture) not in ("UTC", "tzutc()", "UTC+00:00") - ): - ex_warn = FutureWarning + if attr == "asi8": + result = pd.DatetimeIndex(arg).tz_localize(tz_naive_fixture) else: - ex_warn = None - - # stacklevel is checked elsewhere. We don't do it here since - # Index will have an frame, throwing off the expected. 
- with tm.assert_produces_warning(ex_warn, check_stacklevel=False): result = klass(arg, tz=tz_naive_fixture) tm.assert_index_equal(result, index) - with tm.assert_produces_warning(ex_warn, check_stacklevel=False): + if attr == "asi8": + result = pd.DatetimeIndex(arg).astype(dtype) + else: result = klass(arg, dtype=dtype) tm.assert_index_equal(result, index) - with tm.assert_produces_warning(ex_warn, check_stacklevel=False): + if attr == "asi8": + result = pd.DatetimeIndex(list(arg)).tz_localize(tz_naive_fixture) + else: result = klass(list(arg), tz=tz_naive_fixture) tm.assert_index_equal(result, index) - with tm.assert_produces_warning(ex_warn, check_stacklevel=False): + if attr == "asi8": + result = pd.DatetimeIndex(list(arg)).astype(dtype) + else: result = klass(list(arg), dtype=dtype) tm.assert_index_equal(result, index) @@ -507,7 +513,7 @@ def test_constructor_dtypes_timedelta(self, attr, klass): result = klass(list(values), dtype=dtype) tm.assert_index_equal(result, index) - @pytest.mark.parametrize("value", [[], iter([]), (x for x in [])]) + @pytest.mark.parametrize("value", [[], iter([]), (_ for _ in [])]) @pytest.mark.parametrize( "klass", [ @@ -530,7 +536,7 @@ def test_constructor_empty(self, value, klass): [ (PeriodIndex([], freq="B"), PeriodIndex), (PeriodIndex(iter([]), freq="B"), PeriodIndex), - (PeriodIndex((x for x in []), freq="B"), PeriodIndex), + (PeriodIndex((_ for _ in []), freq="B"), PeriodIndex), (RangeIndex(step=1), pd.RangeIndex), (MultiIndex(levels=[[1, 2], ["blue", "red"]], codes=[[], []]), MultiIndex), ], @@ -730,7 +736,7 @@ def test_nanosecond_index_access(self): assert first_value == x[Timestamp(expected_ts)] def test_booleanindex(self, index): - bool_index = np.repeat(True, len(index)).astype(bool) + bool_index = np.ones(len(index), dtype=bool) bool_index[5:30:2] = False sub_index = index[bool_index] @@ -752,7 +758,7 @@ def test_fancy(self): @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) def test_empty_fancy(self, index, dtype): empty_arr = np.array([], dtype=dtype) - empty_index = index.__class__([]) + empty_index = type(index)([]) assert index[[]].identical(empty_index) assert index[empty_arr].identical(empty_index) @@ -762,7 +768,7 @@ def test_empty_fancy_raises(self, index): # pd.DatetimeIndex is excluded, because it overrides getitem and should # be tested separately. 
empty_farr = np.array([], dtype=np.float_) - empty_index = index.__class__([]) + empty_index = type(index)([]) assert index[[]].identical(empty_index) # np.ndarray only accepts ndarray of int & bool dtypes, so should Index @@ -1385,13 +1391,6 @@ def test_summary_bug(self): assert "~:{range}:0" in result assert "{other}%s" in result - # GH18217 - def test_summary_deprecated(self): - ind = Index(["{other}%s", "~:{range}:0"], name="A") - - with tm.assert_produces_warning(FutureWarning): - ind.summary() - def test_format(self, indices): self._check_method_works(Index.format, indices) @@ -1616,11 +1615,7 @@ def test_get_loc(self, method): def test_get_loc_raises_bad_label(self, method): index = pd.Index([0, 1, 2]) if method: - # Messages vary across versions - if PY36: - msg = "not supported between" - else: - msg = "unorderable types" + msg = "not supported between" else: msg = "invalid key" @@ -1737,22 +1732,22 @@ def test_slice_locs_na_raises(self): "in_slice,expected", [ (pd.IndexSlice[::-1], "yxdcb"), - (pd.IndexSlice["b":"y":-1], ""), - (pd.IndexSlice["b"::-1], "b"), - (pd.IndexSlice[:"b":-1], "yxdcb"), - (pd.IndexSlice[:"y":-1], "y"), - (pd.IndexSlice["y"::-1], "yxdcb"), - (pd.IndexSlice["y"::-4], "yb"), + (pd.IndexSlice["b":"y":-1], ""), # type: ignore + (pd.IndexSlice["b"::-1], "b"), # type: ignore + (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore + (pd.IndexSlice[:"y":-1], "y"), # type: ignore + (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore + (pd.IndexSlice["y"::-4], "yb"), # type: ignore # absent labels - (pd.IndexSlice[:"a":-1], "yxdcb"), - (pd.IndexSlice[:"a":-2], "ydb"), - (pd.IndexSlice["z"::-1], "yxdcb"), - (pd.IndexSlice["z"::-3], "yc"), - (pd.IndexSlice["m"::-1], "dcb"), - (pd.IndexSlice[:"m":-1], "yx"), - (pd.IndexSlice["a":"a":-1], ""), - (pd.IndexSlice["z":"z":-1], ""), - (pd.IndexSlice["m":"m":-1], ""), + (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore + (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore + (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore + (pd.IndexSlice["z"::-3], "yc"), # type: ignore + (pd.IndexSlice["m"::-1], "dcb"), # type: ignore + (pd.IndexSlice[:"m":-1], "yx"), # type: ignore + (pd.IndexSlice["a":"a":-1], ""), # type: ignore + (pd.IndexSlice["z":"z":-1], ""), # type: ignore + (pd.IndexSlice["m":"m":-1], ""), # type: ignore ], ) def test_slice_locs_negative_step(self, in_slice, expected): @@ -1837,7 +1832,7 @@ def test_drop_tuple(self, values, to_drop): tm.assert_index_equal(result, expected) removed = index.drop(to_drop[1]) - msg = r"\"\[{}\] not found in axis\"".format(re.escape(to_drop[1].__repr__())) + msg = fr"\"\[{re.escape(to_drop[1].__repr__())}\] not found in axis\"" for drop_me in to_drop[1], [to_drop[1]]: with pytest.raises(KeyError, match=msg): removed.drop(drop_me) @@ -2005,14 +2000,14 @@ def test_isin_level_kwarg_bad_label_raises(self, label, indices): index = indices if isinstance(index, MultiIndex): index = index.rename(["foo", "bar"]) - msg = "'Level {} not found'" + msg = f"'Level {label} not found'" else: index = index.rename("foo") - msg = r"Requested level \({}\) does not match index name \(foo\)" - with pytest.raises(KeyError, match=msg.format(label)): + msg = fr"Requested level \({label}\) does not match index name \(foo\)" + with pytest.raises(KeyError, match=msg): index.isin([], level=label) - @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) def test_isin_empty(self, empty): # see gh-16991 index = Index(["a", "b"]) @@ -2405,27 
+2400,25 @@ def test_cached_properties_not_settable(self): with pytest.raises(AttributeError, match="Can't set attribute"): index.is_unique = False - def test_get_duplicates_deprecated(self): - index = pd.Index([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - index.get_duplicates() - - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; idx = pd.Index([1, 2])" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("idx.", 4)) - def test_deprecated_contains(self, indices): - # deprecated for all types except IntervalIndex - warning = FutureWarning if not isinstance(indices, pd.IntervalIndex) else None - with tm.assert_produces_warning(warning): + def test_contains_method_removed(self, indices): + # GH#30103 method removed for all types except IntervalIndex + if isinstance(indices, pd.IntervalIndex): indices.contains(1) + else: + with pytest.raises(AttributeError): + indices.contains(1) class TestMixedIntIndex(Base): @@ -2444,29 +2437,21 @@ def create_index(self): def test_argsort(self): index = self.create_index() - if PY36: - with pytest.raises(TypeError, match="'>|<' not supported"): - index.argsort() - else: - with pytest.raises(TypeError, match="unorderable types"): - index.argsort() + with pytest.raises(TypeError, match="'>|<' not supported"): + index.argsort() def test_numpy_argsort(self): index = self.create_index() - if PY36: - with pytest.raises(TypeError, match="'>|<' not supported"): - np.argsort(index) - else: - with pytest.raises(TypeError, match="unorderable types"): - np.argsort(index) + with pytest.raises(TypeError, match="'>|<' not supported"): + np.argsort(index) def test_copy_name(self): # Check that "name" argument passed at initialization is honoured # GH12309 index = self.create_index() - first = index.__class__(index, copy=True, name="mario") - second = first.__class__(first, copy=False) + first = type(index)(index, copy=True, name="mario") + second = type(first)(first, copy=False) # Even though "copy=False", we want a new object. assert first is not second @@ -2768,7 +2753,7 @@ def test_generated_op_names(opname, indices): # pd.Index.__rsub__ does not exist; though the method does exist # for subclasses. 
see GH#19723 return - opname = "__{name}__".format(name=opname) + opname = f"__{opname}__" method = getattr(indices, opname) assert method.__name__ == opname @@ -2781,32 +2766,18 @@ def test_index_subclass_constructor_wrong_kwargs(index_maker): def test_deprecated_fastpath(): + msg = "[Uu]nexpected keyword argument" + with pytest.raises(TypeError, match=msg): + pd.Index(np.array(["a", "b"], dtype=object), name="test", fastpath=True) - with tm.assert_produces_warning(FutureWarning): - idx = pd.Index(np.array(["a", "b"], dtype=object), name="test", fastpath=True) - - expected = pd.Index(["a", "b"], name="test") - tm.assert_index_equal(idx, expected) - - with tm.assert_produces_warning(FutureWarning): - idx = pd.Int64Index( - np.array([1, 2, 3], dtype="int64"), name="test", fastpath=True - ) - - expected = pd.Index([1, 2, 3], name="test", dtype="int64") - tm.assert_index_equal(idx, expected) - - with tm.assert_produces_warning(FutureWarning): - idx = pd.RangeIndex(0, 5, 2, name="test", fastpath=True) + with pytest.raises(TypeError, match=msg): + pd.Int64Index(np.array([1, 2, 3], dtype="int64"), name="test", fastpath=True) - expected = pd.RangeIndex(0, 5, 2, name="test") - tm.assert_index_equal(idx, expected) + with pytest.raises(TypeError, match=msg): + pd.RangeIndex(0, 5, 2, name="test", fastpath=True) - with tm.assert_produces_warning(FutureWarning): - idx = pd.CategoricalIndex(["a", "b", "c"], name="test", fastpath=True) - - expected = pd.CategoricalIndex(["a", "b", "c"], name="test") - tm.assert_index_equal(idx, expected) + with pytest.raises(TypeError, match=msg): + pd.CategoricalIndex(["a", "b", "c"], name="test", fastpath=True) def test_shape_of_invalid_index(): @@ -2815,9 +2786,35 @@ def test_shape_of_invalid_index(): # about this). However, as long as this is not solved in general,this test ensures # that the returned shape is consistent with this underlying array for # compat with matplotlib (see https://github.com/pandas-dev/pandas/issues/27775) - a = np.arange(8).reshape(2, 2, 2) - idx = pd.Index(a) - assert idx.shape == a.shape - idx = pd.Index([0, 1, 2, 3]) - assert idx[:, None].shape == (4, 1) + with tm.assert_produces_warning(DeprecationWarning): + # GH#30588 multi-dimensional indexing deprecated + assert idx[:, None].shape == (4, 1) + + +def test_validate_1d_input(): + # GH#27125 check that we do not have >1-dimensional input + msg = "Index data must be 1-dimensional" + + arr = np.arange(8).reshape(2, 2, 2) + with pytest.raises(ValueError, match=msg): + pd.Index(arr) + + with pytest.raises(ValueError, match=msg): + pd.Float64Index(arr.astype(np.float64)) + + with pytest.raises(ValueError, match=msg): + pd.Int64Index(arr.astype(np.int64)) + + with pytest.raises(ValueError, match=msg): + pd.UInt64Index(arr.astype(np.uint64)) + + df = pd.DataFrame(arr.reshape(4, 2)) + with pytest.raises(ValueError, match=msg): + pd.Index(df) + + # GH#13601 trying to assign a multi-dimensional array to an index is not + # allowed + ser = pd.Series(0, range(4)) + with pytest.raises(ValueError, match=msg): + ser.index = np.array([[2, 3]] * 4) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index ae1a21e9b3980..7e30233353553 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import CategoricalIndex, MultiIndex, RangeIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestCommon: @@ -158,16 +158,10 @@ def test_set_name_methods(self, indices): assert 
indices.name == name assert indices.names == [name] - def test_dtype_str(self, indices): - with tm.assert_produces_warning(FutureWarning): - dtype = indices.dtype_str - assert isinstance(dtype, str) - assert dtype == str(indices.dtype) - def test_hash_error(self, indices): index = indices with pytest.raises( - TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) + TypeError, match=f"unhashable type: '{type(index).__name__}'" ): hash(indices) @@ -201,8 +195,9 @@ def test_unique(self, indices): with pytest.raises(IndexError, match=msg): indices.unique(level=3) - msg = r"Requested level \(wrong\) does not match index name \({}\)".format( - re.escape(indices.name.__repr__()) + msg = ( + fr"Requested level \(wrong\) does not match index name " + fr"\({re.escape(indices.name.__repr__())}\)" ) with pytest.raises(KeyError, match=msg): indices.unique(level="wrong") diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py index 712feb7b8ef61..2e53e29c3fab1 100644 --- a/pandas/tests/indexes/test_frozen.py +++ b/pandas/tests/indexes/test_frozen.py @@ -1,21 +1,68 @@ -import warnings +import re -import numpy as np import pytest -from pandas.core.indexes.frozen import FrozenList, FrozenNDArray -from pandas.tests.test_base import CheckImmutable, CheckStringMixin -import pandas.util.testing as tm +from pandas.core.indexes.frozen import FrozenList -class TestFrozenList(CheckImmutable, CheckStringMixin): - mutable_methods = ("extend", "pop", "remove", "insert") +class TestFrozenList: + unicode_container = FrozenList(["\u05d0", "\u05d1", "c"]) def setup_method(self, _): self.lst = [1, 2, 3, 4, 5] self.container = FrozenList(self.lst) - self.klass = FrozenList + + def check_mutable_error(self, *args, **kwargs): + # Pass whatever function you normally would to pytest.raises + # (after the Exception kind). 
+ mutable_regex = re.compile("does not support mutable operations") + with pytest.raises(TypeError): + mutable_regex(*args, **kwargs) + + def test_no_mutable_funcs(self): + def setitem(): + self.container[0] = 5 + + self.check_mutable_error(setitem) + + def setslice(): + self.container[1:2] = 3 + + self.check_mutable_error(setslice) + + def delitem(): + del self.container[0] + + self.check_mutable_error(delitem) + + def delslice(): + del self.container[0:3] + + self.check_mutable_error(delslice) + + mutable_methods = ("extend", "pop", "remove", "insert") + + for meth in mutable_methods: + self.check_mutable_error(getattr(self.container, meth)) + + def test_slicing_maintains_type(self): + result = self.container[1:2] + expected = self.lst[1:2] + self.check_result(result, expected) + + def check_result(self, result, expected): + assert isinstance(result, FrozenList) + assert result == expected + + def test_string_methods_dont_fail(self): + repr(self.container) + str(self.container) + bytes(self.container) + + def test_tricky_container(self): + repr(self.unicode_container) + str(self.unicode_container) def test_add(self): result = self.container + (1, 2, 3) @@ -55,62 +102,3 @@ def test_tricky_container_to_bytes_raises(self): msg = "^'str' object cannot be interpreted as an integer$" with pytest.raises(TypeError, match=msg): bytes(self.unicode_container) - - -class TestFrozenNDArray(CheckImmutable, CheckStringMixin): - mutable_methods = ("put", "itemset", "fill") - - def setup_method(self, _): - self.lst = [3, 5, 7, -2] - self.klass = FrozenNDArray - - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - - self.container = FrozenNDArray(self.lst) - self.unicode_container = FrozenNDArray(["\u05d0", "\u05d1", "c"]) - - def test_constructor_warns(self): - # see gh-9031 - with tm.assert_produces_warning(FutureWarning): - FrozenNDArray([1, 2, 3]) - - def test_tricky_container_to_bytes(self): - bytes(self.unicode_container) - - def test_shallow_copying(self): - original = self.container.copy() - assert isinstance(self.container.view(), FrozenNDArray) - assert not isinstance(self.container.view(np.ndarray), FrozenNDArray) - assert self.container.view() is not self.container - tm.assert_numpy_array_equal(self.container, original) - - # Shallow copy should be the same too - assert isinstance(self.container._shallow_copy(), FrozenNDArray) - - # setting should not be allowed - def testit(container): - container[0] = 16 - - self.check_mutable_error(testit, self.container) - - def test_values(self): - original = self.container.view(np.ndarray).copy() - n = original[0] + 15 - - vals = self.container.values() - tm.assert_numpy_array_equal(original, vals) - - assert original is not vals - vals[0] = n - - assert isinstance(self.container, FrozenNDArray) - tm.assert_numpy_array_equal(self.container.values(), original) - assert vals[0] == n - - def test_searchsorted(self): - expected = 2 - assert self.container.searchsorted(7) == expected - - with tm.assert_produces_warning(FutureWarning): - assert self.container.searchsorted(v=7) == expected diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index e424b3601a4b2..f025168643ab9 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -8,9 +8,9 @@ import pandas as pd from pandas import Float64Index, Index, Int64Index, Series, UInt64Index +import pandas._testing as tm from pandas.api.types import pandas_dtype from pandas.tests.indexes.common import 
Base -import pandas.util.testing as tm class Numeric(Base): @@ -167,6 +167,23 @@ def test_constructor(self): result = Index(np.array([np.nan])) assert pd.isna(result.values).all() + @pytest.mark.parametrize( + "index, dtype", + [ + (pd.Int64Index, "float64"), + (pd.UInt64Index, "categorical"), + (pd.Float64Index, "datetime64"), + (pd.RangeIndex, "float64"), + ], + ) + def test_invalid_dtype(self, index, dtype): + # GH 29539 + with pytest.raises( + ValueError, + match=rf"Incorrect `dtype` passed: expected \w+(?: \w+)?, received {dtype}", + ): + index([1, 2, 3], dtype=dtype) + def test_constructor_invalid(self): # invalid @@ -177,8 +194,8 @@ def test_constructor_invalid(self): with pytest.raises(TypeError, match=msg): Float64Index(0.0) msg = ( - "String dtype not supported, you may need to explicitly cast to" - " a numeric type" + "String dtype not supported, " + "you may need to explicitly cast to a numeric type" ) with pytest.raises(TypeError, match=msg): Float64Index(["a", "b", 0.0]) @@ -245,9 +262,9 @@ def test_astype(self, mixed_index, float_index): # invalid for dtype in ["M8[ns]", "m8[ns]"]: msg = ( - "Cannot convert Float64Index to dtype {}; integer values" - " are required for conversion" - ).format(pandas_dtype(dtype)) + f"Cannot convert Float64Index to dtype {pandas_dtype(dtype)}; " + f"integer values are required for conversion" + ) with pytest.raises(TypeError, match=re.escape(msg)): i.astype(dtype) @@ -553,8 +570,8 @@ def test_union_noncomparable(self): def test_cant_or_shouldnt_cast(self): msg = ( - "String dtype not supported, you may need to explicitly cast to" - " a numeric type" + "String dtype not supported, " + "you may need to explicitly cast to a numeric type" ) # can't data = ["foo", "bar", "baz"] @@ -588,7 +605,7 @@ def test_take_fill_value(self): tm.assert_index_equal(result, expected) name = self._holder.__name__ - msg = "Unable to fill values because {name} cannot contain NA".format(name=name) + msg = f"Unable to fill values because {name} cannot contain NA" # fill_value=True with pytest.raises(ValueError, match=msg): @@ -638,8 +655,8 @@ def test_constructor(self): # scalar raise Exception msg = ( - r"Int64Index\(\.\.\.\) must be called with a collection of some" - " kind, 5 was passed" + r"Int64Index\(\.\.\.\) must be called with a collection of some " + "kind, 5 was passed" ) with pytest.raises(TypeError, match=msg): Int64Index(5) @@ -719,6 +736,12 @@ def test_get_indexer(self): expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) + def test_get_indexer_nan(self): + # GH 7820 + result = Index([1, 2, np.nan]).get_indexer([np.nan]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + def test_intersection(self): index = self.create_index() other = Index([1, 2, 3, 4, 5]) @@ -944,6 +967,11 @@ def test_constructor(self): res = Index(np.array([-1, 2 ** 63], dtype=object)) tm.assert_index_equal(res, idx) + # https://github.com/pandas-dev/pandas/issues/29526 + idx = UInt64Index([1, 2 ** 63 + 1], dtype=np.uint64) + res = Index([1, 2 ** 63 + 1], dtype=np.uint64) + tm.assert_index_equal(res, idx) + def test_get_indexer(self, index_large): target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) indexer = index_large.get_indexer(target) @@ -1187,3 +1215,29 @@ def test_range_float_union_dtype(): result = other.union(index) tm.assert_index_equal(result, expected) + + +def test_uint_index_does_not_convert_to_float64(): + # 
https://github.com/pandas-dev/pandas/issues/28279 + # https://github.com/pandas-dev/pandas/issues/28023 + series = pd.Series( + [0, 1, 2, 3, 4, 5], + index=[ + 7606741985629028552, + 17876870360202815256, + 17876870360202815256, + 13106359306506049338, + 8991270399732411471, + 8991270399732411472, + ], + ) + + result = series.loc[[7606741985629028552, 17876870360202815256]] + + expected = UInt64Index( + [7606741985629028552, 17876870360202815256, 17876870360202815256], + dtype="uint64", + ) + tm.assert_index_equal(result.index, expected) + + tm.assert_equal(result, series[:3]) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 6626ccf4a29f8..583556656ac87 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -6,12 +6,14 @@ Float64Index, Index, Int64Index, + PeriodIndex, TimedeltaIndex, UInt64Index, _np_version_under1p17, + _np_version_under1p18, ) +import pandas._testing as tm from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm @pytest.mark.parametrize( @@ -80,18 +82,22 @@ def test_numpy_ufuncs_other(indices, func): idx = indices if isinstance(idx, (DatetimeIndex, TimedeltaIndex)): - # ok under numpy >= 1.17 - if not _np_version_under1p17 and func in [np.isfinite]: + if not _np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]: + # numpy 1.18(dev) changed isinf and isnan to not raise on dt64/tfd64 + result = func(idx) + assert isinstance(result, np.ndarray) + + elif not _np_version_under1p17 and func in [np.isfinite]: + # ok under numpy >= 1.17 # Results in bool array result = func(idx) assert isinstance(result, np.ndarray) - assert not isinstance(result, Index) else: # raise TypeError or ValueError (PeriodIndex) with pytest.raises(Exception): func(idx) - elif isinstance(idx, DatetimeIndexOpsMixin): + elif isinstance(idx, PeriodIndex): # raise TypeError or ValueError (PeriodIndex) with pytest.raises(Exception): func(idx) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index d5b23653e8a72..abfa413d56655 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -2,7 +2,6 @@ The tests in this package are to ensure the proper resultant dtypes of set operations. 
""" -from collections import OrderedDict import itertools as it import numpy as np @@ -12,18 +11,16 @@ import pandas as pd from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index +import pandas._testing as tm from pandas.api.types import pandas_dtype from pandas.tests.indexes.conftest import indices_dict -import pandas.util.testing as tm -COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict( - [ - ((Int64Index, RangeIndex), (tm.makeIntIndex, tm.makeRangeIndex)), - ((Float64Index, Int64Index), (tm.makeFloatIndex, tm.makeIntIndex)), - ((Float64Index, RangeIndex), (tm.makeFloatIndex, tm.makeIntIndex)), - ((Float64Index, UInt64Index), (tm.makeFloatIndex, tm.makeUIntIndex)), - ] -) +COMPATIBLE_INCONSISTENT_PAIRS = { + (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex), + (Float64Index, Int64Index): (tm.makeFloatIndex, tm.makeIntIndex), + (Float64Index, RangeIndex): (tm.makeFloatIndex, tm.makeIntIndex), + (Float64Index, UInt64Index): (tm.makeFloatIndex, tm.makeUIntIndex), +} @pytest.fixture(params=it.combinations(indices_dict, 2), ids="-".join) diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py deleted file mode 100644 index 4544657f79af7..0000000000000 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ /dev/null @@ -1,292 +0,0 @@ -from datetime import timedelta - -import numpy as np -import pytest - -from pandas.errors import NullFrequencyError - -import pandas as pd -from pandas import Timedelta, TimedeltaIndex, timedelta_range -import pandas.util.testing as tm - - -@pytest.fixture( - params=[ - pd.offsets.Hour(2), - timedelta(hours=2), - np.timedelta64(2, "h"), - Timedelta(hours=2), - ], - ids=str, -) -def delta(request): - # Several ways of representing two hours - return request.param - - -@pytest.fixture(params=["B", "D"]) -def freq(request): - return request.param - - -class TestTimedeltaIndexArithmetic: - # Addition and Subtraction Operations - - # ------------------------------------------------------------- - # TimedeltaIndex.shift is used by __add__/__sub__ - - def test_tdi_shift_empty(self): - # GH#9903 - idx = pd.TimedeltaIndex([], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="H"), idx) - tm.assert_index_equal(idx.shift(3, freq="H"), idx) - - def test_tdi_shift_hours(self): - # GH#9903 - idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="H"), idx) - exp = pd.TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") - tm.assert_index_equal(idx.shift(3, freq="H"), exp) - exp = pd.TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") - tm.assert_index_equal(idx.shift(-3, freq="H"), exp) - - def test_tdi_shift_minutes(self): - # GH#9903 - idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") - tm.assert_index_equal(idx.shift(0, freq="T"), idx) - exp = pd.TimedeltaIndex(["05:03:00", "06:03:00", "9:03:00"], name="xxx") - tm.assert_index_equal(idx.shift(3, freq="T"), exp) - exp = pd.TimedeltaIndex(["04:57:00", "05:57:00", "8:57:00"], name="xxx") - tm.assert_index_equal(idx.shift(-3, freq="T"), exp) - - def test_tdi_shift_int(self): - # GH#8083 - trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) - result = trange.shift(1) - expected = TimedeltaIndex( - [ - "1 days 01:00:00", - "2 days 01:00:00", - "3 days 01:00:00", - "4 days 01:00:00", - "5 days 01:00:00", - ], - freq="D", - ) - tm.assert_index_equal(result, expected) - - def test_tdi_shift_nonstandard_freq(self): - # GH#8083 - 
trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) - result = trange.shift(3, freq="2D 1s") - expected = TimedeltaIndex( - [ - "6 days 01:00:03", - "7 days 01:00:03", - "8 days 01:00:03", - "9 days 01:00:03", - "10 days 01:00:03", - ], - freq="D", - ) - tm.assert_index_equal(result, expected) - - def test_shift_no_freq(self): - # GH#19147 - tdi = TimedeltaIndex(["1 days 01:00:00", "2 days 01:00:00"], freq=None) - with pytest.raises(NullFrequencyError): - tdi.shift(2) - - # ------------------------------------------------------------- - # Binary operations TimedeltaIndex and integer - - def test_tdi_add_int(self, one): - # Variants of `one` for #19012 - rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - result = rng + one - expected = timedelta_range("1 days 10:00:00", freq="H", periods=10) - tm.assert_index_equal(result, expected) - - def test_tdi_iadd_int(self, one): - rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) - expected = timedelta_range("1 days 10:00:00", freq="H", periods=10) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - rng += one - tm.assert_index_equal(rng, expected) - - def test_tdi_sub_int(self, one): - rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - result = rng - one - expected = timedelta_range("1 days 08:00:00", freq="H", periods=10) - tm.assert_index_equal(result, expected) - - def test_tdi_isub_int(self, one): - rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) - expected = timedelta_range("1 days 08:00:00", freq="H", periods=10) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - rng -= one - tm.assert_index_equal(rng, expected) - - # ------------------------------------------------------------- - # __add__/__sub__ with integer arrays - - @pytest.mark.parametrize("box", [np.array, pd.Index]) - def test_tdi_add_integer_array(self, box): - # GH#19959 - rng = timedelta_range("1 days 09:00:00", freq="H", periods=3) - other = box([4, 3, 2]) - expected = TimedeltaIndex(["1 day 13:00:00"] * 3) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - result = rng + other - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - result = other + rng - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("box", [np.array, pd.Index]) - def test_tdi_sub_integer_array(self, box): - # GH#19959 - rng = timedelta_range("9H", freq="H", periods=3) - other = box([4, 3, 2]) - expected = TimedeltaIndex(["5H", "7H", "9H"]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - result = rng - other - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # GH#22535 - result = other - rng - tm.assert_index_equal(result, -expected) - - @pytest.mark.parametrize("box", [np.array, pd.Index]) - def test_tdi_addsub_integer_array_no_freq(self, box): - # GH#19959 - tdi = TimedeltaIndex(["1 Day", "NaT", "3 Hours"]) - other = box([14, -1, 16]) - with pytest.raises(NullFrequencyError): - tdi + other - with pytest.raises(NullFrequencyError): - other + tdi - with pytest.raises(NullFrequencyError): - tdi - other - with pytest.raises(NullFrequencyError): - other - tdi 
- - # ------------------------------------------------------------- - # Binary operations TimedeltaIndex and timedelta-like - # Note: add and sub are tested in tests.test_arithmetic, in-place - # tests are kept here because their behavior is Index-specific - - def test_tdi_iadd_timedeltalike(self, delta): - # only test adding/sub offsets as + is now numeric - rng = timedelta_range("1 days", "10 days") - expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") - rng += delta - tm.assert_index_equal(rng, expected) - - def test_tdi_isub_timedeltalike(self, delta): - # only test adding/sub offsets as - is now numeric - rng = timedelta_range("1 days", "10 days") - expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") - rng -= delta - tm.assert_index_equal(rng, expected) - - # ------------------------------------------------------------- - - # TODO: after #24365 this probably belongs in scalar tests - def test_ops_ndarray(self): - td = Timedelta("1 day") - - # timedelta, timedelta - other = pd.to_timedelta(["1 day"]).values - expected = pd.to_timedelta(["2 days"]).values - tm.assert_numpy_array_equal(td + other, expected) - tm.assert_numpy_array_equal(other + td, expected) - msg = r"unsupported operand type\(s\) for \+: 'Timedelta' and 'int'" - with pytest.raises(TypeError, match=msg): - td + np.array([1]) - msg = r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and 'Timedelta'" - with pytest.raises(TypeError, match=msg): - np.array([1]) + td - - expected = pd.to_timedelta(["0 days"]).values - tm.assert_numpy_array_equal(td - other, expected) - tm.assert_numpy_array_equal(-other + td, expected) - msg = r"unsupported operand type\(s\) for -: 'Timedelta' and 'int'" - with pytest.raises(TypeError, match=msg): - td - np.array([1]) - msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and 'Timedelta'" - with pytest.raises(TypeError, match=msg): - np.array([1]) - td - - expected = pd.to_timedelta(["2 days"]).values - tm.assert_numpy_array_equal(td * np.array([2]), expected) - tm.assert_numpy_array_equal(np.array([2]) * td, expected) - msg = ( - "ufunc '?multiply'? 
cannot use operands with types" - r" dtype\(' is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - idx.freq = "2B" + idx._data.freq = "2B" # setting with non-freq string with pytest.raises(ValueError, match="Invalid frequency"): - idx.freq = "foo" + idx._data.freq = "foo" diff --git a/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/pandas/tests/indexes/timedeltas/test_partial_slicing.py index 4448b5e39684b..29e2c7dd20be0 100644 --- a/pandas/tests/indexes/timedeltas/test_partial_slicing.py +++ b/pandas/tests/indexes/timedeltas/test_partial_slicing.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Series, Timedelta, timedelta_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSlicing: diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 38f1d2c7d4a1b..44f4a2adedaad 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import Index, Series, Timedelta, TimedeltaIndex, timedelta_range -import pandas.util.testing as tm +import pandas._testing as tm class TestVectorizedTimedelta: diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index 861067480b5fa..0aa784cbb7710 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Int64Index, TimedeltaIndex, timedelta_range -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Hour @@ -22,6 +22,22 @@ def test_union(self): i1.union(i2) # Works i2.union(i1) # Fails with "AttributeError: can't set attribute" + def test_union_sort_false(self): + tdi = timedelta_range("1day", periods=5) + + left = tdi[3:] + right = tdi[:3] + + # Check that we are testing the desired code path + assert left._can_fast_union(right) + + result = left.union(right) + tm.assert_index_equal(result, tdi) + + result = left.union(right, sort=False) + expected = pd.TimedeltaIndex(["4 Days", "5 Days", "1 Days", "2 Day", "3 Days"]) + tm.assert_index_equal(result, expected) + def test_union_coverage(self): idx = TimedeltaIndex(["3d", "1d", "2d"]) @@ -39,7 +55,7 @@ def test_union_bug_1730(self): rng_b = timedelta_range("1 day", periods=4, freq="4H") result = rng_a.union(rng_b) - exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) + exp = TimedeltaIndex(sorted(set(rng_a) | set(rng_b))) tm.assert_index_equal(result, exp) def test_union_bug_1745(self): @@ -50,7 +66,7 @@ def test_union_bug_1745(self): ) result = left.union(right) - exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + exp = TimedeltaIndex(sorted(set(left) | set(right))) tm.assert_index_equal(result, exp) def test_union_bug_4564(self): @@ -59,9 +75,24 @@ def test_union_bug_4564(self): right = left + pd.offsets.Minute(15) result = left.union(right) - exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) + exp = TimedeltaIndex(sorted(set(left) | set(right))) tm.assert_index_equal(result, exp) + def test_union_freq_infer(self): + # When taking the union of two TimedeltaIndexes, we infer + # a freq even if the arguments don't have freq. This matches + # DatetimeIndex behavior. 
+ tdi = pd.timedelta_range("1 Day", periods=5) + left = tdi[[0, 1, 3, 4]] + right = tdi[[2, 3, 1]] + + assert left.freq is None + assert right.freq is None + + result = left.union(right) + tm.assert_index_equal(result, tdi) + assert result.freq == "D" + def test_intersection_bug_1708(self): index_1 = timedelta_range("1 day", periods=4, freq="h") index_2 = index_1 + pd.offsets.Hour(5) @@ -179,3 +210,51 @@ def test_intersection_non_monotonic(self, rng, expected, sort): assert isinstance(result.freq, Hour) else: assert result.freq is None + + +class TestTimedeltaIndexDifference: + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_freq(self, sort): + # GH14323: Difference of TimedeltaIndex should not preserve frequency + + index = timedelta_range("0 days", "5 days", freq="D") + + other = timedelta_range("1 days", "4 days", freq="D") + expected = TimedeltaIndex(["0 days", "5 days"], freq=None) + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_sort(self, sort): + + index = pd.TimedeltaIndex( + ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] + ) + + other = timedelta_range("1 days", "4 days", freq="D") + idx_diff = index.difference(other, sort) + + expected = TimedeltaIndex(["5 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["1 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/timedeltas/test_shift.py b/pandas/tests/indexes/timedeltas/test_shift.py new file mode 100644 index 0000000000000..98933ff0423ab --- /dev/null +++ b/pandas/tests/indexes/timedeltas/test_shift.py @@ -0,0 +1,75 @@ +import pytest + +from pandas.errors import NullFrequencyError + +import pandas as pd +from pandas import TimedeltaIndex +import pandas._testing as tm + + +class TestTimedeltaIndexShift: + + # ------------------------------------------------------------- + # TimedeltaIndex.shift is used by __add__/__sub__ + + def test_tdi_shift_empty(self): + # GH#9903 + idx = pd.TimedeltaIndex([], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(3, freq="H"), idx) + + def test_tdi_shift_hours(self): + # GH#9903 + idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + exp = pd.TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="H"), exp) + exp = pd.TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="H"), exp) + + def test_tdi_shift_minutes(self): + # GH#9903 + idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="T"), idx) + exp = pd.TimedeltaIndex(["05:03:00", "06:03:00", 
"9:03:00"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="T"), exp) + exp = pd.TimedeltaIndex(["04:57:00", "05:57:00", "8:57:00"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="T"), exp) + + def test_tdi_shift_int(self): + # GH#8083 + trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + result = trange.shift(1) + expected = TimedeltaIndex( + [ + "1 days 01:00:00", + "2 days 01:00:00", + "3 days 01:00:00", + "4 days 01:00:00", + "5 days 01:00:00", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_tdi_shift_nonstandard_freq(self): + # GH#8083 + trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + result = trange.shift(3, freq="2D 1s") + expected = TimedeltaIndex( + [ + "6 days 01:00:03", + "7 days 01:00:03", + "8 days 01:00:03", + "9 days 01:00:03", + "10 days 01:00:03", + ], + freq="D", + ) + tm.assert_index_equal(result, expected) + + def test_shift_no_freq(self): + # GH#19147 + tdi = TimedeltaIndex(["1 days 01:00:00", "2 days 01:00:00"], freq=None) + with pytest.raises(NullFrequencyError): + tdi.shift(2) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index ba0af7dd8136c..3b52b93fa6369 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -1,5 +1,4 @@ from datetime import timedelta -import re import numpy as np import pytest @@ -15,7 +14,7 @@ date_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm from ..datetimelike import DatetimeLike @@ -58,52 +57,6 @@ def test_fillna_timedelta(self): ) tm.assert_index_equal(idx.fillna("x"), exp) - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_freq(self, sort): - # GH14323: Difference of TimedeltaIndex should not preserve frequency - - index = timedelta_range("0 days", "5 days", freq="D") - - other = timedelta_range("1 days", "4 days", freq="D") - expected = TimedeltaIndex(["0 days", "5 days"], freq=None) - idx_diff = index.difference(other, sort) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = timedelta_range("2 days", "5 days", freq="D") - idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_sort(self, sort): - - index = pd.TimedeltaIndex( - ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] - ) - - other = timedelta_range("1 days", "4 days", freq="D") - idx_diff = index.difference(other, sort) - - expected = TimedeltaIndex(["5 days", "0 days"], freq=None) - - if sort is None: - expected = expected.sort_values() - - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = timedelta_range("2 days", "5 days", freq="D") - idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["1 days", "0 days"], freq=None) - - if sort is None: - expected = expected.sort_values() - - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - def test_isin(self): index = tm.makeTimedeltaIndex(4) @@ -179,16 +132,6 @@ def test_sort_values(self): tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]), check_dtype=False) - def test_get_duplicates(self): - idx = TimedeltaIndex(["1 day", "2 day", "2 day", "3 day", "3day", "4day"]) 
- - with tm.assert_produces_warning(FutureWarning): - # Deprecated - see GH20239 - result = idx.get_duplicates() - - ex = TimedeltaIndex(["2 day", "3day"]) - tm.assert_index_equal(result, ex) - def test_argmin_argmax(self): idx = TimedeltaIndex(["1 day 00:00:05", "1 day 00:00:01", "1 day 00:00:02"]) assert idx.argmin() == 1 @@ -234,7 +177,7 @@ def test_pickle(self): def test_hash_error(self): index = timedelta_range("1 days", periods=10) with pytest.raises( - TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) + TypeError, match=(f"unhashable type: {repr(type(index).__name__)}") ): hash(index) @@ -258,6 +201,13 @@ def test_append_numpy_bug_1681(self): result = a.append(c) assert (result["B"] == td).all() + def test_delete_doesnt_infer_freq(self): + # GH#30655 behavior matches DatetimeIndex + + tdi = pd.TimedeltaIndex(["1 Day", "2 Days", None, "3 Days", "4 Days"]) + result = tdi.delete(2) + assert result.freq is None + def test_fields(self): rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") tm.assert_index_equal(rng.days, Index([1, 1], dtype="int64")) @@ -336,11 +286,10 @@ def test_freq_conversion(self): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("unit", ["Y", "y", "M"]) - def test_unit_m_y_deprecated(self, unit): - with tm.assert_produces_warning(FutureWarning) as w: + def test_unit_m_y_raises(self, unit): + msg = "Units 'M' and 'Y' are no longer supported" + with pytest.raises(ValueError, match=msg): TimedeltaIndex([1, 3, 7], unit) - msg = r".* units are deprecated .*" - assert re.match(msg, str(w[0].message)) class TestTimeSeries: diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 1c1d0f1a735cf..1cef9de6a3a77 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import timedelta_range, to_timedelta -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Day, Second diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 2b4a6722666bf..477fc092a4e16 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -3,35 +3,13 @@ import numpy as np import pytest -from pandas._libs.tslib import iNaT - import pandas as pd from pandas import Series, TimedeltaIndex, isna, to_timedelta -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltas: def test_to_timedelta(self): - def conv(v): - return v.astype("m8[ns]") - - d1 = np.timedelta64(1, "D") - - with tm.assert_produces_warning(FutureWarning): - assert to_timedelta("1 days 06:05:01.00003", box=False) == conv( - d1 - + np.timedelta64(6 * 3600 + 5 * 60 + 1, "s") - + np.timedelta64(30, "us") - ) - - with tm.assert_produces_warning(FutureWarning): - assert to_timedelta("15.5us", box=False) == conv( - np.timedelta64(15500, "ns") - ) - - # empty string - result = to_timedelta("", box=False) - assert result.astype("int64") == iNaT result = to_timedelta(["", ""]) assert isna(result).all() @@ -41,12 +19,6 @@ def conv(v): expected = pd.Index(np.array([np.timedelta64(1, "s")])) tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - # ints - result = np.timedelta64(0, "ns") - expected = to_timedelta(0, box=False) - assert result == expected - # Series expected = 
Series([timedelta(days=1), timedelta(days=1, seconds=1)]) result = to_timedelta(Series(["1d", "1days 00:00:01"])) @@ -59,19 +31,6 @@ def conv(v): expected = to_timedelta([0, 10], unit="s") tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - # single element conversion - v = timedelta(seconds=1) - result = to_timedelta(v, box=False) - expected = np.timedelta64(timedelta(seconds=1)) - assert result == expected - - with tm.assert_produces_warning(FutureWarning): - v = np.timedelta64(timedelta(seconds=1)) - result = to_timedelta(v, box=False) - expected = np.timedelta64(timedelta(seconds=1)) - assert result == expected - # arrays of various dtypes arr = np.array([1] * 5, dtype="int64") result = to_timedelta(arr, unit="s") @@ -98,28 +57,6 @@ def conv(v): expected = TimedeltaIndex([np.timedelta64(1, "D")] * 5) tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - # Test with lists as input when box=false - expected = np.array(np.arange(3) * 1000000000, dtype="timedelta64[ns]") - result = to_timedelta(range(3), unit="s", box=False) - tm.assert_numpy_array_equal(expected, result) - - with tm.assert_produces_warning(FutureWarning): - result = to_timedelta(np.arange(3), unit="s", box=False) - tm.assert_numpy_array_equal(expected, result) - - with tm.assert_produces_warning(FutureWarning): - result = to_timedelta([0, 1, 2], unit="s", box=False) - tm.assert_numpy_array_equal(expected, result) - - with tm.assert_produces_warning(FutureWarning): - # Tests with fractional seconds as input: - expected = np.array( - [0, 500000000, 800000000, 1200000000], dtype="timedelta64[ns]" - ) - result = to_timedelta([0.0, 0.5, 0.8, 1.2], unit="s", box=False) - tm.assert_numpy_array_equal(expected, result) - def test_to_timedelta_invalid(self): # bad value for errors parameter @@ -136,8 +73,7 @@ def test_to_timedelta_invalid(self): # time not supported ATM msg = ( - "Value must be Timedelta, string, integer, float, timedelta or" - " convertible" + "Value must be Timedelta, string, integer, float, timedelta or convertible" ) with pytest.raises(ValueError, match=msg): to_timedelta(time(second=1)) @@ -208,13 +144,3 @@ def test_to_timedelta_float(self): result = pd.to_timedelta(arr, unit="s") expected_asi8 = np.arange(999990000, int(1e9), 1000, dtype="int64") tm.assert_numpy_array_equal(result.asi8, expected_asi8) - - def test_to_timedelta_box_deprecated(self): - result = np.timedelta64(0, "ns") - - # Deprecated - see GH24416 - with tm.assert_produces_warning(FutureWarning): - to_timedelta(0, box=False) - - expected = to_timedelta(0).to_timedelta64() - assert result == expected diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index fea34f795bd03..3c027b035c2b8 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -1,17 +1,13 @@ """ common utilities """ import itertools -from warnings import catch_warnings, filterwarnings +from warnings import catch_warnings import numpy as np from pandas.core.dtypes.common import is_scalar from pandas import DataFrame, Float64Index, MultiIndex, Series, UInt64Index, date_range -import pandas.util.testing as tm - -from pandas.io.formats.printing import pprint_thing - -_verbose = False +import pandas._testing as tm def _mklbl(prefix, n): @@ -97,7 +93,7 @@ def setup_method(self, method): self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev) self.frame_empty = DataFrame() - self.series_empty = Series() + self.series_empty = 
Series(dtype=object) # form agglomerates for kind in self._kinds: @@ -140,21 +136,18 @@ def get_result(self, obj, method, key, axis): return xp - def get_value(self, f, i, values=False): + def get_value(self, name, f, i, values=False): """ return the value for the location i """ # check against values if values: return f.values[i] - # this is equiv of f[col][row]..... - # v = f - # for a in reversed(i): - # v = v.__getitem__(a) - # return v - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - return f.ix[i] + elif name == "iat": + return f.iloc[i] + else: + assert name == "at" + return f.loc[i] def check_values(self, f, func, values=False): @@ -177,88 +170,41 @@ def check_values(self, f, func, values=False): tm.assert_almost_equal(result, expected) def check_result( - self, - name, - method1, - key1, - method2, - key2, - typs=None, - kinds=None, - axes=None, - fails=None, + self, method1, key1, method2, key2, typs=None, axes=None, fails=None, ): - def _eq(typ, kind, axis, obj, key1, key2): + def _eq(axis, obj, key1, key2): """ compare equal for these 2 keys """ if axis > obj.ndim - 1: return - def _print(result, error=None): - err = str(error) if error is not None else "" - msg = ( - "%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s," - "key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" - % (name, result, typ, kind, method1, method2, axis, err) - ) - if _verbose: - pprint_thing(msg) - try: rs = getattr(obj, method1).__getitem__(_axify(obj, key1, axis)) - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - try: - xp = self.get_result( - obj=obj, method=method2, key=key2, axis=axis - ) - except (KeyError, IndexError): - # TODO: why is this allowed? - result = "no comp" - _print(result) - return + try: + xp = self.get_result(obj=obj, method=method2, key=key2, axis=axis) + except (KeyError, IndexError): + # TODO: why is this allowed? 
+ return - detail = None + if is_scalar(rs) and is_scalar(xp): + assert rs == xp + else: + tm.assert_equal(rs, xp) - try: - if is_scalar(rs) and is_scalar(xp): - assert rs == xp - else: - tm.assert_equal(rs, xp) - result = "ok" - except AssertionError as exc: - detail = str(exc) - result = "fail" - - # reverse the checks - if fails is True: - if result == "fail": - result = "ok (fail)" - - _print(result) - if not result.startswith("ok"): - raise AssertionError(detail) - - except AssertionError: - raise except (IndexError, TypeError, KeyError) as detail: # if we are in fails, the ok, otherwise raise it if fails is not None: if isinstance(detail, fails): - result = "ok ({0.__name__})".format(type(detail)) - _print(result) + result = f"ok ({type(detail).__name__})" return result = type(detail).__name__ - raise AssertionError(_print(result, error=detail)) + raise AssertionError(result, detail) if typs is None: typs = self._typs - if kinds is None: - kinds = self._kinds - if axes is None: axes = [0, 1] elif not isinstance(axes, (tuple, list)): @@ -266,9 +212,7 @@ def _print(result, error=None): axes = [axes] # check - for kind in kinds: - if kind not in self._kinds: - continue + for kind in self._kinds: d = getattr(self, kind) for ax in axes: @@ -277,4 +221,4 @@ def _print(result, error=None): continue obj = d[typ] - _eq(typ=typ, kind=kind, axis=ax, obj=obj, key1=key1, key2=key2) + _eq(axis=ax, obj=obj, key1=key1, key2=key2) diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index bbce786fc07ba..634020982b1c2 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, IntervalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestIntervalIndex: @@ -64,7 +64,7 @@ def test_non_matching(self): s = self.s # this is a departure from our current - # indexin scheme, but simpler + # indexing scheme, but simpler with pytest.raises(KeyError, match="^$"): s.loc[[-1, 3, 4, 5]] diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index a86a9d16d3f9f..43036fbbd9844 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -4,7 +4,7 @@ import pytest from pandas import Interval, IntervalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestIntervalIndex: diff --git a/pandas/tests/indexing/multiindex/conftest.py b/pandas/tests/indexing/multiindex/conftest.py index e58e6ed0d5d83..e6d5a9eb84410 100644 --- a/pandas/tests/indexing/multiindex/conftest.py +++ b/pandas/tests/indexing/multiindex/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 3183721eeb54f..8bfba8c12e934 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -2,13 +2,13 @@ import pytest from pandas import DataFrame, MultiIndex, Series -from pandas.core import common as com -import pandas.util.testing as tm +import pandas._testing as tm +import pandas.core.common as com def test_detect_chained_assignment(): # Inplace ops, 
originally from: - # http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug + # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug a = [12, 23] b = [123, None] c = [1234, 2345] diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 4f95e6bd28989..8ea825da8f94f 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -2,8 +2,8 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm # ---------------------------------------------------------------------------- # test indexing of Series with multi-level Index @@ -108,7 +108,7 @@ def test_series_getitem_indexing_errors( def test_series_getitem_corner_generator( - multiindex_year_month_day_dataframe_random_data + multiindex_year_month_day_dataframe_random_data, ): s = multiindex_year_month_day_dataframe_random_data["A"] result = s[(x > 0 for x in s)] diff --git a/pandas/tests/indexing/multiindex/test_iloc.py b/pandas/tests/indexing/multiindex/test_iloc.py index 2c2e4d06f1ae3..9859c7235c380 100644 --- a/pandas/tests/indexing/multiindex/test_iloc.py +++ b/pandas/tests/indexing/multiindex/test_iloc.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index aab44daf8d17f..8ea1cebd7bf7b 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.slow diff --git a/pandas/tests/indexing/multiindex/test_ix.py b/pandas/tests/indexing/multiindex/test_ix.py index 35f3137dac059..01b0b392d52a3 100644 --- a/pandas/tests/indexing/multiindex/test_ix.py +++ b/pandas/tests/indexing/multiindex/test_ix.py @@ -4,7 +4,7 @@ from pandas.errors import PerformanceWarning from pandas import DataFrame, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndex: diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 9eeee897bfbb5..3b8aa963ac698 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -1,12 +1,10 @@ -import itertools - import numpy as np import pytest import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm @pytest.fixture @@ -50,7 +48,9 @@ def test_loc_getitem_series(self): empty = Series(data=[], dtype=np.float64) expected = Series( - [], index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64) + [], + index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + dtype=np.float64, ) result = x.loc[empty] tm.assert_series_equal(result, expected) @@ -72,7 +72,9 @@ def test_loc_getitem_array(self): # empty array: empty = np.array([]) expected = Series( - [], index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64) + [], 
+ index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64), + dtype="float64", ) result = x.loc[empty] tm.assert_series_equal(result, expected) @@ -223,17 +225,13 @@ def test_loc_getitem_int_slice(self): # GH 3053 # loc should treat integer slices like label slices - index = MultiIndex.from_tuples( - [t for t in itertools.product([6, 7, 8], ["a", "b"])] - ) + index = MultiIndex.from_product([[6, 7, 8], ["a", "b"]]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] expected = df tm.assert_frame_equal(result, expected) - index = MultiIndex.from_tuples( - [t for t in itertools.product([10, 20, 30], ["a", "b"])] - ) + index = MultiIndex.from_product([[10, 20, 30], ["a", "b"]]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] expected = df.iloc[2:] @@ -413,3 +411,60 @@ def test_loc_setitem_single_column_slice(): df.loc[:, "B"] = np.arange(4) expected.iloc[:, 2] = np.arange(4) tm.assert_frame_equal(df, expected) + + +def test_loc_nan_multiindex(): + # GH 5286 + tups = [ + ("Good Things", "C", np.nan), + ("Good Things", "R", np.nan), + ("Bad Things", "C", np.nan), + ("Bad Things", "T", np.nan), + ("Okay Things", "N", "B"), + ("Okay Things", "N", "D"), + ("Okay Things", "B", np.nan), + ("Okay Things", "D", np.nan), + ] + df = DataFrame( + np.ones((8, 4)), + columns=Index(["d1", "d2", "d3", "d4"]), + index=MultiIndex.from_tuples(tups, names=["u1", "u2", "u3"]), + ) + result = df.loc["Good Things"].loc["C"] + expected = DataFrame( + np.ones((1, 4)), + index=Index([np.nan], dtype="object", name="u3"), + columns=Index(["d1", "d2", "d3", "d4"], dtype="object"), + ) + tm.assert_frame_equal(result, expected) + + +def test_loc_period_string_indexing(): + # GH 9892 + a = pd.period_range("2013Q1", "2013Q4", freq="Q") + i = (1111, 2222, 3333) + idx = pd.MultiIndex.from_product((a, i), names=("Periode", "CVR")) + df = pd.DataFrame( + index=idx, + columns=( + "OMS", + "OMK", + "RES", + "DRIFT_IND", + "OEVRIG_IND", + "FIN_IND", + "VARE_UD", + "LOEN_UD", + "FIN_UD", + ), + ) + result = df.loc[("2013Q1", 1111), "OMS"] + expected = pd.Series( + [np.nan], + dtype=object, + name="OMS", + index=pd.MultiIndex.from_tuples( + [(pd.Period("2013Q1"), 1111)], names=["Periode", "CVR"] + ), + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index bf1e999b06860..8163de8588232 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,12 +1,11 @@ import numpy as np -import pytest import pandas._libs.index as _index from pandas.errors import PerformanceWarning import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexBasic: @@ -20,7 +19,7 @@ def test_multiindex_perf_warn(self): } ).set_index(["jim", "joe"]) - with tm.assert_produces_warning(PerformanceWarning, clear=[pd.core.index]): + with tm.assert_produces_warning(PerformanceWarning): df.loc[(1, "z")] df = df.iloc[[2, 1, 3, 0]] @@ -47,17 +46,6 @@ def test_multiindex_contains_dropped(self): assert "a" in idx.levels[0] assert "a" not in idx - @pytest.mark.parametrize( - "data, expected", - [ - (MultiIndex.from_product([(), ()]), True), - (MultiIndex.from_product([(1, 2), (3, 4)]), True), - (MultiIndex.from_product([("a", "b"), (1, 2)]), False), - ], - ) - def test_multiindex_is_homogeneous_type(self, data, expected): - assert 
data._is_homogeneous_type is expected - def test_indexing_over_hashtable_size_cutoff(self): n = 10000 diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 05ea949721b65..9d181bdcb9491 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexPartial: diff --git a/pandas/tests/indexing/multiindex/test_set_ops.py b/pandas/tests/indexing/multiindex/test_set_ops.py index 66cb0d0d46380..f2cbfadb3cfa5 100644 --- a/pandas/tests/indexing/multiindex/test_set_ops.py +++ b/pandas/tests/indexing/multiindex/test_set_ops.py @@ -1,7 +1,7 @@ from numpy.random import randn from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexSetOps: diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 7fc95ba62a888..aebd1ad2573ed 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -4,8 +4,8 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm class TestMultiIndexSetItem: @@ -141,7 +141,7 @@ def test_multiindex_setitem(self): df.loc["bar"] *= 2 # from SO - # http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation + # https://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation df_orig = DataFrame.from_dict( { "price": { diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index f1f11285696f9..6fa9d3bd2cdbb 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -1,5 +1,3 @@ -from warnings import catch_warnings - import numpy as np import pytest @@ -7,12 +5,11 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp +import pandas._testing as tm from pandas.core.indexing import _non_reducing_slice from pandas.tests.indexing.common import _mklbl -import pandas.util.testing as tm -@pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") class TestMultiIndexSlicers: def test_per_axis_per_level_getitem(self): @@ -492,6 +489,44 @@ def test_loc_axis_arguments(self): with pytest.raises(ValueError): df.loc(axis="foo")[:, :, ["C1", "C3"]] + def test_loc_axis_single_level_multi_col_indexing_multiindex_col_df(self): + + # GH29519 + df = pd.DataFrame( + np.arange(27).reshape(3, 9), + columns=pd.MultiIndex.from_product( + [["a1", "a2", "a3"], ["b1", "b2", "b3"]] + ), + ) + result = df.loc(axis=1)["a1":"a2"] + expected = df.iloc[:, :-3] + + tm.assert_frame_equal(result, expected) + + def test_loc_axis_single_level_single_col_indexing_multiindex_col_df(self): + + # GH29519 + df = pd.DataFrame( + np.arange(27).reshape(3, 9), + columns=pd.MultiIndex.from_product( + [["a1", "a2", "a3"], ["b1", "b2", "b3"]] + ), + ) + result = df.loc(axis=1)["a1"] + expected = df.iloc[:, :3] + expected.columns = ["b1", "b2", "b3"] + + tm.assert_frame_equal(result, expected) + + def test_loc_ax_single_level_indexer_simple_df(self): + + # GH29519 + # test single level indexing on single 
index column data frame + df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=["a", "b", "c"]) + result = df.loc(axis=1)["a"] + expected = pd.Series(np.array([0, 3, 6]), name="a") + tm.assert_series_equal(result, expected) + def test_per_axis_per_level_setitem(self): # test index maker @@ -637,8 +672,6 @@ def test_multiindex_label_slicing_with_negative_step(self): def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) - with catch_warnings(record=True): - tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) assert_slices_equivalent(SLC[::-1], SLC[::-1]) diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 5b8300827609a..4bec0f429a34e 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -2,7 +2,7 @@ from numpy.random import randn from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexSorted: diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index 99f343c2f4a7d..db8c0c643a623 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -1,11 +1,9 @@ -from itertools import product - import numpy as np import pytest from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm @pytest.fixture @@ -159,10 +157,8 @@ def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe): def test_xs_integer_key(): # see gh-2107 dates = range(20111201, 20111205) - ids = "abcde" - index = MultiIndex.from_tuples( - [x for x in product(dates, ids)], names=["date", "secid"] - ) + ids = list("abcde") + index = MultiIndex.from_product([dates, ids], names=["date", "secid"]) df = DataFrame(np.random.randn(len(index), 3), index, ["X", "Y", "Z"]) result = df.xs(20111201, level="date") @@ -211,7 +207,7 @@ def test_xs_level_series_ymd(multiindex_year_month_day_dataframe_random_data): def test_xs_level_series_slice_not_implemented( - multiindex_year_month_day_dataframe_random_data + multiindex_year_month_day_dataframe_random_data, ): # this test is not explicitly testing .xs functionality # TODO: move to another module or refactor diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index aa73bd728595f..621417eb38d94 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestIndexingCallable: @@ -17,10 +17,14 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[lambda x: x.A > 2,] # noqa: E231 + res = df.loc[ + lambda x: x.A > 2, + ] # noqa: E231 tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 - res = df.loc[lambda x: x.A > 2,] # noqa: E231 + res = df.loc[ + lambda x: x.A > 2, + ] # noqa: E231 tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 res = df.loc[lambda x: x.B == "b", :] @@ -90,7 +94,9 @@ def test_frame_loc_callable_labels(self): res = df.loc[lambda x: ["A", "C"]] tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[lambda x: ["A", "C"],] # noqa: E231 + res = df.loc[ + lambda x: ["A", "C"], + ] 
# noqa: E231 tm.assert_frame_equal(res, df.loc[["A", "C"],]) # noqa: E231 res = df.loc[lambda x: ["A", "C"], :] diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 6c81a00cb8f34..8c8dece53277e 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -12,10 +12,12 @@ Index, Interval, Series, + Timedelta, Timestamp, + conftest, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT -import pandas.util.testing as tm class TestCategoricalIndex: @@ -72,14 +74,21 @@ def test_loc_scalar(self): df.loc["d"] = 10 msg = ( - "cannot insert an item into a CategoricalIndex that is not" - " already an existing category" + "cannot insert an item into a CategoricalIndex that is not " + "already an existing category" ) with pytest.raises(TypeError, match=msg): df.loc["d", "A"] = 10 with pytest.raises(TypeError, match=msg): df.loc["d", "C"] = 10 + msg = ( + r"cannot do label indexing on with these indexers \[1\] of " + ) + with pytest.raises(TypeError, match=msg): + df.loc[1] + def test_getitem_scalar(self): cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) @@ -356,8 +365,9 @@ def test_loc_listlike(self): # not all labels in the categories with pytest.raises( KeyError, - match="'a list-indexer must only include values that are in the" - " categories'", + match=( + "'a list-indexer must only include values that are in the categories'" + ), ): self.df2.loc[["a", "d"]] @@ -472,7 +482,7 @@ def test_getitem_with_listlike(self): [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats ) dummies = pd.get_dummies(cats) - result = dummies[[c for c in dummies.columns]] + result = dummies[list(dummies.columns)] tm.assert_frame_equal(result, expected) def test_setitem_listlike(self): @@ -645,22 +655,13 @@ def test_reindexing(self): df.reindex(["a"], limit=2) def test_loc_slice(self): - # slicing - # not implemented ATM # GH9748 - - msg = ( - "cannot do slice indexing on {klass} with these " - r"indexers \[1\] of {kind}".format( - klass=str(CategoricalIndex), kind=str(int) - ) - ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(KeyError, match="1"): self.df.loc[1:5] - # result = df.loc[1:5] - # expected = df.iloc[[1,2,3,4]] - # tm.assert_frame_equal(result, expected) + result = self.df.loc["b":"c"] + expected = self.df.iloc[[2, 3, 4]] + tm.assert_frame_equal(result, expected) def test_loc_and_at_with_categorical_index(self): # GH 20629 @@ -754,3 +755,68 @@ def test_map_with_dict_or_series(self): output = cur_index.map(mapper) # Order of categories in output can be different tm.assert_index_equal(expected, output) + + @pytest.mark.parametrize( + "idx_values", + [ + # python types + [1, 2, 3], + [-1, -2, -3], + [1.5, 2.5, 3.5], + [-1.5, -2.5, -3.5], + # numpy int/uint + *[np.array([1, 2, 3], dtype=dtype) for dtype in conftest.ALL_INT_DTYPES], + # numpy floats + *[np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in conftest.FLOAT_DTYPES], + # numpy object + np.array([1, "b", 3.5], dtype=object), + # pandas scalars + [Interval(1, 4), Interval(4, 6), Interval(6, 9)], + [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)], + [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], + # pandas Integer arrays + *[pd.array([1, 2, 3], dtype=dtype) for dtype in conftest.ALL_EA_INT_DTYPES], + # other pandas arrays + pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, + pd.date_range("2019-01-01", periods=3).array, + pd.timedelta_range(start="1d", 
periods=3).array, + ], + ) + def test_loc_with_non_string_categories(self, idx_values, ordered_fixture): + # GH-17569 + cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture) + df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx) + sl = slice(idx_values[0], idx_values[1]) + + # scalar selection + result = df.loc[idx_values[0]] + expected = Series(["foo"], index=["A"], name=idx_values[0]) + tm.assert_series_equal(result, expected) + + # list selection + result = df.loc[idx_values[:2]] + expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) + tm.assert_frame_equal(result, expected) + + # slice selection + result = df.loc[sl] + expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) + tm.assert_frame_equal(result, expected) + + # scalar assignment + result = df.copy() + result.loc[idx_values[0]] = "qux" + expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx) + tm.assert_frame_equal(result, expected) + + # list assignment + result = df.copy() + result.loc[idx_values[:2], "A"] = ["qux", "qux2"] + expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) + tm.assert_frame_equal(result, expected) + + # slice assignment + result = df.copy() + result.loc[sl, "A"] = ["qux", "qux2"] + expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index bd106ba9c36f1..e845487ffca9a 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range, option_context -from pandas.core import common as com -import pandas.util.testing as tm +import pandas._testing as tm +import pandas.core.common as com class TestCaching: @@ -273,7 +273,7 @@ def random_text(nobs=100): str(df) # from SO: - # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc + # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) df["group"] = "b" @@ -361,13 +361,12 @@ def check(result, expected): result4 = df["A"].iloc[2] check(result4, expected) - @pytest.mark.filterwarnings("ignore::FutureWarning") def test_cache_updating(self): # GH 4939, make sure to update the cache on setitem df = tm.makeDataFrame() df["A"] # cache series - df.ix["Hello Friend"] = df.ix[0] + df.loc["Hello Friend"] = df.iloc[0] assert "Hello Friend" in df["A"].index assert "Hello Friend" in df["B"].index @@ -393,14 +392,3 @@ def test_cache_updating(self): tm.assert_frame_equal(df, expected) expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) - - def test_deprecate_is_copy(self): - # GH18801 - df = DataFrame({"A": [1, 2, 3]}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # getter - df.is_copy - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # setter - df.is_copy = "test deprecated is_copy" diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 4f38d7beb9c0b..b904755b099d0 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -7,7 +7,7 @@ import pandas.compat as compat import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm 
############################################################### # Index / Series common tests which may trigger dtype coercions @@ -432,13 +432,19 @@ def test_insert_index_datetimes(self, fill_val, exp_dtype): ) self._assert_insert_conversion(obj, fill_val, exp, exp_dtype) - msg = "Passed item and index have different timezone" if fill_val.tz: - with pytest.raises(ValueError, match=msg): + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): obj.insert(1, pd.Timestamp("2012-01-01")) - with pytest.raises(ValueError, match=msg): - obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) + msg = "Timezones don't match" + with pytest.raises(ValueError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) + + else: + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) msg = "cannot insert DatetimeIndex with incompatible label" with pytest.raises(TypeError, match=msg): @@ -479,22 +485,20 @@ def test_insert_index_period(self, insert, coerced_val, coerced_dtype): obj = pd.PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq="M") assert obj.dtype == "period[M]" + data = [ + pd.Period("2011-01", freq="M"), + coerced_val, + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ] if isinstance(insert, pd.Period): - index_type = pd.PeriodIndex + exp = pd.PeriodIndex(data, freq="M") + self._assert_insert_conversion(obj, insert, exp, coerced_dtype) else: - index_type = pd.Index - - exp = index_type( - [ - pd.Period("2011-01", freq="M"), - coerced_val, - pd.Period("2011-02", freq="M"), - pd.Period("2011-03", freq="M"), - pd.Period("2011-04", freq="M"), - ], - freq="M", - ) - self._assert_insert_conversion(obj, insert, exp, coerced_dtype) + msg = r"Unexpected keyword arguments {'freq'}" + with pytest.raises(TypeError, match=msg): + pd.Index(data, freq="M") def test_insert_index_complex128(self): pass @@ -515,12 +519,12 @@ def _assert_where_conversion( res = target.where(cond, values) self._assert(res, expected, expected_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) @pytest.mark.parametrize( "fill_val,exp_dtype", [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], ) - def test_where_object(self, klass, fill_val, exp_dtype): + def test_where_object(self, index_or_series, fill_val, exp_dtype): + klass = index_or_series obj = klass(list("abcd")) assert obj.dtype == np.object cond = klass([True, False, True, False]) @@ -541,12 +545,12 @@ def test_where_object(self, klass, fill_val, exp_dtype): exp = klass(["a", values[1], "c", values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) @pytest.mark.parametrize( "fill_val,exp_dtype", [(1, np.int64), (1.1, np.float64), (1 + 1j, np.complex128), (True, np.object)], ) - def test_where_int64(self, klass, fill_val, exp_dtype): + def test_where_int64(self, index_or_series, fill_val, exp_dtype): + klass = index_or_series if klass is pd.Index and exp_dtype is np.complex128: pytest.skip("Complex Index not supported") obj = klass([1, 2, 3, 4]) @@ -563,7 +567,6 @@ def test_where_int64(self, klass, fill_val, exp_dtype): exp = klass([1, values[1], 3, values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", 
"index"]) @pytest.mark.parametrize( "fill_val, exp_dtype", [ @@ -573,7 +576,8 @@ def test_where_int64(self, klass, fill_val, exp_dtype): (True, np.object), ], ) - def test_where_float64(self, klass, fill_val, exp_dtype): + def test_where_float64(self, index_or_series, fill_val, exp_dtype): + klass = index_or_series if klass is pd.Index and exp_dtype is np.complex128: pytest.skip("Complex Index not supported") obj = klass([1.1, 2.2, 3.3, 4.4]) @@ -783,19 +787,18 @@ def _assert_fillna_conversion(self, original, value, expected, expected_dtype): res = target.fillna(value) self._assert(res, expected, expected_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) @pytest.mark.parametrize( "fill_val, fill_dtype", [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], ) - def test_fillna_object(self, klass, fill_val, fill_dtype): + def test_fillna_object(self, index_or_series, fill_val, fill_dtype): + klass = index_or_series obj = klass(["a", np.nan, "c", "d"]) assert obj.dtype == np.object exp = klass(["a", fill_val, "c", "d"]) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) @pytest.mark.parametrize( "fill_val,fill_dtype", [ @@ -805,7 +808,8 @@ def test_fillna_object(self, klass, fill_val, fill_dtype): (True, np.object), ], ) - def test_fillna_float64(self, klass, fill_val, fill_dtype): + def test_fillna_float64(self, index_or_series, fill_val, fill_dtype): + klass = index_or_series obj = klass([1.1, np.nan, 3.3, 4.4]) assert obj.dtype == np.float64 @@ -833,7 +837,6 @@ def test_fillna_series_complex128(self, fill_val, fill_dtype): exp = pd.Series([1 + 1j, fill_val, 3 + 3j, 4 + 4j]) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) @pytest.mark.parametrize( "fill_val,fill_dtype", [ @@ -844,7 +847,8 @@ def test_fillna_series_complex128(self, fill_val, fill_dtype): ], ids=["datetime64", "datetime64tz", "object", "object"], ) - def test_fillna_datetime(self, klass, fill_val, fill_dtype): + def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): + klass = index_or_series obj = klass( [ pd.Timestamp("2011-01-01"), @@ -865,7 +869,6 @@ def test_fillna_datetime(self, klass, fill_val, fill_dtype): ) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index]) @pytest.mark.parametrize( "fill_val,fill_dtype", [ @@ -876,7 +879,8 @@ def test_fillna_datetime(self, klass, fill_val, fill_dtype): ("x", np.object), ], ) - def test_fillna_datetime64tz(self, klass, fill_val, fill_dtype): + def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype): + klass = index_or_series tz = "US/Eastern" obj = klass( @@ -929,7 +933,7 @@ class TestReplaceSeriesCoercion(CoercionBase): klasses = ["series"] method = "replace" - rep = {} # type: Dict[str, List] + rep: Dict[str, List] = {} rep["object"] = ["a", "b"] rep["int64"] = [4, 5] rep["float64"] = [1.1, 2.2] @@ -988,10 +992,6 @@ class TestReplaceSeriesCoercion(CoercionBase): ], ) def test_replace_series(self, how, to_key, from_key): - if from_key == "bool" and how == "series": - # doesn't work in PY3, though ...dict_from_bool works fine - pytest.skip("doesn't work as in PY3") - index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key diff --git 
a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index ab4a8fe89c6e3..42f992339f036 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -2,10 +2,11 @@ from dateutil import tz import numpy as np +import pytest import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndex: @@ -242,11 +243,8 @@ def test_series_partial_set_datetime(self): Timestamp("2011-01-02"), Timestamp("2011-01-03"), ] - exp = Series( - [np.nan, 0.2, np.nan], index=pd.DatetimeIndex(keys, name="idx"), name="s" - ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[keys] def test_series_partial_set_period(self): # GH 11497 @@ -273,12 +271,8 @@ def test_series_partial_set_period(self): pd.Period("2011-01-02", freq="D"), pd.Period("2011-01-03", freq="D"), ] - exp = Series( - [np.nan, 0.2, np.nan], index=pd.PeriodIndex(keys, name="idx"), name="s" - ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[keys] - tm.assert_series_equal(result, exp) + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[keys] def test_nanosecond_getitem_setitem_with_tz(self): # GH 11679 diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index eadaeaba63a26..2cc8232566aa9 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestFloatIndexers: @@ -90,25 +90,30 @@ def test_scalar_non_numeric(self): else: error = TypeError msg = ( - r"cannot do (label|index|positional) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}|" - "Cannot index by location index with a" - " non-integer key".format(klass=type(i), kind=str(float)) + r"cannot do (label|index|positional) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}|" + "Cannot index by location index with a " + "non-integer key".format(klass=type(i), kind=str(float)) ) with pytest.raises(error, match=msg): idxr(s)[3.0] # label based can be a TypeError or KeyError - if s.index.inferred_type in ["string", "unicode", "mixed"]: + if s.index.inferred_type in { + "categorical", + "string", + "unicode", + "mixed", + }: error = KeyError msg = r"^3$" else: error = TypeError msg = ( - r"cannot do (label|index) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}".format(klass=type(i), kind=str(float)) + r"cannot do (label|index) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=type(i), kind=str(float)) ) with pytest.raises(error, match=msg): s.loc[3.0] @@ -132,9 +137,8 @@ def test_scalar_non_numeric(self): elif s.index.inferred_type in ["datetime64", "timedelta64", "period"]: # these should prob work - # and are inconsisten between series/dataframe ATM - # for idxr in [lambda x: x.ix, - # lambda x: x]: + # and are inconsistent between series/dataframe ATM + # for idxr in [lambda x: x]: # s2 = s.copy() # # with pytest.raises(TypeError): @@ -340,9 +344,9 @@ def test_slice_non_numeric(self): for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: msg = ( - 
"cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s.iloc[l] @@ -350,10 +354,10 @@ def test_slice_non_numeric(self): for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers" - r" \[(3|4)(\.0)?\]" - r" of ({kind_float}|{kind_int})".format( + "cannot do slice indexing " + r"on {klass} with these indexers " + r"\[(3|4)(\.0)?\] " + r"of ({kind_float}|{kind_int})".format( klass=type(index), kind_float=str(float), kind_int=str(int), @@ -366,9 +370,9 @@ def test_slice_non_numeric(self): for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s.iloc[l] = 0 @@ -420,9 +424,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -444,9 +448,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[-6\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[-6\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[slice(-6.0, 6.0)] @@ -470,9 +474,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(2|3)\.5\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(2|3)\.5\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -488,9 +492,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] = 0 @@ -511,9 +515,9 @@ def test_integer_positional_indexing(self): klass = RangeIndex msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(2|4)\.0\] of" - " {kind}".format(klass=str(klass), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(2|4)\.0\] of " + "{kind}".format(klass=str(klass), kind=str(float)) ) with pytest.raises(TypeError, match=msg): idxr(s)[l] @@ -536,9 +540,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(0|1)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(0|1)\.0\] of " + 
"{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -551,9 +555,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[-10\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[-10\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[slice(-10.0, 10.0)] @@ -570,9 +574,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[0\.5\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[0\.5\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -587,9 +591,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] = 0 @@ -726,25 +730,15 @@ def test_floating_misc(self): tm.assert_series_equal(result1, result3) tm.assert_series_equal(result1, result4) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result1 = s[[1.6, 5, 10]] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result2 = s.loc[[1.6, 5, 10]] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result3 = s.loc[[1.6, 5, 10]] - tm.assert_series_equal(result1, result2) - tm.assert_series_equal(result1, result3) - tm.assert_series_equal(result1, Series([np.nan, 2, 4], index=[1.6, 5, 10])) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result1 = s[[0, 1, 2]] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result2 = s.loc[[0, 1, 2]] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result3 = s.loc[[0, 1, 2]] - tm.assert_series_equal(result1, result2) - tm.assert_series_equal(result1, result3) - tm.assert_series_equal(result1, Series([0.0, np.nan, np.nan], index=[0, 1, 2])) + with pytest.raises(KeyError, match="with any missing labels"): + s[[1.6, 5, 10]] + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[1.6, 5, 10]] + + with pytest.raises(KeyError, match="with any missing labels"): + s[[0, 1, 2]] + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[0, 1, 2]] result1 = s.loc[[2.5, 5]] result2 = s.loc[[2.5, 5]] diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 7c1d8ddd14317..26dedf02e7333 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1,16 +1,17 @@ """ test positional based indexing with iloc """ -from warnings import catch_warnings, filterwarnings, simplefilter +from datetime import datetime +from warnings import catch_warnings, simplefilter import numpy as np import pytest import pandas as pd from pandas import DataFrame, Series, concat, date_range, isna +import pandas._testing as tm from pandas.api.types import is_scalar from pandas.core.indexing import IndexingError from pandas.tests.indexing.common import Base -import pandas.util.testing as tm class TestiLoc(Base): @@ -122,7 +123,7 @@ def check(result, expected): [ 
([slice(None), ["A", "D"]]), (["1", "2"], slice(None)), - ([pd.datetime(2019, 1, 1)], slice(None)), + ([datetime(2019, 1, 1)], slice(None)), ], ) def test_iloc_non_integer_raises(self, index, columns, index_vals, column_vals): @@ -135,32 +136,22 @@ def test_iloc_non_integer_raises(self, index, columns, index_vals, column_vals): df.iloc[index_vals, column_vals] def test_iloc_getitem_int(self): - # integer self.check_result( - "integer", "iloc", 2, "ix", {0: 4, 1: 6, 2: 8}, typs=["ints", "uints"] - ) - self.check_result( - "integer", "iloc", 2, - "indexer", + "iloc", 2, typs=["labels", "mixed", "ts", "floats", "empty"], fails=IndexError, ) def test_iloc_getitem_neg_int(self): - # neg integer self.check_result( - "neg int", "iloc", -1, "ix", {0: 6, 1: 9, 2: 12}, typs=["ints", "uints"] - ) - self.check_result( - "neg int", "iloc", -1, - "indexer", + "iloc", -1, typs=["labels", "mixed", "ts", "floats", "empty"], fails=IndexError, @@ -193,61 +184,17 @@ def test_iloc_array_not_mutating_negative_indices(self): tm.assert_numpy_array_equal(array_with_neg_numbers, array_copy) def test_iloc_getitem_list_int(self): - - # list of ints self.check_result( - "list int", "iloc", [0, 1, 2], - "ix", - {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, - typs=["ints", "uints"], - ) - self.check_result( - "list int", - "iloc", - [2], - "ix", - {0: [4], 1: [6], 2: [8]}, - typs=["ints", "uints"], - ) - self.check_result( - "list int", "iloc", [0, 1, 2], - "indexer", - [0, 1, 2], typs=["labels", "mixed", "ts", "floats", "empty"], fails=IndexError, ) # array of ints (GH5006), make sure that a single indexer is returning # the correct type - self.check_result( - "array int", - "iloc", - np.array([0, 1, 2]), - "ix", - {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, - typs=["ints", "uints"], - ) - self.check_result( - "array int", - "iloc", - np.array([2]), - "ix", - {0: [4], 1: [6], 2: [8]}, - typs=["ints", "uints"], - ) - self.check_result( - "array int", - "iloc", - np.array([0, 1, 2]), - "indexer", - [0, 1, 2], - typs=["labels", "mixed", "ts", "floats", "empty"], - fails=IndexError, - ) def test_iloc_getitem_neg_int_can_reach_first_index(self): # GH10547 and GH10779 @@ -277,17 +224,6 @@ def test_iloc_getitem_neg_int_can_reach_first_index(self): tm.assert_series_equal(result, expected) def test_iloc_getitem_dups(self): - - self.check_result( - "list int (dups)", - "iloc", - [0, 1, 1, 3], - "ix", - {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, - kinds=["series", "frame"], - typs=["ints", "uints"], - ) - # GH 6766 df1 = DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) df2 = DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) @@ -302,32 +238,12 @@ def test_iloc_getitem_dups(self): tm.assert_series_equal(result, expected) def test_iloc_getitem_array(self): - - # array like - s = Series(index=range(1, 4)) - self.check_result( - "array like", - "iloc", - s.index, - "ix", - {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, - typs=["ints", "uints"], - ) + # TODO: test something here? + pass def test_iloc_getitem_bool(self): - - # boolean indexers - b = [True, False, True, False] - self.check_result("bool", "iloc", b, "ix", b, typs=["ints", "uints"]) - self.check_result( - "bool", - "iloc", - b, - "ix", - b, - typs=["labels", "mixed", "ts", "floats", "empty"], - fails=IndexError, - ) + # TODO: test something here? 
+ pass @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]]) def test_iloc_getitem_bool_diff_len(self, index): @@ -340,25 +256,8 @@ def test_iloc_getitem_bool_diff_len(self, index): _ = s.iloc[index] def test_iloc_getitem_slice(self): - - # slices - self.check_result( - "slice", - "iloc", - slice(1, 3), - "ix", - {0: [2, 4], 1: [3, 6], 2: [4, 8]}, - typs=["ints", "uints"], - ) - self.check_result( - "slice", - "iloc", - slice(1, 3), - "indexer", - slice(1, 3), - typs=["labels", "mixed", "ts", "floats", "empty"], - fails=IndexError, - ) + # TODO: test something here? + pass def test_iloc_getitem_slice_dups(self): @@ -463,69 +362,53 @@ def test_iloc_setitem_dups(self): df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) tm.assert_frame_equal(df, expected) + # TODO: GH#27620 this test used to compare iloc against ix; check if this + # is redundant with another test comparing iloc against loc def test_iloc_getitem_frame(self): df = DataFrame( np.random.randn(10, 4), index=range(0, 20, 2), columns=range(0, 8, 2) ) result = df.iloc[2] - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - exp = df.ix[4] + exp = df.loc[4] tm.assert_series_equal(result, exp) result = df.iloc[2, 2] - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - exp = df.ix[4, 4] + exp = df.loc[4, 4] assert result == exp # slice result = df.iloc[4:8] - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - expected = df.ix[8:14] + expected = df.loc[8:14] tm.assert_frame_equal(result, expected) result = df.iloc[:, 2:3] - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - expected = df.ix[:, 4:5] + expected = df.loc[:, 4:5] tm.assert_frame_equal(result, expected) # list of integers result = df.iloc[[0, 1, 3]] - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - expected = df.ix[[0, 2, 6]] + expected = df.loc[[0, 2, 6]] tm.assert_frame_equal(result, expected) result = df.iloc[[0, 1, 3], [0, 1]] - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - expected = df.ix[[0, 2, 6], [0, 2]] + expected = df.loc[[0, 2, 6], [0, 2]] tm.assert_frame_equal(result, expected) # neg indices result = df.iloc[[-1, 1, 3], [-1, 1]] - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - expected = df.ix[[18, 2, 6], [6, 2]] + expected = df.loc[[18, 2, 6], [6, 2]] tm.assert_frame_equal(result, expected) # dups indices result = df.iloc[[-1, -1, 1, 3], [-1, 1]] - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - expected = df.ix[[18, 18, 2, 6], [6, 2]] + expected = df.loc[[18, 18, 2, 6], [6, 2]] tm.assert_frame_equal(result, expected) # with index-like - s = Series(index=range(1, 5)) + s = Series(index=range(1, 5), dtype=object) result = df.iloc[s.index] - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - expected = df.ix[[2, 4, 6, 8]] + expected = df.loc[[2, 4, 6, 8]] tm.assert_frame_equal(result, expected) def test_iloc_getitem_labelled_frame(self): @@ -750,20 +633,8 @@ def test_iloc_non_unique_indexing(self): df2 = DataFrame({"A": [0.1] * 1000, "B": [1] * 1000}) df2 = concat([df2, 2 * df2, 3 * df2]) - sidx = df2.index.to_series() - expected = df2.iloc[idx[idx <= sidx.max()]] - - new_list = [] - for r, s in expected.iterrows(): - new_list.append(s) - new_list.append(s * 2) - 
new_list.append(s * 3) - - expected = DataFrame(new_list) - expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])], sort=True) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df2.loc[idx] - tm.assert_frame_equal(result, expected, check_index_type=False) + with pytest.raises(KeyError, match="with any missing labels"): + df2.loc[idx] def test_iloc_empty_list_indexer_is_ok(self): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d611dc5497cca..448a06070c45c 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -7,18 +7,17 @@ import numpy as np import pytest -from pandas.compat import PY36 from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype import pandas as pd from pandas import DataFrame, Index, NaT, Series +import pandas._testing as tm from pandas.core.generic import NDFrame from pandas.core.indexers import validate_indices from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice from pandas.tests.indexing.common import Base, _mklbl -import pandas.util.testing as tm # ------------------------------------------------------------------------ # Indexing test cases @@ -84,12 +83,9 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): msg = ( r"Buffer has wrong number of dimensions \(expected 1," r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" "Cannot index with multidimensional key|" r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]|" - "No matching signature found|" # TypeError - "unhashable type: 'numpy.ndarray'" # TypeError + "Index data must be 1-dimensional" ) if ( @@ -105,21 +101,12 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): "categorical", ] ): - idxr[nd3] - else: - if ( - isinstance(obj, DataFrame) - and idxr_id == "getitem" - and index.inferred_type == "boolean" - ): - error = TypeError - elif idxr_id == "getitem" and index.inferred_type == "interval": - error = TypeError - else: - error = ValueError - - with pytest.raises(error, match=msg): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): idxr[nd3] + else: + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(DeprecationWarning): + idxr[nd3] @pytest.mark.parametrize( "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ @@ -149,14 +136,12 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): msg = ( r"Buffer has wrong number of dimensions \(expected 1," r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" - "Only 1-dimensional input arrays are supported|" - "'pandas._libs.interval.IntervalTree' object has no attribute" - " 'set_value'|" # AttributeError + "'pandas._libs.interval.IntervalTree' object has no attribute " + "'set_value'|" # AttributeError "unhashable type: 'numpy.ndarray'|" # TypeError "No matching signature found|" # TypeError - r"^\[\[\[" # pandas.core.indexing.IndexingError + r"^\[\[\[|" # pandas.core.indexing.IndexingError + "Index data must be 1-dimensional" ) if (idxr_id == "iloc") or ( @@ -177,10 +162,8 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): ): idxr[nd3] = 0 else: - with pytest.raises( - (ValueError, AttributeError, TypeError, pd.core.indexing.IndexingError), - match=msg, - ): + err = (ValueError, AttributeError) + with pytest.raises(err, match=msg): idxr[nd3] = 0 
def test_inf_upcast(self): @@ -222,7 +205,7 @@ def test_setitem_dtype_upcast(self): expected = DataFrame( [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}] ) - tm.assert_frame_equal(df, expected, check_like=not PY36) + tm.assert_frame_equal(df, expected) # GH10280 df = DataFrame( @@ -300,32 +283,13 @@ def test_dups_fancy_indexing(self): tm.assert_frame_equal(result, expected) rows = ["C", "B", "E"] - expected = DataFrame( - { - "test": [11, 9, np.nan], - "test1": [7.0, 6, np.nan], - "other": ["d", "c", np.nan], - }, - index=rows, - ) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[rows] - tm.assert_frame_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[rows] # see GH5553, make sure we use the right indexer rows = ["F", "G", "H", "C", "B", "E"] - expected = DataFrame( - { - "test": [np.nan, np.nan, np.nan, 11, 9, np.nan], - "test1": [np.nan, np.nan, np.nan, 7.0, 6, np.nan], - "other": [np.nan, np.nan, np.nan, "d", "c", np.nan], - }, - index=rows, - ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[rows] - tm.assert_frame_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[rows] # List containing only missing label dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD")) @@ -341,38 +305,25 @@ def test_dups_fancy_indexing(self): # GH 4619; duplicate indexer with missing label df = DataFrame({"A": [0, 1, 2]}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[[0, 8, 0]] - expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) - tm.assert_frame_equal(result, expected, check_index_type=False) + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[[0, 8, 0]] df = DataFrame({"A": list("abc")}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[[0, 8, 0]] - expected = DataFrame({"A": ["a", np.nan, "a"]}, index=[0, 8, 0]) - tm.assert_frame_equal(result, expected, check_index_type=False) + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[[0, 8, 0]] # non unique with non unique selector df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"]) - expected = DataFrame( - {"test": [5, 7, 5, 7, np.nan]}, index=["A", "A", "A", "A", "E"] - ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[["A", "A", "E"]] - tm.assert_frame_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[["A", "A", "E"]] def test_dups_fancy_indexing2(self): # GH 5835 # dups on index and missing values df = DataFrame(np.random.randn(5, 5), columns=["A", "B", "B", "B", "A"]) - expected = pd.concat( - [df.loc[:, ["A", "B"]], DataFrame(np.nan, columns=["C"], index=df.index)], - axis=1, - ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[:, ["A", "B", "C"]] - tm.assert_frame_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + df.loc[:, ["A", "B", "C"]] # GH 6504, multi-axis indexing df = DataFrame( @@ -591,12 +542,12 @@ class TO: def __init__(self, value): self.value = value - def __str__(self): + def __str__(self) -> str: return "[{0}]".format(self.value) __repr__ = __str__ - def __eq__(self, other): + def __eq__(self, other) -> bool: return self.value == other.value def view(self): @@ -928,7 +879,7 @@ def 
test_range_in_series_indexing(self): # range can cause an indexing error # GH 11652 for x in [5, 999999, 1000000]: - s = Series(index=range(x)) + s = Series(index=range(x), dtype=np.float64) s.loc[range(1)] = 42 tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) @@ -1172,12 +1123,12 @@ def test_extension_array_cross_section_converts(): ( lambda x: x.loc, AttributeError, - "type object 'NDFrame' has no attribute '_AXIS_ALIASES'", + "type object 'NDFrame' has no attribute '_AXIS_NAMES'", ), ( lambda x: x.iloc, AttributeError, - "type object 'NDFrame' has no attribute '_AXIS_ALIASES'", + "type object 'NDFrame' has no attribute '_AXIS_NAMES'", ), ], ) @@ -1210,3 +1161,26 @@ def test_1tuple_without_multiindex(): result = ser[key] expected = ser[key[0]] tm.assert_series_equal(result, expected) + + +def test_duplicate_index_mistyped_key_raises_keyerror(): + # GH#29189 float_index.get_loc(None) should raise KeyError, not TypeError + ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) + with pytest.raises(KeyError): + ser[None] + + with pytest.raises(KeyError): + ser.index.get_loc(None) + + with pytest.raises(KeyError): + ser.index._engine.get_loc(None) + + +def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(): + # GH 30567 + ser = pd.Series([None] * 10) + mask = [False] * 3 + [True] * 5 + [False] * 2 + ser[mask] = range(5) + result = ser + expected = pd.Series([None] * 3 + list(range(5)) + [None] * 2).astype("object") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing_engines.py b/pandas/tests/indexing/test_indexing_engines.py index 7303c1ff3d111..edb5d7d7f3a57 100644 --- a/pandas/tests/indexing/test_indexing_engines.py +++ b/pandas/tests/indexing/test_indexing_engines.py @@ -2,7 +2,7 @@ from pandas._libs import algos as libalgos, index as libindex -import pandas.util.testing as tm +import pandas._testing as tm class TestNumericEngine: diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py index bf8c6afd00561..2ffa44bec14a6 100644 --- a/pandas/tests/indexing/test_indexing_slow.py +++ b/pandas/tests/indexing/test_indexing_slow.py @@ -1,7 +1,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm class TestIndexingSlow: diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py deleted file mode 100644 index a46cd65162f4e..0000000000000 --- a/pandas/tests/indexing/test_ix.py +++ /dev/null @@ -1,354 +0,0 @@ -""" test indexing with ix """ - -from warnings import catch_warnings - -import numpy as np -import pytest - -from pandas.core.dtypes.common import is_scalar - -import pandas as pd -from pandas import DataFrame, Series, option_context -import pandas.util.testing as tm - - -def test_ix_deprecation(): - # GH 15114 - - df = DataFrame({"A": [1, 2, 3]}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - df.ix[1, "A"] - - -@pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") -class TestIX: - def test_ix_loc_setitem_consistency(self): - - # GH 5771 - # loc with slice and series - s = Series(0, index=[4, 5, 6]) - s.loc[4:5] += 1 - expected = Series([1, 1, 0], index=[4, 5, 6]) - tm.assert_series_equal(s, expected) - - # GH 5928 - # chained indexing assignment - df = DataFrame({"a": [0, 1, 2]}) - expected = df.copy() - with catch_warnings(record=True): - expected.ix[[0, 1, 2], "a"] = -expected.ix[[0, 1, 2], "a"] - - with catch_warnings(record=True): - df["a"].ix[[0, 1, 2]] = 
-df["a"].ix[[0, 1, 2]] - tm.assert_frame_equal(df, expected) - - df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]}) - with catch_warnings(record=True): - df["a"].ix[[0, 1, 2]] = -df["a"].ix[[0, 1, 2]].astype("float64") + 0.5 - expected = DataFrame({"a": [0.5, -0.5, -1.5], "b": [0, 1, 2]}) - tm.assert_frame_equal(df, expected) - - # GH 8607 - # ix setitem consistency - df = DataFrame( - { - "delta": [1174, 904, 161], - "elapsed": [7673, 9277, 1470], - "timestamp": [1413840976, 1413842580, 1413760580], - } - ) - expected = DataFrame( - { - "delta": [1174, 904, 161], - "elapsed": [7673, 9277, 1470], - "timestamp": pd.to_datetime( - [1413840976, 1413842580, 1413760580], unit="s" - ), - } - ) - - df2 = df.copy() - df2["timestamp"] = pd.to_datetime(df["timestamp"], unit="s") - tm.assert_frame_equal(df2, expected) - - df2 = df.copy() - df2.loc[:, "timestamp"] = pd.to_datetime(df["timestamp"], unit="s") - tm.assert_frame_equal(df2, expected) - - df2 = df.copy() - with catch_warnings(record=True): - df2.ix[:, 2] = pd.to_datetime(df["timestamp"], unit="s") - tm.assert_frame_equal(df2, expected) - - def test_ix_loc_consistency(self): - - # GH 8613 - # some edge cases where ix/loc should return the same - # this is not an exhaustive case - - def compare(result, expected): - if is_scalar(expected): - assert result == expected - else: - assert expected.equals(result) - - # failure cases for .loc, but these work for .ix - df = DataFrame(np.random.randn(5, 4), columns=list("ABCD")) - for key in [ - slice(1, 3), - tuple([slice(0, 2), slice(0, 2)]), - tuple([slice(0, 2), df.columns[0:2]]), - ]: - - for index in [ - tm.makeStringIndex, - tm.makeUnicodeIndex, - tm.makeDateIndex, - tm.makePeriodIndex, - tm.makeTimedeltaIndex, - ]: - df.index = index(len(df.index)) - with catch_warnings(record=True): - df.ix[key] - - msg = ( - r"cannot do slice indexing" - r" on {klass} with these indexers \[(0|1)\] of" - r" {kind}".format(klass=type(df.index), kind=str(int)) - ) - with pytest.raises(TypeError, match=msg): - df.loc[key] - - df = DataFrame( - np.random.randn(5, 4), - columns=list("ABCD"), - index=pd.date_range("2012-01-01", periods=5), - ) - - for key in [ - "2012-01-03", - "2012-01-31", - slice("2012-01-03", "2012-01-03"), - slice("2012-01-03", "2012-01-04"), - slice("2012-01-03", "2012-01-06", 2), - slice("2012-01-03", "2012-01-31"), - tuple([[True, True, True, False, True]]), - ]: - - # getitem - - # if the expected raises, then compare the exceptions - try: - with catch_warnings(record=True): - expected = df.ix[key] - except KeyError: - with pytest.raises(KeyError, match=r"^'2012-01-31'$"): - df.loc[key] - continue - - result = df.loc[key] - compare(result, expected) - - # setitem - df1 = df.copy() - df2 = df.copy() - - with catch_warnings(record=True): - df1.ix[key] = 10 - df2.loc[key] = 10 - compare(df2, df1) - - # edge cases - s = Series([1, 2, 3, 4], index=list("abde")) - - result1 = s["a":"c"] - with catch_warnings(record=True): - result2 = s.ix["a":"c"] - result3 = s.loc["a":"c"] - tm.assert_series_equal(result1, result2) - tm.assert_series_equal(result1, result3) - - # now work rather than raising KeyError - s = Series(range(5), [-2, -1, 1, 2, 3]) - - with catch_warnings(record=True): - result1 = s.ix[-10:3] - result2 = s.loc[-10:3] - tm.assert_series_equal(result1, result2) - - with catch_warnings(record=True): - result1 = s.ix[0:3] - result2 = s.loc[0:3] - tm.assert_series_equal(result1, result2) - - def test_ix_weird_slicing(self): - # http://stackoverflow.com/q/17056560/1240268 - df = 
DataFrame({"one": [1, 2, 3, np.nan, np.nan], "two": [1, 2, 3, 4, 5]}) - df.loc[df["one"] > 1, "two"] = -df["two"] - - expected = DataFrame( - { - "one": {0: 1.0, 1: 2.0, 2: 3.0, 3: np.nan, 4: np.nan}, - "two": {0: 1, 1: -2, 2: -3, 3: 4, 4: 5}, - } - ) - tm.assert_frame_equal(df, expected) - - def test_ix_assign_column_mixed(self, float_frame): - # GH #1142 - df = float_frame - df["foo"] = "bar" - - orig = df.loc[:, "B"].copy() - df.loc[:, "B"] = df.loc[:, "B"] + 1 - tm.assert_series_equal(df.B, orig + 1) - - # GH 3668, mixed frame with series value - df = DataFrame({"x": np.arange(10), "y": np.arange(10, 20), "z": "bar"}) - expected = df.copy() - - for i in range(5): - indexer = i * 2 - v = 1000 + i * 200 - expected.loc[indexer, "y"] = v - assert expected.loc[indexer, "y"] == v - - df.loc[df.x % 2 == 0, "y"] = df.loc[df.x % 2 == 0, "y"] * 100 - tm.assert_frame_equal(df, expected) - - # GH 4508, making sure consistency of assignments - df = DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]}) - df.loc[[0, 2], "b"] = [100, -100] - expected = DataFrame({"a": [1, 2, 3], "b": [100, 1, -100]}) - tm.assert_frame_equal(df, expected) - - df = DataFrame({"a": list(range(4))}) - df["b"] = np.nan - df.loc[[1, 3], "b"] = [100, -100] - expected = DataFrame({"a": [0, 1, 2, 3], "b": [np.nan, 100, np.nan, -100]}) - tm.assert_frame_equal(df, expected) - - # ok, but chained assignments are dangerous - # if we turn off chained assignment it will work - with option_context("chained_assignment", None): - df = DataFrame({"a": list(range(4))}) - df["b"] = np.nan - df["b"].loc[[1, 3]] = [100, -100] - tm.assert_frame_equal(df, expected) - - def test_ix_get_set_consistency(self): - - # GH 4544 - # ix/loc get/set not consistent when - # a mixed int/string index - df = DataFrame( - np.arange(16).reshape((4, 4)), - columns=["a", "b", 8, "c"], - index=["e", 7, "f", "g"], - ) - - with catch_warnings(record=True): - assert df.ix["e", 8] == 2 - assert df.loc["e", 8] == 2 - - with catch_warnings(record=True): - df.ix["e", 8] = 42 - assert df.ix["e", 8] == 42 - assert df.loc["e", 8] == 42 - - df.loc["e", 8] = 45 - with catch_warnings(record=True): - assert df.ix["e", 8] == 45 - assert df.loc["e", 8] == 45 - - def test_ix_slicing_strings(self): - # see gh-3836 - data = { - "Classification": ["SA EQUITY CFD", "bbb", "SA EQUITY", "SA SSF", "aaa"], - "Random": [1, 2, 3, 4, 5], - "X": ["correct", "wrong", "correct", "correct", "wrong"], - } - df = DataFrame(data) - x = df[~df.Classification.isin(["SA EQUITY CFD", "SA EQUITY", "SA SSF"])] - with catch_warnings(record=True): - df.ix[x.index, "X"] = df["Classification"] - - expected = DataFrame( - { - "Classification": { - 0: "SA EQUITY CFD", - 1: "bbb", - 2: "SA EQUITY", - 3: "SA SSF", - 4: "aaa", - }, - "Random": {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, - "X": {0: "correct", 1: "bbb", 2: "correct", 3: "correct", 4: "aaa"}, - } - ) # bug was 4: 'bbb' - - tm.assert_frame_equal(df, expected) - - def test_ix_setitem_out_of_bounds_axis_0(self): - df = DataFrame( - np.random.randn(2, 5), - index=["row{i}".format(i=i) for i in range(2)], - columns=["col{i}".format(i=i) for i in range(5)], - ) - with catch_warnings(record=True): - msg = "cannot set by positional indexing with enlargement" - with pytest.raises(ValueError, match=msg): - df.ix[2, 0] = 100 - - def test_ix_setitem_out_of_bounds_axis_1(self): - df = DataFrame( - np.random.randn(5, 2), - index=["row{i}".format(i=i) for i in range(5)], - columns=["col{i}".format(i=i) for i in range(2)], - ) - with catch_warnings(record=True): - msg = "cannot set 
by positional indexing with enlargement" - with pytest.raises(ValueError, match=msg): - df.ix[0, 2] = 100 - - def test_ix_empty_list_indexer_is_ok(self): - with catch_warnings(record=True): - - df = tm.makeCustomDataframe(5, 2) - # vertical empty - tm.assert_frame_equal( - df.ix[:, []], - df.iloc[:, :0], - check_index_type=True, - check_column_type=True, - ) - # horizontal empty - tm.assert_frame_equal( - df.ix[[], :], - df.iloc[:0, :], - check_index_type=True, - check_column_type=True, - ) - # horizontal empty - tm.assert_frame_equal( - df.ix[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True - ) - - def test_ix_duplicate_returns_series(self): - df = DataFrame( - np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") - ) - with catch_warnings(record=True): - r = df.ix[0.2, "a"] - e = df.loc[0.2, "a"] - tm.assert_series_equal(r, e) - - def test_ix_intervalindex(self): - # https://github.com/pandas-dev/pandas/issues/27865 - df = DataFrame( - np.random.randn(5, 2), - index=pd.IntervalIndex.from_breaks([-np.inf, 0, 1, 2, 3, np.inf]), - ) - result = df.ix[0:2, 0] - expected = df.iloc[0:2, 0] - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 732914b3b8947..a36078b11c663 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,16 +1,15 @@ """ test label based indexing with loc """ from io import StringIO import re -from warnings import catch_warnings, filterwarnings import numpy as np import pytest import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range +import pandas._testing as tm from pandas.api.types import is_scalar from pandas.tests.indexing.common import Base -import pandas.util.testing as tm class TestLoc(Base): @@ -96,185 +95,90 @@ def test_loc_setitem_slice(self): def test_loc_getitem_int(self): # int label - self.check_result( - "int label", "loc", 2, "ix", 2, typs=["ints", "uints"], axes=0 - ) - self.check_result( - "int label", "loc", 3, "ix", 3, typs=["ints", "uints"], axes=1 - ) - self.check_result( - "int label", "loc", 2, "ix", 2, typs=["label"], fails=KeyError - ) + self.check_result("loc", 2, "loc", 2, typs=["label"], fails=KeyError) def test_loc_getitem_label(self): # label - self.check_result("label", "loc", "c", "ix", "c", typs=["labels"], axes=0) - self.check_result("label", "loc", "null", "ix", "null", typs=["mixed"], axes=0) - self.check_result("label", "loc", 8, "ix", 8, typs=["mixed"], axes=0) - self.check_result( - "label", "loc", Timestamp("20130102"), "ix", 1, typs=["ts"], axes=0 - ) - self.check_result( - "label", "loc", "c", "ix", "c", typs=["empty"], fails=KeyError - ) + self.check_result("loc", "c", "loc", "c", typs=["empty"], fails=KeyError) def test_loc_getitem_label_out_of_range(self): # out of range label self.check_result( - "label range", "loc", "f", - "ix", + "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError, ) + self.check_result("loc", "f", "ix", "f", typs=["floats"], fails=KeyError) + self.check_result("loc", "f", "loc", "f", typs=["floats"], fails=KeyError) self.check_result( - "label range", "loc", "f", "ix", "f", typs=["floats"], fails=KeyError - ) - self.check_result( - "label range", - "loc", - 20, - "ix", - 20, - typs=["ints", "uints", "mixed"], - fails=KeyError, - ) - self.check_result( - "label range", "loc", 20, "ix", 20, typs=["labels"], fails=TypeError - ) - self.check_result( - "label range", "loc", 20, "ix", 20, typs=["ts"], axes=0, fails=TypeError 
- ) - self.check_result( - "label range", "loc", 20, "ix", 20, typs=["floats"], axes=0, fails=KeyError + "loc", 20, "loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError, ) + self.check_result("loc", 20, "loc", 20, typs=["labels"], fails=TypeError) + self.check_result("loc", 20, "loc", 20, typs=["ts"], axes=0, fails=TypeError) + self.check_result("loc", 20, "loc", 20, typs=["floats"], axes=0, fails=KeyError) def test_loc_getitem_label_list(self): - + # TODO: test something here? # list of labels + pass + + def test_loc_getitem_label_list_with_missing(self): self.check_result( - "list lbl", - "loc", - [0, 2, 4], - "ix", - [0, 2, 4], - typs=["ints", "uints"], - axes=0, - ) - self.check_result( - "list lbl", - "loc", - [3, 6, 9], - "ix", - [3, 6, 9], - typs=["ints", "uints"], - axes=1, + "loc", [0, 1, 2], "loc", [0, 1, 2], typs=["empty"], fails=KeyError, ) self.check_result( - "list lbl", "loc", - ["a", "b", "d"], + [0, 2, 10], "ix", - ["a", "b", "d"], - typs=["labels"], + [0, 2, 10], + typs=["ints", "uints", "floats"], axes=0, + fails=KeyError, ) + self.check_result( - "list lbl", "loc", - ["A", "B", "C"], + [3, 6, 7], "ix", - ["A", "B", "C"], - typs=["labels"], + [3, 6, 7], + typs=["ints", "uints", "floats"], axes=1, + fails=KeyError, ) + + # GH 17758 - MultiIndex and missing keys self.check_result( - "list lbl", - "loc", - [2, 8, "null"], - "ix", - [2, 8, "null"], - typs=["mixed"], - axes=0, - ) - self.check_result( - "list lbl", "loc", - [Timestamp("20130102"), Timestamp("20130103")], + [(1, 3), (1, 4), (2, 5)], "ix", - [Timestamp("20130102"), Timestamp("20130103")], - typs=["ts"], + [(1, 3), (1, 4), (2, 5)], + typs=["multi"], axes=0, - ) - - def test_loc_getitem_label_list_with_missing(self): - self.check_result( - "list lbl", - "loc", - [0, 1, 2], - "indexer", - [0, 1, 2], - typs=["empty"], fails=KeyError, ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result( - "list lbl", - "loc", - [0, 2, 10], - "ix", - [0, 2, 10], - typs=["ints", "uints", "floats"], - axes=0, - fails=KeyError, - ) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result( - "list lbl", - "loc", - [3, 6, 7], - "ix", - [3, 6, 7], - typs=["ints", "uints", "floats"], - axes=1, - fails=KeyError, - ) - - # GH 17758 - MultiIndex and missing keys - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result( - "list lbl", - "loc", - [(1, 3), (1, 4), (2, 5)], - "ix", - [(1, 3), (1, 4), (2, 5)], - typs=["multi"], - axes=0, - ) def test_getitem_label_list_with_missing(self): s = Series(range(3), index=["a", "b", "c"]) # consistency - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(KeyError, match="with any missing labels"): s[["a", "d"]] s = Series(range(3)) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(KeyError, match="with any missing labels"): s[[0, 3]] def test_loc_getitem_label_list_fails(self): # fails self.check_result( - "list lbl", "loc", [20, 30, 40], - "ix", + "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, @@ -282,38 +186,15 @@ def test_loc_getitem_label_list_fails(self): ) def test_loc_getitem_label_array_like(self): + # TODO: test something? 
# array like - self.check_result( - "array like", - "loc", - Series(index=[0, 2, 4]).index, - "ix", - [0, 2, 4], - typs=["ints", "uints"], - axes=0, - ) - self.check_result( - "array like", - "loc", - Series(index=[3, 6, 9]).index, - "ix", - [3, 6, 9], - typs=["ints", "uints"], - axes=1, - ) + pass def test_loc_getitem_bool(self): # boolean indexers b = [True, False, True, False] - self.check_result( - "bool", - "loc", - b, - "ix", - b, - typs=["ints", "uints", "labels", "mixed", "ts", "floats"], - ) - self.check_result("bool", "loc", b, "ix", b, typs=["empty"], fails=IndexError) + + self.check_result("loc", b, "loc", b, typs=["empty"], fails=IndexError) @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]]) def test_loc_getitem_bool_diff_len(self, index): @@ -326,26 +207,8 @@ def test_loc_getitem_bool_diff_len(self, index): _ = s.loc[index] def test_loc_getitem_int_slice(self): - - # ok - self.check_result( - "int slice2", - "loc", - slice(2, 4), - "ix", - [2, 4], - typs=["ints", "uints"], - axes=0, - ) - self.check_result( - "int slice2", - "loc", - slice(3, 6), - "ix", - [3, 6], - typs=["ints", "uints"], - axes=1, - ) + # TODO: test something here? + pass def test_loc_to_fail(self): @@ -365,7 +228,7 @@ def test_loc_to_fail(self): # GH 7496 # loc should not fallback - s = Series() + s = Series(dtype=object) s.loc[1] = 1 s.loc["a"] = 2 @@ -379,17 +242,13 @@ def test_loc_to_fail(self): with pytest.raises(KeyError, match=msg): s.loc[[-1, -2]] - msg = ( - r"\"None of \[Index\(\['4'\], dtype='object'\)\] are" r" in the \[index\]\"" - ) + msg = r"\"None of \[Index\(\['4'\], dtype='object'\)\] are in the \[index\]\"" with pytest.raises(KeyError, match=msg): s.loc[["4"]] s.loc[-1] = 3 - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s.loc[[-1, -2]] - expected = Series([3, np.nan], index=[-1, -2]) - tm.assert_series_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[-1, -2]] s["a"] = 2 msg = ( @@ -435,90 +294,49 @@ def test_loc_getitem_list_with_fail(self): s.loc[[3]] # a non-match and a match - with tm.assert_produces_warning(FutureWarning): - expected = s.loc[[2, 3]] - result = s.reindex([2, 3]) - tm.assert_series_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[2, 3]] def test_loc_getitem_label_slice(self): # label slices (with ints) + + # real label slices + + # GH 14316 + self.check_result( - "lab slice", "loc", slice(1, 3), - "ix", + "loc", slice(1, 3), typs=["labels", "mixed", "empty", "ts", "floats"], fails=TypeError, ) - # real label slices self.check_result( - "lab slice", - "loc", - slice("a", "c"), - "ix", - slice("a", "c"), - typs=["labels"], - axes=0, - ) - self.check_result( - "lab slice", - "loc", - slice("A", "C"), - "ix", - slice("A", "C"), - typs=["labels"], - axes=1, - ) - - self.check_result( - "ts slice", "loc", slice("20130102", "20130104"), - "ix", - slice("20130102", "20130104"), - typs=["ts"], - axes=0, - ) - self.check_result( - "ts slice", "loc", slice("20130102", "20130104"), - "ix", - slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError, ) - # GH 14316 - self.check_result( - "ts slice rev", - "loc", - slice("20130104", "20130102"), - "indexer", - [0, 1, 2], - typs=["ts_rev"], - axes=0, - ) - self.check_result( - "mixed slice", "loc", slice(2, 8), - "ix", + "loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError, ) self.check_result( - "mixed slice", "loc", slice(2, 8), - "ix", + "loc", slice(2, 
8), typs=["mixed"], axes=1, @@ -526,10 +344,9 @@ def test_loc_getitem_label_slice(self): ) self.check_result( - "mixed slice", "loc", slice(2, 4, 2), - "ix", + "loc", slice(2, 4, 2), typs=["mixed"], axes=0, @@ -554,6 +371,9 @@ def test_loc_index(self): result = df.loc[mask.values] tm.assert_frame_equal(result, expected) + result = df.loc[pd.array(mask, dtype="boolean")] + tm.assert_frame_equal(result, expected) + def test_loc_general(self): df = DataFrame( @@ -900,13 +720,13 @@ def test_setitem_new_key_tz(self): ] expected = pd.Series(vals, index=["foo", "bar"]) - ser = pd.Series() + ser = pd.Series(dtype=object) ser["foo"] = vals[0] ser["bar"] = vals[1] tm.assert_series_equal(ser, expected) - ser = pd.Series() + ser = pd.Series(dtype=object) ser.loc["foo"] = vals[0] ser.loc["bar"] = vals[1] @@ -1004,11 +824,6 @@ def test_loc_name(self): result = df.iloc[[0, 1]].index.name assert result == "index_name" - with catch_warnings(record=True): - filterwarnings("ignore", "\\n.ix", FutureWarning) - result = df.ix[[0, 1]].index.name - assert result == "index_name" - result = df.loc[[0, 1]].index.name assert result == "index_name" @@ -1122,7 +937,7 @@ def test_loc_reverse_assignment(self): data = [1, 2, 3, 4, 5, 6] + [None] * 4 expected = Series(data, index=range(2010, 2020)) - result = pd.Series(index=range(2010, 2020)) + result = pd.Series(index=range(2010, 2020), dtype=np.float64) result.loc[2015:2010:-1] = [6, 5, 4, 3, 2, 1] tm.assert_series_equal(result, expected) @@ -1134,10 +949,8 @@ def test_series_loc_getitem_label_list_missing_values(): ["2001-01-04", "2001-01-02", "2001-01-04", "2001-01-14"], dtype="datetime64" ) s = Series([2, 5, 8, 11], date_range("2001-01-01", freq="D", periods=4)) - expected = Series([11.0, 5.0, 11.0, np.nan], index=key) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s.loc[key] - tm.assert_series_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[key] @pytest.mark.parametrize( @@ -1156,3 +969,36 @@ def test_loc_getitem_label_list_integer_labels( expected = df.iloc[:, expected_columns] result = df.loc[["A", "B", "C"], column_key] tm.assert_frame_equal(result, expected, check_column_type=check_column_type) + + +def test_loc_setitem_float_intindex(): + # GH 8720 + rand_data = np.random.randn(8, 4) + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + expected_data = np.hstack((rand_data, np.array([np.nan] * 8).reshape(8, 1))) + expected = pd.DataFrame(expected_data, columns=[0.0, 1.0, 2.0, 3.0, 0.5]) + tm.assert_frame_equal(result, expected) + + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + tm.assert_frame_equal(result, expected) + + +def test_loc_axis_1_slice(): + # GH 10586 + cols = [(yr, m) for yr in [2014, 2015] for m in [7, 8, 9, 10]] + df = pd.DataFrame( + np.ones((10, 8)), + index=tuple("ABCDEFGHIJ"), + columns=pd.MultiIndex.from_tuples(cols), + ) + result = df.loc(axis=1)[(2014, 9):(2015, 8)] + expected = pd.DataFrame( + np.ones((10, 4)), + index=tuple("ABCDEFGHIJ"), + columns=pd.MultiIndex.from_tuples( + [(2014, 9), (2014, 10), (2015, 7), (2015, 8)] + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py new file mode 100644 index 0000000000000..befe4fee8ecf8 --- /dev/null +++ b/pandas/tests/indexing/test_na_indexing.py @@ -0,0 +1,79 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "values, dtype", + [ 
+ ([1, 2, 3], "int64"), + ([1.0, 2.0, 3.0], "float64"), + (["a", "b", "c"], "object"), + (["a", "b", "c"], "string"), + ([1, 2, 3], "datetime64[ns]"), + ([1, 2, 3], "datetime64[ns, CET]"), + ([1, 2, 3], "timedelta64[ns]"), + (["2000", "2001", "2002"], "Period[D]"), + ([1, 0, 3], "Sparse"), + ([pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(3, 4)], "interval"), + ], +) +@pytest.mark.parametrize( + "mask", [[True, False, False], [True, True, True], [False, False, False]] +) +@pytest.mark.parametrize("box_mask", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_series_mask_boolean(values, dtype, mask, box_mask, frame): + ser = pd.Series(values, dtype=dtype, index=["a", "b", "c"]) + if frame: + ser = ser.to_frame() + mask = pd.array(mask, dtype="boolean") + if box_mask: + mask = pd.Series(mask, index=ser.index) + + expected = ser[mask.astype("bool")] + + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + # empty + mask = mask[:0] + ser = ser.iloc[:0] + expected = ser[mask.astype("bool")] + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("frame", [True, False]) +def test_indexing_with_na_raises(frame): + s = pd.Series([1, 2, 3], name="name") + + if frame: + s = s.to_frame() + mask = pd.array([True, False, None], dtype="boolean") + match = "cannot mask with array containing NA / NaN values" + with pytest.raises(ValueError, match=match): + s[mask] + + with pytest.raises(ValueError, match=match): + s.loc[mask] + + with pytest.raises(ValueError, match=match): + s.iloc[mask] diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 0fb71bfea76c0..5fda759020f1a 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -1,21 +1,18 @@ """ test setting *parts* of objects both positionally and label based -TOD: these should be split among the indexer tests +TODO: these should be split among the indexer tests """ -from warnings import catch_warnings - import numpy as np import pytest import pandas as pd from pandas import DataFrame, Index, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPartialSetting: - @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") def test_partial_setting(self): # GH2578, allow ix and friends to partially set @@ -87,32 +84,28 @@ def test_partial_setting(self): # single dtype frame, overwrite expected = DataFrame(dict({"A": [0, 2, 4], "B": [0, 2, 4]})) df = df_orig.copy() - with catch_warnings(record=True): - df.ix[:, "B"] = df.ix[:, "A"] + df.loc[:, "B"] = df.loc[:, "A"] tm.assert_frame_equal(df, expected) # mixed dtype frame, overwrite expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0, 2, 4])})) df = df_orig.copy() df["B"] = df["B"].astype(np.float64) - with catch_warnings(record=True): - df.ix[:, "B"] = df.ix[:, "A"] + df.loc[:, "B"] = df.loc[:, "A"] tm.assert_frame_equal(df, expected) # single dtype frame, partial setting expected = df_orig.copy() expected["C"] = df["A"] df = df_orig.copy() - with catch_warnings(record=True): - df.ix[:, "C"] = df.ix[:, "A"] + df.loc[:, "C"] = df.loc[:, "A"] 
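
(Illustrative aside: the hunks above rewrite the removed `.ix` indexer in terms of `.loc` for label-based access and `.iloc` for positional access. A minimal sketch of that replacement pattern, using only the accessors these tests themselves use; the frame below is a made-up example, not data from the patch:)

    import pandas as pd

    df = pd.DataFrame({"A": [0, 2, 4], "B": [0, 2, 4]})

    # label-based column access, previously spelled df.ix[:, "B"]
    df.loc[:, "B"] = df.loc[:, "A"]

    # positional row access, previously spelled df.ix[0]
    first_row = df.iloc[0]
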
tm.assert_frame_equal(df, expected) # mixed frame, partial setting expected = df_orig.copy() expected["C"] = df["A"] df = df_orig.copy() - with catch_warnings(record=True): - df.ix[:, "C"] = df.ix[:, "A"] + df.loc[:, "C"] = df.loc[:, "A"] tm.assert_frame_equal(df, expected) # GH 8473 @@ -186,17 +179,15 @@ def test_series_partial_set(self): # loc equiv to .reindex expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(KeyError, match="with any missing labels"): result = ser.loc[[3, 2, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) result = ser.reindex([3, 2, 3]) tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, "x"]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(KeyError, match="with any missing labels"): result = ser.loc[[3, 2, 3, "x"]] - tm.assert_series_equal(result, expected, check_index_type=True) result = ser.reindex([3, 2, 3, "x"]) tm.assert_series_equal(result, expected, check_index_type=True) @@ -206,9 +197,8 @@ def test_series_partial_set(self): tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, "x", 1]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(KeyError, match="with any missing labels"): result = ser.loc[[2, 2, "x", 1]] - tm.assert_series_equal(result, expected, check_index_type=True) result = ser.reindex([2, 2, "x", 1]) tm.assert_series_equal(result, expected, check_index_type=True) @@ -222,54 +212,48 @@ def test_series_partial_set(self): ser.loc[[3, 3, 3]] expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[2, 2, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[2, 2, 3]] result = ser.reindex([2, 2, 3]) tm.assert_series_equal(result, expected, check_index_type=True) s = Series([0.1, 0.2, 0.3], index=[1, 2, 3]) expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s.loc[[3, 4, 4]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[3, 4, 4]] result = s.reindex([3, 4, 4]) tm.assert_series_equal(result, expected, check_index_type=True) s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s.loc[[5, 3, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[5, 3, 3]] result = s.reindex([5, 3, 3]) tm.assert_series_equal(result, expected, check_index_type=True) s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s.loc[[5, 4, 4]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[5, 4, 4]] result = s.reindex([5, 4, 4]) tm.assert_series_equal(result, expected, check_index_type=True) s = Series([0.1, 0.2, 0.3, 
0.4], index=[4, 5, 6, 7]) expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s.loc[[7, 2, 2]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[7, 2, 2]] result = s.reindex([7, 2, 2]) tm.assert_series_equal(result, expected, check_index_type=True) s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s.loc[[4, 5, 5]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[[4, 5, 5]] result = s.reindex([4, 5, 5]) tm.assert_series_equal(result, expected, check_index_type=True) @@ -286,28 +270,19 @@ def test_series_partial_set_with_name(self): ser = Series([0.1, 0.2], index=idx, name="s") # loc - exp_idx = Index([3, 2, 3], dtype="int64", name="idx") - expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name="s") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[3, 2, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[3, 2, 3]] - exp_idx = Index([3, 2, 3, "x"], dtype="object", name="idx") - expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, name="s") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[3, 2, 3, "x"]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[3, 2, 3, "x"]] exp_idx = Index([2, 2, 1], dtype="int64", name="idx") expected = Series([0.2, 0.2, 0.1], index=exp_idx, name="s") result = ser.loc[[2, 2, 1]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([2, 2, "x", 1], dtype="object", name="idx") - expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name="s") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[2, 2, "x", 1]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[2, 2, "x", 1]] # raises as nothing in in the index msg = ( @@ -317,46 +292,28 @@ def test_series_partial_set_with_name(self): with pytest.raises(KeyError, match=msg): ser.loc[[3, 3, 3]] - exp_idx = Index([2, 2, 3], dtype="int64", name="idx") - expected = Series([0.2, 0.2, np.nan], index=exp_idx, name="s") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[2, 2, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + ser.loc[[2, 2, 3]] - exp_idx = Index([3, 4, 4], dtype="int64", name="idx") - expected = Series([0.3, np.nan, np.nan], index=exp_idx, name="s") idx = Index([1, 2, 3], dtype="int64", name="idx") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3], index=idx, name="s").loc[[3, 4, 4]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3], index=idx, name="s").loc[[3, 4, 4]] - exp_idx = Index([5, 3, 3], dtype="int64", name="idx") - expected = Series([np.nan, 0.3, 0.3], 
index=exp_idx, name="s") idx = Index([1, 2, 3, 4], dtype="int64", name="idx") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 3, 3]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 3, 3]] - exp_idx = Index([5, 4, 4], dtype="int64", name="idx") - expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name="s") idx = Index([1, 2, 3, 4], dtype="int64", name="idx") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 4, 4]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 4, 4]] - exp_idx = Index([7, 2, 2], dtype="int64", name="idx") - expected = Series([0.4, np.nan, np.nan], index=exp_idx, name="s") idx = Index([4, 5, 6, 7], dtype="int64", name="idx") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[7, 2, 2]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[7, 2, 2]] - exp_idx = Index([4, 5, 5], dtype="int64", name="idx") - expected = Series([0.4, np.nan, np.nan], index=exp_idx, name="s") idx = Index([1, 2, 3, 4], dtype="int64", name="idx") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[4, 5, 5]] - tm.assert_series_equal(result, expected, check_index_type=True) + with pytest.raises(KeyError, match="with any missing labels"): + Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[4, 5, 5]] # iloc exp_idx = Index([2, 2, 1, 1], dtype="int64", name="idx") @@ -364,7 +321,6 @@ def test_series_partial_set_with_name(self): result = ser.iloc[[1, 1, 0, 0]] tm.assert_series_equal(result, expected, check_index_type=True) - @pytest.mark.filterwarnings("ignore:\\n.ix") def test_partial_set_invalid(self): # GH 4940 @@ -375,26 +331,15 @@ def test_partial_set_invalid(self): # don't allow not string inserts with pytest.raises(TypeError): - with catch_warnings(record=True): - df.loc[100.0, :] = df.ix[0] - - with pytest.raises(TypeError): - with catch_warnings(record=True): - df.loc[100, :] = df.ix[0] + df.loc[100.0, :] = df.iloc[0] with pytest.raises(TypeError): - with catch_warnings(record=True): - df.ix[100.0, :] = df.ix[0] - - with pytest.raises(ValueError): - with catch_warnings(record=True): - df.ix[100, :] = df.ix[0] + df.loc[100, :] = df.iloc[0] # allow object conversion here df = orig.copy() - with catch_warnings(record=True): - df.loc["a", :] = df.ix[0] - exp = orig.append(Series(df.ix[0], name="a")) + df.loc["a", :] = df.iloc[0] + exp = orig.append(Series(df.iloc[0], name="a")) tm.assert_frame_equal(df, exp) tm.assert_index_equal(df.index, Index(orig.index.tolist() + ["a"])) assert df.index.dtype == "object" @@ -404,19 +349,19 @@ def test_partial_set_empty_series(self): # GH5226 # partially set with an empty object series - s = Series() + s = Series(dtype=object) s.loc[1] = 1 tm.assert_series_equal(s, Series([1], index=[1])) s.loc[3] = 3 tm.assert_series_equal(s, Series([1, 3], index=[1, 3])) - s = Series() + s = 
Series(dtype=object) s.loc[1] = 1.0 tm.assert_series_equal(s, Series([1.0], index=[1])) s.loc[3] = 3.0 tm.assert_series_equal(s, Series([1.0, 3.0], index=[1, 3])) - s = Series() + s = Series(dtype=object) s.loc["foo"] = 1 tm.assert_series_equal(s, Series([1], index=["foo"])) s.loc["bar"] = 3 @@ -548,11 +493,11 @@ def test_partial_set_empty_frame_row(self): def test_partial_set_empty_frame_set_series(self): # GH 5756 # setting with empty Series - df = DataFrame(Series()) - tm.assert_frame_equal(df, DataFrame({0: Series()})) + df = DataFrame(Series(dtype=object)) + tm.assert_frame_equal(df, DataFrame({0: Series(dtype=object)})) - df = DataFrame(Series(name="foo")) - tm.assert_frame_equal(df, DataFrame({"foo": Series()})) + df = DataFrame(Series(name="foo", dtype=object)) + tm.assert_frame_equal(df, DataFrame({"foo": Series(dtype=object)})) def test_partial_set_empty_frame_empty_copy_assignment(self): # GH 5932 diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index b41b90cd9afd1..a567fb9b8ccc7 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -4,8 +4,8 @@ import pytest from pandas import DataFrame, Series, Timedelta, Timestamp, date_range +import pandas._testing as tm from pandas.tests.indexing.common import Base -import pandas.util.testing as tm class TestScalar(Base): @@ -16,7 +16,7 @@ def _check(f, func, values=False): indicies = self.generate_indices(f, values) for i in indicies: result = getattr(f, func)[i] - expected = self.get_value(f, i, values) + expected = self.get_value(func, f, i, values) tm.assert_almost_equal(result, expected) for kind in self._kinds: @@ -44,7 +44,7 @@ def _check(f, func, values=False): indicies = self.generate_indices(f, values) for i in indicies: getattr(f, func)[i] = 1 - expected = self.get_value(f, i, values) + expected = self.get_value(func, f, i, values) tm.assert_almost_equal(expected, 1) for kind in self._kinds: @@ -132,8 +132,8 @@ def test_at_to_fail(self): result = s.at["a"] assert result == 1 msg = ( - "At based indexing on an non-integer index can only have" - " non-integer indexers" + "At based indexing on an non-integer index can only have " + "non-integer indexers" ) with pytest.raises(ValueError, match=msg): s.at[0] diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index 5c9865ddc7090..dd4750123c0b5 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltaIndexing: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 16f14f35fdbae..15b1434f8629f 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1,10 +1,8 @@ from collections import OrderedDict from datetime import date, datetime -from distutils.version import LooseVersion import itertools import operator import re -import sys import numpy as np import pytest @@ -12,22 +10,11 @@ from pandas._libs.internals import BlockPlacement import pandas as pd -from pandas import ( - Categorical, - DataFrame, - DatetimeIndex, - Index, - MultiIndex, - Series, - SparseArray, -) +from pandas import Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series +import pandas._testing as tm import pandas.core.algorithms as algos -from pandas.core.arrays import DatetimeArray, TimedeltaArray +from 
pandas.core.arrays import DatetimeArray, SparseArray, TimedeltaArray from pandas.core.internals import BlockManager, SingleBlockManager, make_block -import pandas.util.testing as tm - -# in 3.6.1 a c-api slicing function changed, see src/compat_helper.h -PY361 = LooseVersion(sys.version) >= LooseVersion("3.6.1") @pytest.fixture @@ -139,7 +126,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): arr = values.sp_values.view() arr += num_offset - 1 else: - raise ValueError('Unsupported typestr: "%s"' % typestr) + raise ValueError(f'Unsupported typestr: "{typestr}"') return make_block(values, placement=placement, ndim=len(shape)) @@ -313,12 +300,6 @@ def test_delete(self): with pytest.raises(Exception): newb.delete(3) - def test_make_block_same_class(self): - # issue 19431 - block = create_block("M8[ns, US/Eastern]", [3]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - block.make_block_same_class(block.values, dtype=block.values.dtype) - class TestDatetimeBlock: def test_can_hold_element(self): @@ -1096,10 +1077,6 @@ def assert_as_slice_equals(arr, slc): assert_as_slice_equals([2, 1], slice(2, 0, -1)) - if not PY361: - assert_as_slice_equals([2, 1, 0], slice(2, None, -1)) - assert_as_slice_equals([100, 0], slice(100, None, -100)) - def test_not_slice_like_arrays(self): def assert_not_slice_like(arr): assert not BlockPlacement(arr).is_slice_like @@ -1119,10 +1096,6 @@ def test_slice_iter(self): assert list(BlockPlacement(slice(0, 0))) == [] assert list(BlockPlacement(slice(3, 0))) == [] - if not PY361: - assert list(BlockPlacement(slice(3, 0, -1))) == [3, 2, 1] - assert list(BlockPlacement(slice(3, None, -1))) == [3, 2, 1, 0] - def test_slice_to_array_conversion(self): def assert_as_array_equals(slc, asarray): tm.assert_numpy_array_equal( @@ -1135,10 +1108,6 @@ def assert_as_array_equals(slc, asarray): assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) - if not PY361: - assert_as_array_equals(slice(3, None, -1), [3, 2, 1, 0]) - assert_as_array_equals(slice(31, None, -10), [31, 21, 11, 1]) - def test_blockplacement_add(self): bpl = BlockPlacement(slice(0, 5)) assert bpl.add(1).as_slice == slice(1, 6, 1) @@ -1168,14 +1137,6 @@ def assert_add_equals(val, inc, result): with pytest.raises(ValueError): BlockPlacement([1, 2, 4]).add(-10) - if not PY361: - assert_add_equals(slice(3, 0, -1), -1, [2, 1, 0]) - assert_add_equals(slice(2, None, -1), 0, [2, 1, 0]) - assert_add_equals(slice(2, None, -1), 10, [12, 11, 10]) - - with pytest.raises(ValueError): - BlockPlacement(slice(2, None, -1)).add(-1) - class DummyElement: def __init__(self, value, dtype): @@ -1185,10 +1146,10 @@ def __init__(self, value, dtype): def __array__(self): return np.array(self.value, dtype=self.dtype) - def __str__(self): + def __str__(self) -> str: return "DummyElement({}, {})".format(self.value, self.dtype) - def __repr__(self): + def __repr__(self) -> str: return str(self) def astype(self, dtype, copy=False): @@ -1280,13 +1241,6 @@ def test_holder(typestr, holder): assert blk._holder is holder -def test_deprecated_fastpath(): - # GH#19265 - values = np.random.rand(3, 3) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - make_block(values, placement=np.arange(3), fastpath=True) - - def test_validate_ndim(): values = np.array([1.0, 2.0]) placement = slice(2) @@ -1306,7 +1260,7 @@ def test_block_shape(): def test_make_block_no_pandas_array(): # https://github.com/pandas-dev/pandas/pull/24866 - arr = pd.array([1, 2]) + arr = pd.arrays.PandasArray(np.array([1, 
2])) # PandasArray, no dtype result = make_block(arr, slice(len(arr))) @@ -1322,3 +1276,10 @@ def test_make_block_no_pandas_array(): result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype) assert result.is_integer is True assert result.is_extension is False + + +def test_dataframe_not_equal(): + # see GH28839 + df1 = pd.DataFrame({"a": [1, 2], "b": ["s", "d"]}) + df2 = pd.DataFrame({"a": ["s", "d"], "b": [1, 2]}) + assert df1.equals(df2) is False diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 7b6b9b6380a36..7810778602e12 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -2,7 +2,7 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import read_csv @@ -40,7 +40,7 @@ def s3_resource(tips_file, jsonl_file): A private bucket "cant_get_it" is also created. The boto3 s3 resource is yielded by the fixture. """ - pytest.importorskip("s3fs") + s3fs = pytest.importorskip("s3fs") boto3 = pytest.importorskip("boto3") with tm.ensure_safe_environment_variables(): @@ -77,6 +77,7 @@ def add_tips_files(bucket_name): conn.create_bucket(Bucket="cant_get_it", ACL="private") add_tips_files("cant_get_it") + s3fs.S3FileSystem.clear_instance_cache() yield conn finally: s3.stop() diff --git a/pandas/tests/io/data/banklist.csv b/pandas/tests/io/data/csv/banklist.csv similarity index 100% rename from pandas/tests/io/data/banklist.csv rename to pandas/tests/io/data/csv/banklist.csv diff --git a/pandas/tests/io/data/iris.csv b/pandas/tests/io/data/csv/iris.csv similarity index 100% rename from pandas/tests/io/data/iris.csv rename to pandas/tests/io/data/csv/iris.csv diff --git a/pandas/tests/io/data/test1.csv b/pandas/tests/io/data/csv/test1.csv similarity index 100% rename from pandas/tests/io/data/test1.csv rename to pandas/tests/io/data/csv/test1.csv diff --git a/pandas/tests/io/data/test_mmap.csv b/pandas/tests/io/data/csv/test_mmap.csv similarity index 100% rename from pandas/tests/io/data/test_mmap.csv rename to pandas/tests/io/data/csv/test_mmap.csv diff --git a/pandas/tests/io/data/tips.csv b/pandas/tests/io/data/csv/tips.csv similarity index 100% rename from pandas/tests/io/data/tips.csv rename to pandas/tests/io/data/csv/tips.csv diff --git a/pandas/tests/io/data/blank.ods b/pandas/tests/io/data/excel/blank.ods similarity index 100% rename from pandas/tests/io/data/blank.ods rename to pandas/tests/io/data/excel/blank.ods diff --git a/pandas/tests/io/data/blank.xls b/pandas/tests/io/data/excel/blank.xls similarity index 100% rename from pandas/tests/io/data/blank.xls rename to pandas/tests/io/data/excel/blank.xls diff --git a/pandas/tests/io/data/blank.xlsm b/pandas/tests/io/data/excel/blank.xlsm similarity index 100% rename from pandas/tests/io/data/blank.xlsm rename to pandas/tests/io/data/excel/blank.xlsm diff --git a/pandas/tests/io/data/blank.xlsx b/pandas/tests/io/data/excel/blank.xlsx similarity index 100% rename from pandas/tests/io/data/blank.xlsx rename to pandas/tests/io/data/excel/blank.xlsx diff --git a/pandas/tests/io/data/blank_with_header.ods b/pandas/tests/io/data/excel/blank_with_header.ods similarity index 100% rename from pandas/tests/io/data/blank_with_header.ods rename to pandas/tests/io/data/excel/blank_with_header.ods diff --git a/pandas/tests/io/data/blank_with_header.xls b/pandas/tests/io/data/excel/blank_with_header.xls similarity index 100% rename from pandas/tests/io/data/blank_with_header.xls rename to pandas/tests/io/data/excel/blank_with_header.xls diff 
--git a/pandas/tests/io/data/blank_with_header.xlsm b/pandas/tests/io/data/excel/blank_with_header.xlsm similarity index 100% rename from pandas/tests/io/data/blank_with_header.xlsm rename to pandas/tests/io/data/excel/blank_with_header.xlsm diff --git a/pandas/tests/io/data/blank_with_header.xlsx b/pandas/tests/io/data/excel/blank_with_header.xlsx similarity index 100% rename from pandas/tests/io/data/blank_with_header.xlsx rename to pandas/tests/io/data/excel/blank_with_header.xlsx diff --git a/pandas/tests/io/data/invalid_value_type.ods b/pandas/tests/io/data/excel/invalid_value_type.ods similarity index 100% rename from pandas/tests/io/data/invalid_value_type.ods rename to pandas/tests/io/data/excel/invalid_value_type.ods diff --git a/pandas/tests/io/data/test1.ods b/pandas/tests/io/data/excel/test1.ods similarity index 100% rename from pandas/tests/io/data/test1.ods rename to pandas/tests/io/data/excel/test1.ods diff --git a/pandas/tests/io/data/test1.xls b/pandas/tests/io/data/excel/test1.xls similarity index 100% rename from pandas/tests/io/data/test1.xls rename to pandas/tests/io/data/excel/test1.xls diff --git a/pandas/tests/io/data/test1.xlsm b/pandas/tests/io/data/excel/test1.xlsm similarity index 100% rename from pandas/tests/io/data/test1.xlsm rename to pandas/tests/io/data/excel/test1.xlsm diff --git a/pandas/tests/io/data/test1.xlsx b/pandas/tests/io/data/excel/test1.xlsx similarity index 100% rename from pandas/tests/io/data/test1.xlsx rename to pandas/tests/io/data/excel/test1.xlsx diff --git a/pandas/tests/io/data/test2.ods b/pandas/tests/io/data/excel/test2.ods similarity index 100% rename from pandas/tests/io/data/test2.ods rename to pandas/tests/io/data/excel/test2.ods diff --git a/pandas/tests/io/data/test2.xls b/pandas/tests/io/data/excel/test2.xls similarity index 100% rename from pandas/tests/io/data/test2.xls rename to pandas/tests/io/data/excel/test2.xls diff --git a/pandas/tests/io/data/test2.xlsm b/pandas/tests/io/data/excel/test2.xlsm similarity index 100% rename from pandas/tests/io/data/test2.xlsm rename to pandas/tests/io/data/excel/test2.xlsm diff --git a/pandas/tests/io/data/test2.xlsx b/pandas/tests/io/data/excel/test2.xlsx similarity index 100% rename from pandas/tests/io/data/test2.xlsx rename to pandas/tests/io/data/excel/test2.xlsx diff --git a/pandas/tests/io/data/test3.ods b/pandas/tests/io/data/excel/test3.ods similarity index 100% rename from pandas/tests/io/data/test3.ods rename to pandas/tests/io/data/excel/test3.ods diff --git a/pandas/tests/io/data/test3.xls b/pandas/tests/io/data/excel/test3.xls similarity index 100% rename from pandas/tests/io/data/test3.xls rename to pandas/tests/io/data/excel/test3.xls diff --git a/pandas/tests/io/data/test3.xlsm b/pandas/tests/io/data/excel/test3.xlsm similarity index 100% rename from pandas/tests/io/data/test3.xlsm rename to pandas/tests/io/data/excel/test3.xlsm diff --git a/pandas/tests/io/data/test3.xlsx b/pandas/tests/io/data/excel/test3.xlsx similarity index 100% rename from pandas/tests/io/data/test3.xlsx rename to pandas/tests/io/data/excel/test3.xlsx diff --git a/pandas/tests/io/data/test4.ods b/pandas/tests/io/data/excel/test4.ods similarity index 100% rename from pandas/tests/io/data/test4.ods rename to pandas/tests/io/data/excel/test4.ods diff --git a/pandas/tests/io/data/test4.xls b/pandas/tests/io/data/excel/test4.xls similarity index 100% rename from pandas/tests/io/data/test4.xls rename to pandas/tests/io/data/excel/test4.xls diff --git a/pandas/tests/io/data/test4.xlsm 
b/pandas/tests/io/data/excel/test4.xlsm similarity index 100% rename from pandas/tests/io/data/test4.xlsm rename to pandas/tests/io/data/excel/test4.xlsm diff --git a/pandas/tests/io/data/test4.xlsx b/pandas/tests/io/data/excel/test4.xlsx similarity index 100% rename from pandas/tests/io/data/test4.xlsx rename to pandas/tests/io/data/excel/test4.xlsx diff --git a/pandas/tests/io/data/test5.ods b/pandas/tests/io/data/excel/test5.ods similarity index 100% rename from pandas/tests/io/data/test5.ods rename to pandas/tests/io/data/excel/test5.ods diff --git a/pandas/tests/io/data/test5.xls b/pandas/tests/io/data/excel/test5.xls similarity index 100% rename from pandas/tests/io/data/test5.xls rename to pandas/tests/io/data/excel/test5.xls diff --git a/pandas/tests/io/data/test5.xlsm b/pandas/tests/io/data/excel/test5.xlsm similarity index 100% rename from pandas/tests/io/data/test5.xlsm rename to pandas/tests/io/data/excel/test5.xlsm diff --git a/pandas/tests/io/data/test5.xlsx b/pandas/tests/io/data/excel/test5.xlsx similarity index 100% rename from pandas/tests/io/data/test5.xlsx rename to pandas/tests/io/data/excel/test5.xlsx diff --git a/pandas/tests/io/data/test_converters.ods b/pandas/tests/io/data/excel/test_converters.ods similarity index 100% rename from pandas/tests/io/data/test_converters.ods rename to pandas/tests/io/data/excel/test_converters.ods diff --git a/pandas/tests/io/data/test_converters.xls b/pandas/tests/io/data/excel/test_converters.xls similarity index 100% rename from pandas/tests/io/data/test_converters.xls rename to pandas/tests/io/data/excel/test_converters.xls diff --git a/pandas/tests/io/data/test_converters.xlsm b/pandas/tests/io/data/excel/test_converters.xlsm similarity index 100% rename from pandas/tests/io/data/test_converters.xlsm rename to pandas/tests/io/data/excel/test_converters.xlsm diff --git a/pandas/tests/io/data/test_converters.xlsx b/pandas/tests/io/data/excel/test_converters.xlsx similarity index 100% rename from pandas/tests/io/data/test_converters.xlsx rename to pandas/tests/io/data/excel/test_converters.xlsx diff --git a/pandas/tests/io/data/test_index_name_pre17.ods b/pandas/tests/io/data/excel/test_index_name_pre17.ods similarity index 100% rename from pandas/tests/io/data/test_index_name_pre17.ods rename to pandas/tests/io/data/excel/test_index_name_pre17.ods diff --git a/pandas/tests/io/data/test_index_name_pre17.xls b/pandas/tests/io/data/excel/test_index_name_pre17.xls similarity index 100% rename from pandas/tests/io/data/test_index_name_pre17.xls rename to pandas/tests/io/data/excel/test_index_name_pre17.xls diff --git a/pandas/tests/io/data/test_index_name_pre17.xlsm b/pandas/tests/io/data/excel/test_index_name_pre17.xlsm similarity index 100% rename from pandas/tests/io/data/test_index_name_pre17.xlsm rename to pandas/tests/io/data/excel/test_index_name_pre17.xlsm diff --git a/pandas/tests/io/data/test_index_name_pre17.xlsx b/pandas/tests/io/data/excel/test_index_name_pre17.xlsx similarity index 100% rename from pandas/tests/io/data/test_index_name_pre17.xlsx rename to pandas/tests/io/data/excel/test_index_name_pre17.xlsx diff --git a/pandas/tests/io/data/test_multisheet.ods b/pandas/tests/io/data/excel/test_multisheet.ods similarity index 100% rename from pandas/tests/io/data/test_multisheet.ods rename to pandas/tests/io/data/excel/test_multisheet.ods diff --git a/pandas/tests/io/data/test_multisheet.xls b/pandas/tests/io/data/excel/test_multisheet.xls similarity index 100% rename from pandas/tests/io/data/test_multisheet.xls rename 
to pandas/tests/io/data/excel/test_multisheet.xls diff --git a/pandas/tests/io/data/test_multisheet.xlsm b/pandas/tests/io/data/excel/test_multisheet.xlsm similarity index 100% rename from pandas/tests/io/data/test_multisheet.xlsm rename to pandas/tests/io/data/excel/test_multisheet.xlsm diff --git a/pandas/tests/io/data/test_multisheet.xlsx b/pandas/tests/io/data/excel/test_multisheet.xlsx similarity index 100% rename from pandas/tests/io/data/test_multisheet.xlsx rename to pandas/tests/io/data/excel/test_multisheet.xlsx diff --git a/pandas/tests/io/data/test_squeeze.ods b/pandas/tests/io/data/excel/test_squeeze.ods similarity index 100% rename from pandas/tests/io/data/test_squeeze.ods rename to pandas/tests/io/data/excel/test_squeeze.ods diff --git a/pandas/tests/io/data/test_squeeze.xls b/pandas/tests/io/data/excel/test_squeeze.xls similarity index 100% rename from pandas/tests/io/data/test_squeeze.xls rename to pandas/tests/io/data/excel/test_squeeze.xls diff --git a/pandas/tests/io/data/test_squeeze.xlsm b/pandas/tests/io/data/excel/test_squeeze.xlsm similarity index 100% rename from pandas/tests/io/data/test_squeeze.xlsm rename to pandas/tests/io/data/excel/test_squeeze.xlsm diff --git a/pandas/tests/io/data/test_squeeze.xlsx b/pandas/tests/io/data/excel/test_squeeze.xlsx similarity index 100% rename from pandas/tests/io/data/test_squeeze.xlsx rename to pandas/tests/io/data/excel/test_squeeze.xlsx diff --git a/pandas/tests/io/data/test_types.ods b/pandas/tests/io/data/excel/test_types.ods similarity index 100% rename from pandas/tests/io/data/test_types.ods rename to pandas/tests/io/data/excel/test_types.ods diff --git a/pandas/tests/io/data/test_types.xls b/pandas/tests/io/data/excel/test_types.xls similarity index 100% rename from pandas/tests/io/data/test_types.xls rename to pandas/tests/io/data/excel/test_types.xls diff --git a/pandas/tests/io/data/test_types.xlsm b/pandas/tests/io/data/excel/test_types.xlsm similarity index 100% rename from pandas/tests/io/data/test_types.xlsm rename to pandas/tests/io/data/excel/test_types.xlsm diff --git a/pandas/tests/io/data/test_types.xlsx b/pandas/tests/io/data/excel/test_types.xlsx similarity index 100% rename from pandas/tests/io/data/test_types.xlsx rename to pandas/tests/io/data/excel/test_types.xlsx diff --git a/pandas/tests/io/data/testdateoverflow.ods b/pandas/tests/io/data/excel/testdateoverflow.ods similarity index 100% rename from pandas/tests/io/data/testdateoverflow.ods rename to pandas/tests/io/data/excel/testdateoverflow.ods diff --git a/pandas/tests/io/data/testdateoverflow.xls b/pandas/tests/io/data/excel/testdateoverflow.xls similarity index 100% rename from pandas/tests/io/data/testdateoverflow.xls rename to pandas/tests/io/data/excel/testdateoverflow.xls diff --git a/pandas/tests/io/data/testdateoverflow.xlsm b/pandas/tests/io/data/excel/testdateoverflow.xlsm similarity index 100% rename from pandas/tests/io/data/testdateoverflow.xlsm rename to pandas/tests/io/data/excel/testdateoverflow.xlsm diff --git a/pandas/tests/io/data/testdateoverflow.xlsx b/pandas/tests/io/data/excel/testdateoverflow.xlsx similarity index 100% rename from pandas/tests/io/data/testdateoverflow.xlsx rename to pandas/tests/io/data/excel/testdateoverflow.xlsx diff --git a/pandas/tests/io/data/testdtype.ods b/pandas/tests/io/data/excel/testdtype.ods similarity index 100% rename from pandas/tests/io/data/testdtype.ods rename to pandas/tests/io/data/excel/testdtype.ods diff --git a/pandas/tests/io/data/testdtype.xls 
b/pandas/tests/io/data/excel/testdtype.xls similarity index 100% rename from pandas/tests/io/data/testdtype.xls rename to pandas/tests/io/data/excel/testdtype.xls diff --git a/pandas/tests/io/data/testdtype.xlsm b/pandas/tests/io/data/excel/testdtype.xlsm similarity index 100% rename from pandas/tests/io/data/testdtype.xlsm rename to pandas/tests/io/data/excel/testdtype.xlsm diff --git a/pandas/tests/io/data/testdtype.xlsx b/pandas/tests/io/data/excel/testdtype.xlsx similarity index 100% rename from pandas/tests/io/data/testdtype.xlsx rename to pandas/tests/io/data/excel/testdtype.xlsx diff --git a/pandas/tests/io/data/testmultiindex.ods b/pandas/tests/io/data/excel/testmultiindex.ods similarity index 100% rename from pandas/tests/io/data/testmultiindex.ods rename to pandas/tests/io/data/excel/testmultiindex.ods diff --git a/pandas/tests/io/data/testmultiindex.xls b/pandas/tests/io/data/excel/testmultiindex.xls similarity index 100% rename from pandas/tests/io/data/testmultiindex.xls rename to pandas/tests/io/data/excel/testmultiindex.xls diff --git a/pandas/tests/io/data/testmultiindex.xlsm b/pandas/tests/io/data/excel/testmultiindex.xlsm similarity index 100% rename from pandas/tests/io/data/testmultiindex.xlsm rename to pandas/tests/io/data/excel/testmultiindex.xlsm diff --git a/pandas/tests/io/data/testmultiindex.xlsx b/pandas/tests/io/data/excel/testmultiindex.xlsx similarity index 100% rename from pandas/tests/io/data/testmultiindex.xlsx rename to pandas/tests/io/data/excel/testmultiindex.xlsx diff --git a/pandas/tests/io/data/testskiprows.ods b/pandas/tests/io/data/excel/testskiprows.ods similarity index 100% rename from pandas/tests/io/data/testskiprows.ods rename to pandas/tests/io/data/excel/testskiprows.ods diff --git a/pandas/tests/io/data/testskiprows.xls b/pandas/tests/io/data/excel/testskiprows.xls similarity index 100% rename from pandas/tests/io/data/testskiprows.xls rename to pandas/tests/io/data/excel/testskiprows.xls diff --git a/pandas/tests/io/data/testskiprows.xlsm b/pandas/tests/io/data/excel/testskiprows.xlsm similarity index 100% rename from pandas/tests/io/data/testskiprows.xlsm rename to pandas/tests/io/data/excel/testskiprows.xlsm diff --git a/pandas/tests/io/data/testskiprows.xlsx b/pandas/tests/io/data/excel/testskiprows.xlsx similarity index 100% rename from pandas/tests/io/data/testskiprows.xlsx rename to pandas/tests/io/data/excel/testskiprows.xlsx diff --git a/pandas/tests/io/data/times_1900.ods b/pandas/tests/io/data/excel/times_1900.ods similarity index 100% rename from pandas/tests/io/data/times_1900.ods rename to pandas/tests/io/data/excel/times_1900.ods diff --git a/pandas/tests/io/data/times_1900.xls b/pandas/tests/io/data/excel/times_1900.xls similarity index 100% rename from pandas/tests/io/data/times_1900.xls rename to pandas/tests/io/data/excel/times_1900.xls diff --git a/pandas/tests/io/data/times_1900.xlsm b/pandas/tests/io/data/excel/times_1900.xlsm similarity index 100% rename from pandas/tests/io/data/times_1900.xlsm rename to pandas/tests/io/data/excel/times_1900.xlsm diff --git a/pandas/tests/io/data/times_1900.xlsx b/pandas/tests/io/data/excel/times_1900.xlsx similarity index 100% rename from pandas/tests/io/data/times_1900.xlsx rename to pandas/tests/io/data/excel/times_1900.xlsx diff --git a/pandas/tests/io/data/times_1904.ods b/pandas/tests/io/data/excel/times_1904.ods similarity index 100% rename from pandas/tests/io/data/times_1904.ods rename to pandas/tests/io/data/excel/times_1904.ods diff --git 
a/pandas/tests/io/data/times_1904.xls b/pandas/tests/io/data/excel/times_1904.xls similarity index 100% rename from pandas/tests/io/data/times_1904.xls rename to pandas/tests/io/data/excel/times_1904.xls diff --git a/pandas/tests/io/data/times_1904.xlsm b/pandas/tests/io/data/excel/times_1904.xlsm similarity index 100% rename from pandas/tests/io/data/times_1904.xlsm rename to pandas/tests/io/data/excel/times_1904.xlsm diff --git a/pandas/tests/io/data/times_1904.xlsx b/pandas/tests/io/data/excel/times_1904.xlsx similarity index 100% rename from pandas/tests/io/data/times_1904.xlsx rename to pandas/tests/io/data/excel/times_1904.xlsx diff --git a/pandas/tests/io/data/writertable.odt b/pandas/tests/io/data/excel/writertable.odt similarity index 100% rename from pandas/tests/io/data/writertable.odt rename to pandas/tests/io/data/excel/writertable.odt diff --git a/pandas/tests/io/data/feather-0_3_1.feather b/pandas/tests/io/data/feather/feather-0_3_1.feather similarity index 100% rename from pandas/tests/io/data/feather-0_3_1.feather rename to pandas/tests/io/data/feather/feather-0_3_1.feather diff --git a/pandas/tests/io/data/fixed_width_format.txt b/pandas/tests/io/data/fixed_width/fixed_width_format.txt similarity index 100% rename from pandas/tests/io/data/fixed_width_format.txt rename to pandas/tests/io/data/fixed_width/fixed_width_format.txt diff --git a/pandas/tests/io/data/banklist.html b/pandas/tests/io/data/html/banklist.html similarity index 100% rename from pandas/tests/io/data/banklist.html rename to pandas/tests/io/data/html/banklist.html diff --git a/pandas/tests/io/data/computer_sales_page.html b/pandas/tests/io/data/html/computer_sales_page.html similarity index 100% rename from pandas/tests/io/data/computer_sales_page.html rename to pandas/tests/io/data/html/computer_sales_page.html diff --git a/pandas/tests/io/data/macau.html b/pandas/tests/io/data/html/macau.html similarity index 100% rename from pandas/tests/io/data/macau.html rename to pandas/tests/io/data/html/macau.html diff --git a/pandas/tests/io/data/nyse_wsj.html b/pandas/tests/io/data/html/nyse_wsj.html similarity index 100% rename from pandas/tests/io/data/nyse_wsj.html rename to pandas/tests/io/data/html/nyse_wsj.html diff --git a/pandas/tests/io/data/spam.html b/pandas/tests/io/data/html/spam.html similarity index 100% rename from pandas/tests/io/data/spam.html rename to pandas/tests/io/data/html/spam.html diff --git a/pandas/tests/io/data/valid_markup.html b/pandas/tests/io/data/html/valid_markup.html similarity index 100% rename from pandas/tests/io/data/valid_markup.html rename to pandas/tests/io/data/html/valid_markup.html diff --git a/pandas/tests/io/data/wikipedia_states.html b/pandas/tests/io/data/html/wikipedia_states.html similarity index 100% rename from pandas/tests/io/data/wikipedia_states.html rename to pandas/tests/io/data/html/wikipedia_states.html diff --git a/pandas/tests/io/data/orc/TestOrcFile.decimal.orc b/pandas/tests/io/data/orc/TestOrcFile.decimal.orc new file mode 100644 index 0000000000000..cb0f7b9d767a3 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.decimal.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.emptyFile.orc b/pandas/tests/io/data/orc/TestOrcFile.emptyFile.orc new file mode 100644 index 0000000000000..ecdadcbff1346 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.emptyFile.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.test1.orc b/pandas/tests/io/data/orc/TestOrcFile.test1.orc new file mode 100644 index 
0000000000000..4fb0beff86897 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.test1.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.testDate1900.orc b/pandas/tests/io/data/orc/TestOrcFile.testDate1900.orc new file mode 100644 index 0000000000000..f51ffdbd03a43 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.testDate1900.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.testDate2038.orc b/pandas/tests/io/data/orc/TestOrcFile.testDate2038.orc new file mode 100644 index 0000000000000..cd11fa8a4e91d Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.testDate2038.orc differ diff --git a/pandas/tests/io/data/orc/TestOrcFile.testSnappy.orc b/pandas/tests/io/data/orc/TestOrcFile.testSnappy.orc new file mode 100644 index 0000000000000..aa6cc9c9ba1a7 Binary files /dev/null and b/pandas/tests/io/data/orc/TestOrcFile.testSnappy.orc differ diff --git a/pandas/tests/io/data/categorical.0.25.0.pickle b/pandas/tests/io/data/pickle/categorical.0.25.0.pickle similarity index 100% rename from pandas/tests/io/data/categorical.0.25.0.pickle rename to pandas/tests/io/data/pickle/categorical.0.25.0.pickle diff --git a/pandas/tests/io/data/sparseframe-0.20.3.pickle.gz b/pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz similarity index 100% rename from pandas/tests/io/data/sparseframe-0.20.3.pickle.gz rename to pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz diff --git a/pandas/tests/io/data/sparseseries-0.20.3.pickle.gz b/pandas/tests/io/data/pickle/sparseseries-0.20.3.pickle.gz similarity index 100% rename from pandas/tests/io/data/sparseseries-0.20.3.pickle.gz rename to pandas/tests/io/data/pickle/sparseseries-0.20.3.pickle.gz diff --git a/pandas/tests/io/data/pickle/test_py27.pkl b/pandas/tests/io/data/pickle/test_py27.pkl new file mode 100644 index 0000000000000..5308b864bc0c7 Binary files /dev/null and b/pandas/tests/io/data/pickle/test_py27.pkl differ diff --git a/pandas/tests/io/data/labelled-num-na.sav b/pandas/tests/io/data/spss/labelled-num-na.sav similarity index 100% rename from pandas/tests/io/data/labelled-num-na.sav rename to pandas/tests/io/data/spss/labelled-num-na.sav diff --git a/pandas/tests/io/data/labelled-num.sav b/pandas/tests/io/data/spss/labelled-num.sav similarity index 100% rename from pandas/tests/io/data/labelled-num.sav rename to pandas/tests/io/data/spss/labelled-num.sav diff --git a/pandas/tests/io/data/labelled-str.sav b/pandas/tests/io/data/spss/labelled-str.sav similarity index 100% rename from pandas/tests/io/data/labelled-str.sav rename to pandas/tests/io/data/spss/labelled-str.sav diff --git a/pandas/tests/io/data/umlauts.sav b/pandas/tests/io/data/spss/umlauts.sav similarity index 100% rename from pandas/tests/io/data/umlauts.sav rename to pandas/tests/io/data/spss/umlauts.sav diff --git a/pandas/tests/io/data/S4_EDUC1.dta b/pandas/tests/io/data/stata/S4_EDUC1.dta similarity index 100% rename from pandas/tests/io/data/S4_EDUC1.dta rename to pandas/tests/io/data/stata/S4_EDUC1.dta diff --git a/pandas/tests/io/data/stata10_115.dta b/pandas/tests/io/data/stata/stata10_115.dta similarity index 100% rename from pandas/tests/io/data/stata10_115.dta rename to pandas/tests/io/data/stata/stata10_115.dta diff --git a/pandas/tests/io/data/stata10_117.dta b/pandas/tests/io/data/stata/stata10_117.dta similarity index 100% rename from pandas/tests/io/data/stata10_117.dta rename to pandas/tests/io/data/stata/stata10_117.dta diff --git a/pandas/tests/io/data/stata11_115.dta 
b/pandas/tests/io/data/stata/stata11_115.dta similarity index 100% rename from pandas/tests/io/data/stata11_115.dta rename to pandas/tests/io/data/stata/stata11_115.dta diff --git a/pandas/tests/io/data/stata11_117.dta b/pandas/tests/io/data/stata/stata11_117.dta similarity index 100% rename from pandas/tests/io/data/stata11_117.dta rename to pandas/tests/io/data/stata/stata11_117.dta diff --git a/pandas/tests/io/data/stata12_117.dta b/pandas/tests/io/data/stata/stata12_117.dta similarity index 100% rename from pandas/tests/io/data/stata12_117.dta rename to pandas/tests/io/data/stata/stata12_117.dta diff --git a/pandas/tests/io/data/stata13_dates.dta b/pandas/tests/io/data/stata/stata13_dates.dta similarity index 100% rename from pandas/tests/io/data/stata13_dates.dta rename to pandas/tests/io/data/stata/stata13_dates.dta diff --git a/pandas/tests/io/data/stata14_118.dta b/pandas/tests/io/data/stata/stata14_118.dta similarity index 100% rename from pandas/tests/io/data/stata14_118.dta rename to pandas/tests/io/data/stata/stata14_118.dta diff --git a/pandas/tests/io/data/stata15.dta b/pandas/tests/io/data/stata/stata15.dta similarity index 100% rename from pandas/tests/io/data/stata15.dta rename to pandas/tests/io/data/stata/stata15.dta diff --git a/pandas/tests/io/data/stata16_118.dta b/pandas/tests/io/data/stata/stata16_118.dta similarity index 100% rename from pandas/tests/io/data/stata16_118.dta rename to pandas/tests/io/data/stata/stata16_118.dta diff --git a/pandas/tests/io/data/stata1_114.dta b/pandas/tests/io/data/stata/stata1_114.dta similarity index 100% rename from pandas/tests/io/data/stata1_114.dta rename to pandas/tests/io/data/stata/stata1_114.dta diff --git a/pandas/tests/io/data/stata1_117.dta b/pandas/tests/io/data/stata/stata1_117.dta similarity index 100% rename from pandas/tests/io/data/stata1_117.dta rename to pandas/tests/io/data/stata/stata1_117.dta diff --git a/pandas/tests/io/data/stata1_119.dta.gz b/pandas/tests/io/data/stata/stata1_119.dta.gz similarity index 100% rename from pandas/tests/io/data/stata1_119.dta.gz rename to pandas/tests/io/data/stata/stata1_119.dta.gz diff --git a/pandas/tests/io/data/stata1_encoding.dta b/pandas/tests/io/data/stata/stata1_encoding.dta similarity index 100% rename from pandas/tests/io/data/stata1_encoding.dta rename to pandas/tests/io/data/stata/stata1_encoding.dta diff --git a/pandas/tests/io/data/stata1_encoding_118.dta b/pandas/tests/io/data/stata/stata1_encoding_118.dta similarity index 100% rename from pandas/tests/io/data/stata1_encoding_118.dta rename to pandas/tests/io/data/stata/stata1_encoding_118.dta diff --git a/pandas/tests/io/data/stata2_113.dta b/pandas/tests/io/data/stata/stata2_113.dta similarity index 100% rename from pandas/tests/io/data/stata2_113.dta rename to pandas/tests/io/data/stata/stata2_113.dta diff --git a/pandas/tests/io/data/stata2_114.dta b/pandas/tests/io/data/stata/stata2_114.dta similarity index 100% rename from pandas/tests/io/data/stata2_114.dta rename to pandas/tests/io/data/stata/stata2_114.dta diff --git a/pandas/tests/io/data/stata2_115.dta b/pandas/tests/io/data/stata/stata2_115.dta similarity index 100% rename from pandas/tests/io/data/stata2_115.dta rename to pandas/tests/io/data/stata/stata2_115.dta diff --git a/pandas/tests/io/data/stata2_117.dta b/pandas/tests/io/data/stata/stata2_117.dta similarity index 100% rename from pandas/tests/io/data/stata2_117.dta rename to pandas/tests/io/data/stata/stata2_117.dta diff --git a/pandas/tests/io/data/stata3.csv 
b/pandas/tests/io/data/stata/stata3.csv similarity index 100% rename from pandas/tests/io/data/stata3.csv rename to pandas/tests/io/data/stata/stata3.csv diff --git a/pandas/tests/io/data/stata3_113.dta b/pandas/tests/io/data/stata/stata3_113.dta similarity index 100% rename from pandas/tests/io/data/stata3_113.dta rename to pandas/tests/io/data/stata/stata3_113.dta diff --git a/pandas/tests/io/data/stata3_114.dta b/pandas/tests/io/data/stata/stata3_114.dta similarity index 100% rename from pandas/tests/io/data/stata3_114.dta rename to pandas/tests/io/data/stata/stata3_114.dta diff --git a/pandas/tests/io/data/stata3_115.dta b/pandas/tests/io/data/stata/stata3_115.dta similarity index 100% rename from pandas/tests/io/data/stata3_115.dta rename to pandas/tests/io/data/stata/stata3_115.dta diff --git a/pandas/tests/io/data/stata3_117.dta b/pandas/tests/io/data/stata/stata3_117.dta similarity index 100% rename from pandas/tests/io/data/stata3_117.dta rename to pandas/tests/io/data/stata/stata3_117.dta diff --git a/pandas/tests/io/data/stata4_113.dta b/pandas/tests/io/data/stata/stata4_113.dta similarity index 100% rename from pandas/tests/io/data/stata4_113.dta rename to pandas/tests/io/data/stata/stata4_113.dta diff --git a/pandas/tests/io/data/stata4_114.dta b/pandas/tests/io/data/stata/stata4_114.dta similarity index 100% rename from pandas/tests/io/data/stata4_114.dta rename to pandas/tests/io/data/stata/stata4_114.dta diff --git a/pandas/tests/io/data/stata4_115.dta b/pandas/tests/io/data/stata/stata4_115.dta similarity index 100% rename from pandas/tests/io/data/stata4_115.dta rename to pandas/tests/io/data/stata/stata4_115.dta diff --git a/pandas/tests/io/data/stata4_117.dta b/pandas/tests/io/data/stata/stata4_117.dta similarity index 100% rename from pandas/tests/io/data/stata4_117.dta rename to pandas/tests/io/data/stata/stata4_117.dta diff --git a/pandas/tests/io/data/stata5.csv b/pandas/tests/io/data/stata/stata5.csv similarity index 100% rename from pandas/tests/io/data/stata5.csv rename to pandas/tests/io/data/stata/stata5.csv diff --git a/pandas/tests/io/data/stata5_113.dta b/pandas/tests/io/data/stata/stata5_113.dta similarity index 100% rename from pandas/tests/io/data/stata5_113.dta rename to pandas/tests/io/data/stata/stata5_113.dta diff --git a/pandas/tests/io/data/stata5_114.dta b/pandas/tests/io/data/stata/stata5_114.dta similarity index 100% rename from pandas/tests/io/data/stata5_114.dta rename to pandas/tests/io/data/stata/stata5_114.dta diff --git a/pandas/tests/io/data/stata5_115.dta b/pandas/tests/io/data/stata/stata5_115.dta similarity index 100% rename from pandas/tests/io/data/stata5_115.dta rename to pandas/tests/io/data/stata/stata5_115.dta diff --git a/pandas/tests/io/data/stata5_117.dta b/pandas/tests/io/data/stata/stata5_117.dta similarity index 100% rename from pandas/tests/io/data/stata5_117.dta rename to pandas/tests/io/data/stata/stata5_117.dta diff --git a/pandas/tests/io/data/stata6.csv b/pandas/tests/io/data/stata/stata6.csv similarity index 100% rename from pandas/tests/io/data/stata6.csv rename to pandas/tests/io/data/stata/stata6.csv diff --git a/pandas/tests/io/data/stata6_113.dta b/pandas/tests/io/data/stata/stata6_113.dta similarity index 100% rename from pandas/tests/io/data/stata6_113.dta rename to pandas/tests/io/data/stata/stata6_113.dta diff --git a/pandas/tests/io/data/stata6_114.dta b/pandas/tests/io/data/stata/stata6_114.dta similarity index 100% rename from pandas/tests/io/data/stata6_114.dta rename to 
pandas/tests/io/data/stata/stata6_114.dta diff --git a/pandas/tests/io/data/stata6_115.dta b/pandas/tests/io/data/stata/stata6_115.dta similarity index 100% rename from pandas/tests/io/data/stata6_115.dta rename to pandas/tests/io/data/stata/stata6_115.dta diff --git a/pandas/tests/io/data/stata6_117.dta b/pandas/tests/io/data/stata/stata6_117.dta similarity index 100% rename from pandas/tests/io/data/stata6_117.dta rename to pandas/tests/io/data/stata/stata6_117.dta diff --git a/pandas/tests/io/data/stata7_111.dta b/pandas/tests/io/data/stata/stata7_111.dta similarity index 100% rename from pandas/tests/io/data/stata7_111.dta rename to pandas/tests/io/data/stata/stata7_111.dta diff --git a/pandas/tests/io/data/stata7_115.dta b/pandas/tests/io/data/stata/stata7_115.dta similarity index 100% rename from pandas/tests/io/data/stata7_115.dta rename to pandas/tests/io/data/stata/stata7_115.dta diff --git a/pandas/tests/io/data/stata7_117.dta b/pandas/tests/io/data/stata/stata7_117.dta similarity index 100% rename from pandas/tests/io/data/stata7_117.dta rename to pandas/tests/io/data/stata/stata7_117.dta diff --git a/pandas/tests/io/data/stata8_113.dta b/pandas/tests/io/data/stata/stata8_113.dta similarity index 100% rename from pandas/tests/io/data/stata8_113.dta rename to pandas/tests/io/data/stata/stata8_113.dta diff --git a/pandas/tests/io/data/stata8_115.dta b/pandas/tests/io/data/stata/stata8_115.dta similarity index 100% rename from pandas/tests/io/data/stata8_115.dta rename to pandas/tests/io/data/stata/stata8_115.dta diff --git a/pandas/tests/io/data/stata8_117.dta b/pandas/tests/io/data/stata/stata8_117.dta similarity index 100% rename from pandas/tests/io/data/stata8_117.dta rename to pandas/tests/io/data/stata/stata8_117.dta diff --git a/pandas/tests/io/data/stata9_115.dta b/pandas/tests/io/data/stata/stata9_115.dta similarity index 100% rename from pandas/tests/io/data/stata9_115.dta rename to pandas/tests/io/data/stata/stata9_115.dta diff --git a/pandas/tests/io/data/stata9_117.dta b/pandas/tests/io/data/stata/stata9_117.dta similarity index 100% rename from pandas/tests/io/data/stata9_117.dta rename to pandas/tests/io/data/stata/stata9_117.dta diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py index 843b3c08421b3..a257735dc1ec5 100644 --- a/pandas/tests/io/excel/conftest.py +++ b/pandas/tests/io/excel/conftest.py @@ -1,6 +1,8 @@ import pytest -import pandas.util.testing as tm +import pandas.util._test_decorators as td + +import pandas._testing as tm from pandas.io.parsers import read_csv @@ -24,11 +26,12 @@ def merge_cells(request): @pytest.fixture -def df_ref(): +def df_ref(datapath): """ Obtain the reference data from read_csv with the Python engine. """ - df_ref = read_csv("test1.csv", index_col=0, parse_dates=True, engine="python") + filepath = datapath("io", "data", "csv", "test1.csv") + df_ref = read_csv(filepath, index_col=0, parse_dates=True, engine="python") return df_ref @@ -38,3 +41,25 @@ def read_ext(request): Valid extensions for reading Excel files. """ return request.param + + +@pytest.fixture(autouse=True) +def check_for_file_leaks(): + """ + Fixture to run around every test to ensure that we are not leaking files. 
+ + See also + -------- + _test_decorators.check_file_leaks + """ + # GH#30162 + psutil = td.safe_import("psutil") + if not psutil: + yield + + else: + proc = psutil.Process() + flist = proc.open_files() + yield + flist2 = proc.open_files() + assert flist == flist2 diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 47e610562a388..b9a3e8b59b133 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm pytest.importorskip("odf") @@ -13,7 +13,7 @@ def cd_and_set_engine(monkeypatch, datapath): func = functools.partial(pd.read_excel, engine="odf") monkeypatch.setattr(pd, "read_excel", func) - monkeypatch.chdir(datapath("io", "data")) + monkeypatch.chdir(datapath("io", "data", "excel")) def test_read_invalid_types_raises(): diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index e9b4a5d4e430d..10ed192062d9c 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -1,7 +1,11 @@ +import os + +import numpy as np import pytest +import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter, _OpenpyxlWriter @@ -101,3 +105,20 @@ def test_write_append_mode(ext, mode, expected): for index, cell_value in enumerate(expected): assert wb2.worksheets[index]["A1"].value == cell_value + + +def test_to_excel_with_openpyxl_engine(ext, tmpdir): + # GH 29854 + # TODO: Fix this once newer version of openpyxl fixes the bug + df1 = DataFrame({"A": np.linspace(1, 10, 10)}) + df2 = DataFrame({"B": np.linspace(1, 20, 10)}) + df = pd.concat([df1, df2], axis=1) + styled = df.style.applymap( + lambda val: "color: %s" % "red" if val < 0 else "black" + ).highlight_max() + + filename = tmpdir / "styled.xlsx" + styled.to_excel(filename, engine="openpyxl") + + assert filename.exists() + os.remove(filename) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 1d3653f685e1e..629d3d02028bd 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -13,7 +13,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm @contextlib.contextmanager @@ -31,84 +31,116 @@ def ignore_xlrd_time_clock_warning(): yield +read_ext_params = [".xls", ".xlsx", ".xlsm", ".ods"] +engine_params = [ + # Add any engines to test here + # When defusedxml is installed it triggers deprecation warnings for + # xlrd and openpyxl, so catch those here + pytest.param( + "xlrd", + marks=[ + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ], + ), + pytest.param( + "openpyxl", + marks=[ + td.skip_if_no("openpyxl"), + pytest.mark.filterwarnings("ignore:.*html argument"), + ], + ), + pytest.param( + None, + marks=[ + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ], + ), + pytest.param("odf", marks=td.skip_if_no("odf")), +] + + +def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: + """ + Filter out invalid (engine, ext) pairs instead of skipping, as that + produces 500+ pytest.skips. 
+ """ + engine = engine.values[0] + if engine == "openpyxl" and read_ext == ".xls": + return False + if engine == "odf" and read_ext != ".ods": + return False + if read_ext == ".ods" and engine != "odf": + return False + return True + + +def _transfer_marks(engine, read_ext): + """ + engine gives us a pytest.param objec with some marks, read_ext is just + a string. We need to generate a new pytest.param inheriting the marks. + """ + values = engine.values + (read_ext,) + new_param = pytest.param(values, marks=engine.marks) + return new_param + + @pytest.fixture( + autouse=True, params=[ - # Add any engines to test here - # When defusedxml is installed it triggers deprecation warnings for - # xlrd and openpyxl, so catch those here - pytest.param( - "xlrd", - marks=[ - td.skip_if_no("xlrd"), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), - ], - ), - pytest.param( - "openpyxl", - marks=[ - td.skip_if_no("openpyxl"), - pytest.mark.filterwarnings("ignore:.*html argument"), - ], - ), - pytest.param( - None, - marks=[ - td.skip_if_no("xlrd"), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), - ], - ), - pytest.param("odf", marks=td.skip_if_no("odf")), - ] + _transfer_marks(eng, ext) + for eng in engine_params + for ext in read_ext_params + if _is_valid_engine_ext_pair(eng, ext) + ], ) -def engine(request): +def engine_and_read_ext(request): """ - A fixture for Excel reader engines. + Fixture for Excel reader engine and read_ext, only including valid pairs. """ return request.param +@pytest.fixture +def engine(engine_and_read_ext): + engine, read_ext = engine_and_read_ext + return engine + + +@pytest.fixture +def read_ext(engine_and_read_ext): + engine, read_ext = engine_and_read_ext + return read_ext + + class TestReaders: @pytest.fixture(autouse=True) - def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): + def cd_and_set_engine(self, engine, datapath, monkeypatch): """ Change directory and set engine for read_excel calls. 
""" - if engine == "openpyxl" and read_ext == ".xls": - pytest.skip() - if engine == "odf" and read_ext != ".ods": - pytest.skip() - if read_ext == ".ods" and engine != "odf": - pytest.skip() func = partial(pd.read_excel, engine=engine) - monkeypatch.chdir(datapath("io", "data")) + monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "read_excel", func) def test_usecols_int(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["A", "B", "C"]) # usecols as int - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + msg = "Passing an integer for `usecols`" + with pytest.raises(ValueError, match=msg): with ignore_xlrd_time_clock_warning(): - df1 = pd.read_excel( - "test1" + read_ext, "Sheet1", index_col=0, usecols=3 - ) + pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols=3) # usecols as int - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): + with pytest.raises(ValueError, match=msg): with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel( + pd.read_excel( "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=3 ) - # TODO add index to xls file) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) - def test_usecols_list(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["B", "C"]) @@ -499,12 +531,10 @@ def test_bad_engine_raises(self, read_ext): @tm.network def test_read_from_http_url(self, read_ext): - if read_ext == ".ods": # TODO: remove once on master - pytest.skip() url = ( - "https://raw.github.com/pandas-dev/pandas/master/" - "pandas/tests/io/data/test1" + read_ext + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" + "pandas/tests/io/data/excel/test1" + read_ext ) url_table = pd.read_excel(url) local_table = pd.read_excel("test1" + read_ext) @@ -527,7 +557,7 @@ def test_read_from_s3_url(self, read_ext, s3_resource): def test_read_from_file_url(self, read_ext, datapath): # FILE - localtable = os.path.join(datapath("io", "data"), "test1" + read_ext) + localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext) local_table = pd.read_excel(localtable) try: @@ -554,6 +584,7 @@ def test_read_from_pathlib_path(self, read_ext): tm.assert_frame_equal(expected, actual) @td.skip_if_no("py.path") + @td.check_file_leaks def test_read_from_py_localpath(self, read_ext): # GH12655 @@ -816,19 +847,13 @@ def test_read_excel_squeeze(self, read_ext): class TestExcelFileRead: @pytest.fixture(autouse=True) - def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): + def cd_and_set_engine(self, engine, datapath, monkeypatch): """ Change directory and set engine for ExcelFile objects. 
""" - if engine == "odf" and read_ext != ".ods": - pytest.skip() - if read_ext == ".ods" and engine != "odf": - pytest.skip() - if engine == "openpyxl" and read_ext == ".xls": - pytest.skip() func = partial(pd.ExcelFile, engine=engine) - monkeypatch.chdir(datapath("io", "data")) + monkeypatch.chdir(datapath("io", "data", "excel")) monkeypatch.setattr(pd, "ExcelFile", func) def test_excel_passes_na(self, read_ext): @@ -892,10 +917,11 @@ def test_excel_passes_na_filter(self, read_ext, na_filter): tm.assert_frame_equal(parsed, expected) @pytest.mark.parametrize("arg", ["sheet", "sheetname", "parse_cols"]) + @td.check_file_leaks def test_unexpected_kwargs_raises(self, read_ext, arg): # gh-17964 kwarg = {arg: "Sheet1"} - msg = "unexpected keyword argument `{}`".format(arg) + msg = r"unexpected keyword argument `{}`".format(arg) with pd.ExcelFile("test1" + read_ext) as excel: with pytest.raises(TypeError, match=msg): @@ -919,14 +945,6 @@ def test_excel_table_sheet_by_index(self, read_ext, df_ref): df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False - ): - with pd.ExcelFile("test1" + read_ext) as excel: - df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) - - tm.assert_frame_equal(df3, df4) - with pd.ExcelFile("test1" + read_ext) as excel: df3 = excel.parse(0, index_col=0, skipfooter=1) @@ -970,3 +988,13 @@ def test_conflicting_excel_engines(self, read_ext): with pd.ExcelFile("test1" + read_ext) as xl: with pytest.raises(ValueError, match=msg): pd.read_excel(xl, engine="foo") + + def test_excel_read_binary(self, engine, read_ext): + # GH 15914 + expected = pd.read_excel("test1" + read_ext, engine=engine) + + with open("test1" + read_ext, "rb") as f: + data = f.read() + + actual = pd.read_excel(data, engine=engine) + tm.assert_frame_equal(expected, actual) diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 41363bf13ed4e..88f4c3736bc0d 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter from pandas.io.formats.excel import ExcelFormatter diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 1bc4ad3e7867a..55b987a599670 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -6,12 +6,11 @@ import numpy as np import pytest -from pandas.compat import PY36 import pandas.util._test_decorators as td import pandas as pd from pandas import DataFrame, Index, MultiIndex, get_option, set_option -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ( ExcelFile, @@ -253,7 +252,7 @@ def test_read_excel_parse_dates(self, ext): res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0) tm.assert_frame_equal(df, res) - date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") res = pd.read_excel( pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0 ) @@ -810,6 +809,7 @@ def test_to_excel_unicode_filename(self, ext, path): ) tm.assert_frame_equal(result, expected) + # FIXME: dont leave commented-out # def test_to_excel_header_styling_xls(self, engine, ext): # import StringIO @@ -1010,13 +1010,9 @@ def 
test_invalid_columns(self, path): # see gh-10982 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with pytest.raises(KeyError, match="Not all names specified"): write_frame.to_excel(path, "test1", columns=["B", "C"]) - expected = write_frame.reindex(columns=["B", "C"]) - read_frame = pd.read_excel(path, "test1", index_col=0) - tm.assert_frame_equal(expected, read_frame) - with pytest.raises( KeyError, match="'passes columns are not ALL present dataframe'" ): @@ -1262,7 +1258,6 @@ def check_called(func): @td.skip_if_no("xlrd") @td.skip_if_no("openpyxl") -@pytest.mark.skipif(not PY36, reason="requires fspath") class TestFSPath: def test_excelfile_fspath(self): with tm.ensure_clean("foo.xlsx") as path: diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index fc36be9e1b738..d1f900a2dc58b 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelFile @@ -35,7 +35,7 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well def test_excel_table_sheet_by_index(datapath, read_ext): - path = datapath("io", "data", "test1{}".format(read_ext)) + path = datapath("io", "data", "excel", "test1{}".format(read_ext)) with pd.ExcelFile(path) as excel: with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, "asdf") diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index deb72cc230669..b6f791434a92b 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -3,7 +3,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter diff --git a/pandas/tests/io/excel/test_xlwt.py b/pandas/tests/io/excel/test_xlwt.py index c6af78c2704d8..01feab08eb5e3 100644 --- a/pandas/tests/io/excel/test_xlwt.py +++ b/pandas/tests/io/excel/test_xlwt.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter, _XlwtWriter diff --git a/pandas/tests/io/formats/data/html/render_links_false.html b/pandas/tests/io/formats/data/html/render_links_false.html index 6509a0e985597..6feb403d63051 100644 --- a/pandas/tests/io/formats/data/html/render_links_false.html +++ b/pandas/tests/io/formats/data/html/render_links_false.html @@ -11,7 +11,7 @@ - + diff --git a/pandas/tests/io/formats/data/html/render_links_true.html b/pandas/tests/io/formats/data/html/render_links_true.html index e9cb5632aad1d..3eb53f3160a77 100644 --- a/pandas/tests/io/formats/data/html/render_links_true.html +++ b/pandas/tests/io/formats/data/html/render_links_true.html @@ -11,7 +11,7 @@ - + diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index a6ad5d5edbf5f..7008cef7b28fa 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -1,6 +1,6 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.formats.css import CSSResolver, CSSWarning @@ -101,29 +101,25 @@ def test_css_side_shorthands(shorthand, expansions): top, right, bottom, left = expansions assert_resolves( - "{shorthand}: 1pt".format(shorthand=shorthand), - {top: "1pt", right: "1pt", 
bottom: "1pt", left: "1pt"}, + f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}, ) assert_resolves( - "{shorthand}: 1pt 4pt".format(shorthand=shorthand), - {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, + f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, ) assert_resolves( - "{shorthand}: 1pt 4pt 2pt".format(shorthand=shorthand), + f"{shorthand}: 1pt 4pt 2pt", {top: "1pt", right: "4pt", bottom: "2pt", left: "4pt"}, ) assert_resolves( - "{shorthand}: 1pt 4pt 2pt 0pt".format(shorthand=shorthand), + f"{shorthand}: 1pt 4pt 2pt 0pt", {top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"}, ) with tm.assert_produces_warning(CSSWarning): - assert_resolves( - "{shorthand}: 1pt 1pt 1pt 1pt 1pt".format(shorthand=shorthand), {} - ) + assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", {}) @pytest.mark.parametrize( @@ -174,10 +170,10 @@ def test_css_none_absent(style, equiv): "size,resolved", [ ("xx-small", "6pt"), - ("x-small", "{pt:f}pt".format(pt=7.5)), - ("small", "{pt:f}pt".format(pt=9.6)), + ("x-small", f"{7.5:f}pt"), + ("small", f"{9.6:f}pt"), ("medium", "12pt"), - ("large", "{pt:f}pt".format(pt=13.5)), + ("large", f"{13.5:f}pt"), ("x-large", "18pt"), ("xx-large", "24pt"), ("8px", "6pt"), @@ -196,9 +192,7 @@ def test_css_absolute_font_size(size, relative_to, resolved): else: inherited = {"font-size": relative_to} assert_resolves( - "font-size: {size}".format(size=size), - {"font-size": resolved}, - inherited=inherited, + f"font-size: {size}", {"font-size": resolved}, inherited=inherited, ) @@ -224,7 +218,7 @@ def test_css_absolute_font_size(size, relative_to, resolved): ("inherit", "16pt", "16pt"), ("smaller", None, "10pt"), ("smaller", "18pt", "15pt"), - ("larger", None, "{pt:f}pt".format(pt=14.4)), + ("larger", None, f"{14.4:f}pt"), ("larger", "15pt", "18pt"), ], ) @@ -234,7 +228,5 @@ def test_css_relative_font_size(size, relative_to, resolved): else: inherited = {"font-size": relative_to} assert_resolves( - "font-size: {size}".format(size=size), - {"font-size": resolved}, - inherited=inherited, + f"font-size: {size}", {"font-size": resolved}, inherited=inherited, ) diff --git a/pandas/tests/io/formats/test_eng_formatting.py b/pandas/tests/io/formats/test_eng_formatting.py index 2edbff3766c9d..6801316ada8a3 100644 --- a/pandas/tests/io/formats/test_eng_formatting.py +++ b/pandas/tests/io/formats/test_eng_formatting.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 704de378b0909..97956489e7da6 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -18,7 +18,7 @@ import pytest import pytz -from pandas.compat import PY36, is_platform_32bit, is_platform_windows +from pandas.compat import is_platform_32bit, is_platform_windows import pandas as pd from pandas import ( @@ -35,7 +35,7 @@ reset_option, set_option, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt import pandas.io.formats.printing as printing @@ -62,10 +62,7 @@ def filepath_or_buffer(filepath_or_buffer_id, tmp_path): yield buf assert not buf.closed else: - if PY36: - assert isinstance(tmp_path, Path) - else: - assert hasattr(tmp_path, "__fspath__") + assert isinstance(tmp_path, Path) if filepath_or_buffer_id == "pathlike": yield tmp_path / "foo" else: @@ 
-424,12 +421,10 @@ def test_repr_truncation_column_size(self): def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: - pytest.skip( - "terminal size too small, {0} x {1}".format(term_width, term_height) - ) + pytest.skip(f"terminal size too small, {term_width} x {term_height}") def mkframe(n): - index = ["{i:05d}".format(i=i) for i in range(n)] + index = [f"{i:05d}" for i in range(n)] return DataFrame(0, index, index) df6 = mkframe(6) @@ -449,7 +444,7 @@ def mkframe(n): assert not has_truncated_repr(df6) with option_context("display.max_rows", 9, "display.max_columns", 10): - # out vertical bounds can not result in exanded repr + # out vertical bounds can not result in expanded repr assert not has_expanded_repr(df10) assert has_vertically_truncated_repr(df10) @@ -670,9 +665,9 @@ def test_to_string_with_formatters(self): ) formatters = [ - ("int", lambda x: "0x{x:x}".format(x=x)), - ("float", lambda x: "[{x: 4.1f}]".format(x=x)), - ("object", lambda x: "-{x!s}-".format(x=x)), + ("int", lambda x: f"0x{x:x}"), + ("float", lambda x: f"[{x: 4.1f}]"), + ("object", lambda x: f"-{x!s}-"), ] result = df.to_string(formatters=dict(formatters)) result2 = df.to_string(formatters=list(zip(*formatters))[1]) @@ -714,7 +709,7 @@ def format_func(x): def test_to_string_with_formatters_unicode(self): df = DataFrame({"c/\u03c3": [1, 2, 3]}) - result = df.to_string(formatters={"c/\u03c3": lambda x: "{x}".format(x=x)}) + result = df.to_string(formatters={"c/\u03c3": str}) assert result == " c/\u03c3\n" + "0 1\n1 2\n2 3" def test_east_asian_unicode_false(self): @@ -1020,7 +1015,7 @@ def test_east_asian_unicode_true(self): def test_to_string_buffer_all_unicode(self): buf = StringIO() - empty = DataFrame({"c/\u03c3": Series()}) + empty = DataFrame({"c/\u03c3": Series(dtype=object)}) nonempty = DataFrame({"c/\u03c3": Series([1, 2, 3])}) print(empty, file=buf) @@ -1109,6 +1104,15 @@ def test_truncate_with_different_dtypes(self): assert "None" in result assert "NaN" not in result + def test_truncate_with_different_dtypes_multiindex(self): + # GH#13000 + df = DataFrame({"Vals": range(100)}) + frame = pd.concat([df], keys=["Sweep"], names=["Sweep", "Index"]) + result = repr(frame) + + result2 = repr(frame.iloc[:5]) + assert result.startswith(result2) + def test_datetimelike_frame(self): # GH 12211 @@ -1234,7 +1238,7 @@ def test_wide_repr(self): set_option("display.expand_frame_repr", False) rep_str = repr(df) - assert "10 rows x {c} columns".format(c=max_cols - 1) in rep_str + assert f"10 rows x {max_cols - 1} columns" in rep_str set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr @@ -1345,7 +1349,7 @@ def test_long_series(self): n = 1000 s = Series( np.random.randint(-50, 50, n), - index=["s{x:04d}".format(x=x) for x in range(n)], + index=[f"s{x:04d}" for x in range(n)], dtype="int64", ) @@ -1471,9 +1475,7 @@ def test_to_string(self): expected = ["A"] assert header == expected - biggie.to_string( - columns=["B", "A"], formatters={"A": lambda x: "{x:.1f}".format(x=x)} - ) + biggie.to_string(columns=["B", "A"], formatters={"A": lambda x: f"{x:.1f}"}) biggie.to_string(columns=["B", "A"], float_format=str) biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) @@ -1604,7 +1606,7 @@ def test_to_string_small_float_values(self): result = df.to_string() # sadness per above - if "{x:.4g}".format(x=1.7e8) == "1.7e+008": + if _three_digit_exp(): expected = ( " a\n" "0 1.500000e+000\n" @@ -1916,7 
+1918,7 @@ def test_repr_html_long(self): long_repr = df._repr_html_() assert ".." in long_repr assert str(41 + max_rows // 2) not in long_repr - assert "{h} rows ".format(h=h) in long_repr + assert f"{h} rows " in long_repr assert "2 columns" in long_repr def test_repr_html_float(self): @@ -1933,7 +1935,7 @@ def test_repr_html_float(self): ).set_index("idx") reg_repr = df._repr_html_() assert ".." not in reg_repr - assert "".format(val=str(40 + h)) in reg_repr + assert f"" in reg_repr h = max_rows + 1 df = DataFrame( @@ -1945,8 +1947,8 @@ def test_repr_html_float(self): ).set_index("idx") long_repr = df._repr_html_() assert ".." in long_repr - assert "".format(val="31") not in long_repr - assert "{h} rows ".format(h=h) in long_repr + assert "" not in long_repr + assert f"{h} rows " in long_repr assert "2 columns" in long_repr def test_repr_html_long_multiindex(self): @@ -2175,9 +2177,7 @@ def test_to_string(self): cp.name = "foo" result = cp.to_string(length=True, name=True, dtype=True) last_line = result.split("\n")[-1].strip() - assert last_line == ( - "Freq: B, Name: foo, Length: {cp}, dtype: float64".format(cp=len(cp)) - ) + assert last_line == (f"Freq: B, Name: foo, Length: {len(cp)}, dtype: float64") def test_freq_name_separation(self): s = Series( @@ -2379,7 +2379,8 @@ def test_east_asian_unicode_series(self): # object dtype, longer than unicode repr s = Series( - [1, 22, 3333, 44444], index=[1, "AB", pd.Timestamp("2011-01-01"), "あああ"] + [1, 22, 3333, 44444], + index=[1, "AB", pd.Timestamp("2011-01-01"), "あああ"], ) expected = ( "1 1\n" @@ -2658,14 +2659,14 @@ def test_format_explicit(self): assert exp == res res = repr(test_sers["asc"]) exp = ( - "0 a\n1 ab\n ... \n4 abcde\n5" - " abcdef\ndtype: object" + "0 a\n1 ab\n ... \n4 abcde\n5 " + "abcdef\ndtype: object" ) assert exp == res res = repr(test_sers["desc"]) exp = ( - "5 abcdef\n4 abcde\n ... \n1 ab\n0" - " a\ndtype: object" + "5 abcdef\n4 abcde\n ... 
\n1 ab\n0 " + "a\ndtype: object" ) assert exp == res @@ -2768,14 +2769,14 @@ def test_to_string_length(self): assert res == exp def test_to_string_na_rep(self): - s = pd.Series(index=range(100)) + s = pd.Series(index=range(100), dtype=np.float64) res = s.to_string(na_rep="foo", max_rows=2) exp = "0 foo\n ..\n99 foo" assert res == exp def test_to_string_float_format(self): s = pd.Series(range(10), dtype="float64") - res = s.to_string(float_format=lambda x: "{0:2.1f}".format(x), max_rows=2) + res = s.to_string(float_format=lambda x: f"{x:2.1f}", max_rows=2) exp = "0 0.0\n ..\n9 9.0" assert res == exp @@ -2800,7 +2801,7 @@ def test_to_string_multindex_header(self): def _three_digit_exp(): - return "{x:.4g}".format(x=1.7e8) == "1.7e+008" + return f"{1.7e8:.4g}" == "1.7e+008" class TestFloatArrayFormatter: @@ -3262,8 +3263,9 @@ def test_filepath_or_buffer_arg( ): getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) elif encoding == "foo": - with pytest.raises(LookupError, match="unknown encoding"): - getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) + with tm.assert_produces_warning(None): + with pytest.raises(LookupError, match="unknown encoding"): + getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) else: expected = getattr(df, method)() getattr(df, method)(buf=filepath_or_buffer, encoding=encoding) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 0f1402d7da389..e5dac18acedf6 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm jinja2 = pytest.importorskip("jinja2") from pandas.io.formats.style import Styler, _get_level_lengths # noqa # isort:skip @@ -24,7 +24,7 @@ def setup_method(self, method): self.g = lambda x: x def h(x, foo="bar"): - return pd.Series("color: {foo}".format(foo=foo), index=x.index, name=x.name) + return pd.Series(f"color: {foo}", index=x.index, name=x.name) self.h = h self.styler = Styler(self.df) @@ -278,7 +278,7 @@ def test_numeric_columns(self): def test_apply_axis(self): df = pd.DataFrame({"A": [0, 0], "B": [1, 1]}) - f = lambda x: ["val: {max}".format(max=x.max()) for v in x] + f = lambda x: [f"val: {x.max()}" for v in x] result = df.style.apply(f, axis=1) assert len(result._todo) == 1 assert len(result.ctx) == 0 @@ -362,7 +362,7 @@ def color_negative_red(val): strings, black otherwise. 
""" color = "red" if val < 0 else "black" - return "color: {color}".format(color=color) + return f"color: {color}" dic = { ("a", "d"): [-1.12, 2.11], @@ -376,6 +376,25 @@ def color_negative_red(val): (df.style.applymap(color_negative_red, subset=idx[:, idx["b", "d"]]).render()) + def test_applymap_subset_multiindex_code(self): + # https://github.com/pandas-dev/pandas/issues/25858 + # Checks styler.applymap works with multindex when codes are provided + codes = np.array([[0, 0, 1, 1], [0, 1, 0, 1]]) + columns = pd.MultiIndex( + levels=[["a", "b"], ["%", "#"]], codes=codes, names=["", ""] + ) + df = DataFrame( + [[1, -1, 1, 1], [-1, 1, 1, 1]], index=["hello", "world"], columns=columns + ) + pct_subset = pd.IndexSlice[:, pd.IndexSlice[:, "%":"%"]] + + def color_negative_red(val): + color = "red" if val < 0 else "black" + return f"color: {color}" + + df.loc[pct_subset] + df.style.applymap(color_negative_red, subset=pct_subset) + def test_where_with_one_style(self): # GH 17474 def f(x): @@ -511,20 +530,17 @@ def test_bar_align_left_0points(self): (1, 0): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (1, 1): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (1, 2): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (2, 0): [ "width: 10em", @@ -553,8 +569,7 @@ def test_bar_align_left_0points(self): (0, 1): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (0, 2): [ "width: 10em", @@ -990,6 +1005,75 @@ def test_bar_bad_align_raises(self): with pytest.raises(ValueError): df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) + def test_format_with_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = df.style.format(None, na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + + ctx = df.style.format("{:.2%}", na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "110.00%" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate() + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "120.00%" + + def test_init_with_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = Styler(df, na_rep="NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + def test_set_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + + ctx = ( + df.style.set_na_rep("NA") + .format(None, na_rep="-", subset=["B"]) + ._translate() + ) + assert ctx["body"][0][1]["display_value"] == "NA" + assert 
ctx["body"][0][2]["display_value"] == "-" + + def test_format_non_numeric_na(self): + # GH 21527 28358 + df = pd.DataFrame( + { + "object": [None, np.nan, "foo"], + "datetime": [None, pd.NaT, pd.Timestamp("20120101")], + } + ) + + ctx = df.style.set_na_rep("NA")._translate() + assert ctx["body"][0][1]["display_value"] == "NA" + assert ctx["body"][0][2]["display_value"] == "NA" + assert ctx["body"][1][1]["display_value"] == "NA" + assert ctx["body"][1][2]["display_value"] == "NA" + + ctx = df.style.format(None, na_rep="-")._translate() + assert ctx["body"][0][1]["display_value"] == "-" + assert ctx["body"][0][2]["display_value"] == "-" + assert ctx["body"][1][1]["display_value"] == "-" + assert ctx["body"][1][2]["display_value"] == "-" + + def test_format_with_bad_na_rep(self): + # GH 21527 28358 + df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"]) + with pytest.raises(TypeError): + df.style.format(None, na_rep=-1) + def test_highlight_null(self, null_color="red"): df = pd.DataFrame({"A": [0, np.nan]}) result = df.style.highlight_null()._compute().ctx @@ -1127,13 +1211,9 @@ def test_highlight_max(self): def test_export(self): f = lambda x: "color: red" if x > 0 else "color: blue" - g = ( - lambda x, y, z: "color: {z}".format(z=z) - if x > 0 - else "color: {z}".format(z=z) - ) + g = lambda x, z: f"color: {z}" if x > 0 else f"color: {z}" style1 = self.styler - style1.applymap(f).applymap(g, y="a", z="b").highlight_max() + style1.applymap(f).applymap(g, z="b").highlight_max() result = style1.export() style2 = self.df.style style2.use(result) @@ -1557,9 +1637,7 @@ def test_hide_columns_mult_levels(self): def test_pipe(self): def set_caption_from_template(styler, a, b): - return styler.set_caption( - "Dataframe with a = {a} and b = {b}".format(a=a, b=b) - ) + return styler.set_caption(f"Dataframe with a = {a} and b = {b}") styler = self.df.style.pipe(set_caption_from_template, "A", b="B") assert "Dataframe with a = A and b = B" in styler.render() @@ -1648,6 +1726,23 @@ def test_background_gradient_axis(self): assert result[(1, 0)] == mid assert result[(1, 1)] == high + def test_background_gradient_vmin_vmax(self): + # GH 12145 + df = pd.DataFrame(range(5)) + ctx = df.style.background_gradient(vmin=1, vmax=3)._compute().ctx + assert ctx[(0, 0)] == ctx[(1, 0)] + assert ctx[(4, 0)] == ctx[(3, 0)] + + def test_background_gradient_int64(self): + # GH 28869 + df1 = pd.Series(range(3)).to_frame() + df2 = pd.Series(range(3), dtype="Int64").to_frame() + ctx1 = df1.style.background_gradient()._compute().ctx + ctx2 = df2.style.background_gradient()._compute().ctx + assert ctx2[(0, 0)] == ctx1[(0, 0)] + assert ctx2[(1, 0)] == ctx1[(1, 0)] + assert ctx2[(2, 0)] == ctx1[(2, 0)] + def test_block_names(): # catch accidental removal of a block diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 80edbd828194d..a211ac11cf725 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,3 +1,4 @@ +import io import os import sys @@ -6,7 +7,7 @@ import pandas as pd from pandas import DataFrame, compat -import pandas.util.testing as tm +import pandas._testing as tm class TestToCSV: @@ -204,6 +205,14 @@ def test_to_csv_na_rep(self): assert df.set_index("a").to_csv(na_rep="_") == expected assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + # GH 29975 + # Make sure full na_rep shows up when a dtype is provided + csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ") + expected = 
tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) + assert expected == csv + csv = pd.Series(["a", pd.NA, "c"], dtype="string").to_csv(na_rep="ZZZZZ") + assert expected == csv + def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) @@ -376,16 +385,14 @@ def test_to_csv_string_with_lf(self): assert f.read() == expected_noarg with tm.ensure_clean("lf_test.csv") as path: # case 2: LF as line terminator - expected_lf = b"int,str_lf\n" b"1,abc\n" b'2,"d\nef"\n' b'3,"g\nh\n\ni"\n' + expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' df.to_csv(path, line_terminator="\n", index=False) with open(path, "rb") as f: assert f.read() == expected_lf with tm.ensure_clean("lf_test.csv") as path: # case 3: CRLF as line terminator # 'line_terminator' should not change inner element - expected_crlf = ( - b"int,str_lf\r\n" b"1,abc\r\n" b'2,"d\nef"\r\n' b'3,"g\nh\n\ni"\r\n' - ) + expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' df.to_csv(path, line_terminator="\r\n", index=False) with open(path, "rb") as f: assert f.read() == expected_crlf @@ -412,9 +419,7 @@ def test_to_csv_string_with_crlf(self): assert f.read() == expected_noarg with tm.ensure_clean("crlf_test.csv") as path: # case 2: LF as line terminator - expected_lf = ( - b"int,str_crlf\n" b"1,abc\n" b'2,"d\r\nef"\n' b'3,"g\r\nh\r\n\r\ni"\n' - ) + expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' df.to_csv(path, line_terminator="\n", index=False) with open(path, "rb") as f: assert f.read() == expected_lf @@ -490,10 +495,7 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): compression = compression_only if compression == "zip": - pytest.skip( - "{compression} is not supported " - "for to_csv".format(compression=compression) - ) + pytest.skip(f"{compression} is not supported for to_csv") # We'll complete file extension subsequently. filename = "test." 
@@ -567,3 +569,17 @@ def test_to_csv_na_rep_long_string(self, df_new_type): result = df.to_csv(index=False, na_rep="mynull", encoding="ascii") assert expected == result + + def test_to_csv_timedelta_precision(self): + # GH 6783 + s = pd.Series([1, 1]).astype("timedelta64[ns]") + buf = io.StringIO() + s.to_csv(buf) + result = buf.getvalue() + expected_rows = [ + ",0", + "0,0 days 00:00:00.000000001", + "1,0 days 00:00:00.000000001", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 1440b0a6f06f1..883240b74c32c 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -5,7 +5,7 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.formats.css import CSSWarning from pandas.io.formats.excel import CSSToExcelConverter @@ -262,7 +262,7 @@ def test_css_to_excel_inherited(css, inherited, expected): @pytest.mark.parametrize( "input_color,output_color", ( - [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] + list(CSSToExcelConverter.NAMED_COLORS.items()) + [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] + [("#F0F", "FF00FF"), ("#ABC", "AABBCC")] ), @@ -270,13 +270,13 @@ def test_css_to_excel_inherited(css, inherited, expected): def test_css_to_excel_good_colors(input_color, output_color): # see gh-18392 css = ( - "border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}" - ).format(color=input_color) + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; " + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) expected = dict() @@ -297,13 +297,13 @@ def test_css_to_excel_good_colors(input_color, output_color): def test_css_to_excel_bad_colors(input_color): # see gh-18392 css = ( - "border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}" - ).format(color=input_color) + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; " + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) expected = dict() diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index a2a577a0753f7..d3f044a42eb28 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -7,18 +7,18 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, option_context -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt lorem_ipsum = ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" - " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" - " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex" - " ea commodo consequat. Duis aute irure dolor in reprehenderit in" - " voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur" - " sint occaecat cupidatat non proident, sunt in culpa qui officia" - " deserunt mollit anim id est laborum." 
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod " + "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim " + "veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex " + "ea commodo consequat. Duis aute irure dolor in reprehenderit in " + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur " + "sint occaecat cupidatat non proident, sunt in culpa qui officia " + "deserunt mollit anim id est laborum." ) @@ -688,7 +688,7 @@ def test_to_html_float_format_no_fixed_width(value, float_format, expected, data def test_to_html_render_links(render_links, expected, datapath): # GH 2679 data = [ - [0, "http://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], + [0, "https://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], [0, "www.pydata.org", "pydata.org"], ] df = DataFrame(data, columns=["foo", "bar", None]) diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index ea8688517bd93..bd681032f155d 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestToLatex: diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py new file mode 100644 index 0000000000000..8893e4294353f --- /dev/null +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -0,0 +1,55 @@ +from io import StringIO + +import pytest + +import pandas as pd + +pytest.importorskip("tabulate") + + +def test_simple(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf) + result = buf.getvalue() + assert ( + result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_other_tablefmt(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf, tablefmt="jira") + result = buf.getvalue() + assert result == "|| || 0 ||\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + + +def test_other_headers(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf, headers=["foo", "bar"]) + result = buf.getvalue() + assert result == ( + "| foo | bar |\n|------:|------:|\n| 0 " + "| 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_series(): + buf = StringIO() + s = pd.Series([1, 2, 3], name="foo") + s.to_markdown(buf=buf) + result = buf.getvalue() + assert result == ( + "| | foo |\n|---:|------:|\n| 0 | 1 " + "|\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_no_buf(capsys): + df = pd.DataFrame([1, 2, 3]) + result = df.to_markdown() + assert ( + result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index e63644a44a81f..6ef0e0457e2e2 100755 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -self-contained to write legacy storage (pickle/msgpack) files +self-contained to write legacy storage pickle files To use this script. 
Create an environment where you want generate pickles, say its for 0.20.3, with your pandas clone @@ -58,7 +58,6 @@ date_range, period_range, timedelta_range, - to_msgpack, ) from pandas.tseries.offsets import ( @@ -136,7 +135,7 @@ def _create_sp_frame(): def create_data(): - """ create the pickle/msgpack data """ + """ create the pickle data """ data = { "A": [0.0, 1.0, 2.0, 3.0, np.nan], @@ -306,28 +305,6 @@ def create_pickle_data(): return data -def _u(x): - return {k: _u(x[k]) for k in x} if isinstance(x, dict) else x - - -def create_msgpack_data(): - data = create_data() - # Not supported - del data["sp_series"] - del data["sp_frame"] - del data["series"]["cat"] - del data["series"]["period"] - del data["frame"]["cat_onecol"] - del data["frame"]["cat_and_float"] - del data["scalars"]["period"] - if _loose_version >= LooseVersion("0.21") and ( - _loose_version < LooseVersion("0.23.0") - ): - del data["index"]["interval"] - del data["offsets"] - return _u(data) - - def platform_name(): return "_".join( [ @@ -360,23 +337,6 @@ def write_legacy_pickles(output_dir): print("created pickle file: {pth}".format(pth=pth)) -def write_legacy_msgpack(output_dir, compress): - - version = pandas.__version__ - - print( - "This script generates a storage file for the current arch, " - "system, and python version" - ) - print(" pandas version: {0}".format(version)) - print(" output dir : {0}".format(output_dir)) - print(" storage format: msgpack") - pth = "{0}.msgpack".format(platform_name()) - to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(), compress=compress) - - print("created msgpack file: {pth}".format(pth=pth)) - - def write_legacy_file(): # force our cwd to be the first searched sys.path.insert(0, ".") @@ -385,22 +345,15 @@ def write_legacy_file(): exit( "Specify output directory and storage type: generate_legacy_" "storage_files.py " - "" ) output_dir = str(sys.argv[1]) storage_type = str(sys.argv[2]) - try: - compress_type = str(sys.argv[3]) - except IndexError: - compress_type = None if storage_type == "pickle": write_legacy_pickles(output_dir=output_dir) - elif storage_type == "msgpack": - write_legacy_msgpack(output_dir=output_dir, compress=compress_type) else: - exit("storage_type must be one of {'pickle', 'msgpack'}") + exit("storage_type must be one of {'pickle'}") if __name__ == "__main__": diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index adbb9dfbd2ddf..182c21ed1d416 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -3,7 +3,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_compression_roundtrip(compression): @@ -90,10 +90,7 @@ def test_to_json_compression(compression_only, read_infer, to_infer): compression = compression_only if compression == "zip": - pytest.skip( - "{compression} is not supported " - "for to_csv".format(compression=compression) - ) + pytest.skip(f"{compression} is not supported for to_csv") # We'll complete file extension subsequently. filename = "test." 
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 569e299860614..2ac2acc6748d1 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -5,13 +5,11 @@ import numpy as np import pytest -from pandas.compat import PY35 - from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.json._table_schema import ( as_json_table_type, @@ -22,14 +20,6 @@ ) -def assert_results_equal(result, expected): - """Helper function for comparing deserialized JSON with Py35 compat.""" - if PY35: - assert sorted(result.items()) == sorted(expected.items()) - else: - assert result == expected - - class TestBuildSchema: def setup_method(self, method): self.df = DataFrame( @@ -245,7 +235,7 @@ def test_build_series(self): ] ) - assert_results_equal(result, expected) + assert result == expected def test_to_json(self): df = self.df.copy() @@ -335,7 +325,7 @@ def test_to_json(self): ] expected = OrderedDict([("schema", schema), ("data", data)]) - assert_results_equal(result, expected) + assert result == expected def test_to_json_float_index(self): data = pd.Series(1, index=[1.0, 2.0]) @@ -365,7 +355,7 @@ def test_to_json_float_index(self): ] ) - assert_results_equal(result, expected) + assert result == expected def test_to_json_period_index(self): idx = pd.period_range("2016", freq="Q-JAN", periods=2) @@ -386,7 +376,7 @@ def test_to_json_period_index(self): ] expected = OrderedDict([("schema", schema), ("data", data)]) - assert_results_equal(result, expected) + assert result == expected def test_to_json_categorical_index(self): data = pd.Series(1, pd.CategoricalIndex(["a", "b"])) @@ -421,7 +411,7 @@ def test_to_json_categorical_index(self): ] ) - assert_results_equal(result, expected) + assert result == expected def test_date_format_raises(self): with pytest.raises(ValueError): @@ -431,15 +421,15 @@ def test_date_format_raises(self): self.df.to_json(orient="table", date_format="iso") self.df.to_json(orient="table") - @pytest.mark.parametrize("kind", [pd.Series, pd.Index]) - def test_convert_pandas_type_to_json_field_int(self, kind): + def test_convert_pandas_type_to_json_field_int(self, index_or_series): + kind = index_or_series data = [1, 2, 3] result = convert_pandas_type_to_json_field(kind(data, name="name")) expected = {"name": "name", "type": "integer"} assert result == expected - @pytest.mark.parametrize("kind", [pd.Series, pd.Index]) - def test_convert_pandas_type_to_json_field_float(self, kind): + def test_convert_pandas_type_to_json_field_float(self, index_or_series): + kind = index_or_series data = [1.0, 2.0, 3.0] result = convert_pandas_type_to_json_field(kind(data, name="name")) expected = {"name": "name", "type": "number"} @@ -523,7 +513,7 @@ def test_convert_json_field_to_pandas_type(self, inp, exp): def test_convert_json_field_to_pandas_type_raises(self, inp): field = {"type": inp} with pytest.raises( - ValueError, match=("Unsupported or invalid field type: {}".format(inp)) + ValueError, match=f"Unsupported or invalid field type: {inp}" ): convert_json_field_to_pandas_type(field) @@ -558,7 +548,7 @@ def test_categorical(self): ] ) - assert_results_equal(result, expected) + assert result == expected @pytest.mark.parametrize( "idx,nm,prop", diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 
a3ca61cb1eb63..efb95a0cb2a42 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -3,12 +3,9 @@ import numpy as np import pytest -from pandas.compat import PY36 +from pandas import DataFrame, Index, json_normalize +import pandas._testing as tm -from pandas import DataFrame, Index -import pandas.util.testing as tm - -from pandas.io.json import json_normalize from pandas.io.json._normalize import nested_to_record @@ -382,7 +379,7 @@ def test_missing_field(self, author_missing_data): }, ] expected = DataFrame(ex_data) - tm.assert_frame_equal(result, expected, check_like=not PY36) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "max_level,expected", @@ -465,6 +462,30 @@ def test_nested_flattening_consistent(self): # They should be the same. tm.assert_frame_equal(df1, df2) + def test_nonetype_record_path(self, nulls_fixture): + # see gh-30148 + # should not raise TypeError + result = json_normalize( + [ + {"state": "Texas", "info": nulls_fixture}, + {"state": "Florida", "info": [{"i": 2}]}, + ], + record_path=["info"], + ) + expected = DataFrame({"i": 2}, index=[0]) + tm.assert_equal(result, expected) + + def test_non_interable_record_path_errors(self): + # see gh-30148 + test_input = {"state": "Texas", "info": 1} + test_path = "info" + msg = ( + f"{test_input} has non iterable value 1 for path {test_path}. " + "Must be iterable or null." + ) + with pytest.raises(TypeError, match=msg): + json_normalize([test_input], record_path=[test_path]) + class TestNestedToRecord: def test_flat_stays_flat(self): @@ -524,7 +545,7 @@ def test_missing_meta(self, missing_metadata): columns = ["city", "number", "state", "street", "zip", "name"] columns = ["number", "street", "city", "state", "zip", "name"] expected = DataFrame(ex_data, columns=columns) - tm.assert_frame_equal(result, expected, check_like=not PY36) + tm.assert_frame_equal(result, expected) def test_donot_drop_nonevalues(self): # GH21356 @@ -700,3 +721,10 @@ def test_with_large_max_level(self): ] output = nested_to_record(input_data, max_level=max_level) assert output == expected + + def test_deprecated_import(self): + with tm.assert_produces_warning(FutureWarning): + from pandas.io.json import json_normalize + + recs = [{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}] + json_normalize(recs) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index eaa46c4e9dc9b..e909a4952948c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -7,12 +7,12 @@ import numpy as np import pytest -from pandas.compat import PY35, is_platform_32bit, is_platform_windows +from pandas.compat import is_platform_32bit, is_platform_windows import pandas.util._test_decorators as td import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, Timestamp, read_json -import pandas.util.testing as tm +import pandas._testing as tm _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() @@ -39,6 +39,7 @@ def assert_json_roundtrip_equal(result, expected, orient): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: @pytest.fixture(scope="function", autouse=True) def setup(self, datapath): @@ -53,7 +54,7 @@ def setup(self, datapath): self.objSeries = tm.makeObjectSeries() self.objSeries.name = "objects" - self.empty_series = Series([], index=[]) + self.empty_series = Series([], index=[], dtype=np.float64) self.empty_frame = 
DataFrame() self.frame = _frame.copy() @@ -105,7 +106,7 @@ def test_frame_non_unique_index(self, orient): @pytest.mark.parametrize("orient", ["index", "columns"]) def test_frame_non_unique_index_raises(self, orient): df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 1], columns=["x", "y"]) - msg = "DataFrame index must be unique for orient='{}'".format(orient) + msg = f"DataFrame index must be unique for orient='{orient}'" with pytest.raises(ValueError, match=msg): df.to_json(orient=orient) @@ -142,7 +143,7 @@ def test_frame_non_unique_columns(self, orient, data): def test_frame_non_unique_columns_raises(self, orient): df = DataFrame([["a", "b"], ["c", "d"]], index=[1, 2], columns=["x", "x"]) - msg = "DataFrame columns must be unique for orient='{}'".format(orient) + msg = f"DataFrame columns must be unique for orient='{orient}'" with pytest.raises(ValueError, match=msg): df.to_json(orient=orient) @@ -160,9 +161,6 @@ def test_roundtrip_simple(self, orient, convert_axes, numpy, dtype): expected = self.frame.copy() - if not numpy and PY35 and orient in ("index", "columns"): - expected = expected.sort_index() - assert_json_roundtrip_equal(result, expected, orient) @pytest.mark.parametrize("dtype", [False, np.int64]) @@ -174,9 +172,6 @@ def test_roundtrip_intframe(self, orient, convert_axes, numpy, dtype): data, orient=orient, convert_axes=convert_axes, numpy=numpy, dtype=dtype ) expected = self.intframe.copy() - if not numpy and PY35 and orient in ("index", "columns"): - expected = expected.sort_index() - if ( numpy and (is_platform_32bit() or is_platform_windows()) @@ -209,9 +204,6 @@ def test_roundtrip_str_axes(self, orient, convert_axes, numpy, dtype): ) expected = df.copy() - if not numpy and PY35 and orient in ("index", "columns"): - expected = expected.sort_index() - if not dtype: expected = expected.astype(np.int64) @@ -234,13 +226,11 @@ def test_roundtrip_str_axes(self, orient, convert_axes, numpy, dtype): def test_roundtrip_categorical(self, orient, convert_axes, numpy): # TODO: create a better frame to test with and improve coverage if orient in ("index", "columns"): - pytest.xfail( - "Can't have duplicate index values for orient '{}')".format(orient) - ) + pytest.xfail(f"Can't have duplicate index values for orient '{orient}')") data = self.categorical.to_json(orient=orient) if numpy and orient in ("records", "values"): - pytest.xfail("Orient {} is broken with numpy=True".format(orient)) + pytest.xfail(f"Orient {orient} is broken with numpy=True") result = pd.read_json( data, orient=orient, convert_axes=convert_axes, numpy=numpy @@ -250,7 +240,7 @@ def test_roundtrip_categorical(self, orient, convert_axes, numpy): expected.index = expected.index.astype(str) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON - if not numpy and (orient == "index" or (PY35 and orient == "columns")): + if not numpy and orient == "index": expected = expected.sort_index() assert_json_roundtrip_equal(result, expected, orient) @@ -317,7 +307,7 @@ def test_roundtrip_mixed(self, orient, convert_axes, numpy): expected = df.copy() expected = expected.assign(**expected.select_dtypes("number").astype(np.int64)) - if not numpy and (orient == "index" or (PY35 and orient == "columns")): + if not numpy and orient == "index": expected = expected.sort_index() assert_json_roundtrip_equal(result, expected, orient) @@ -408,7 +398,7 @@ def test_frame_infinity(self, orient, inf, dtype): def test_frame_to_json_float_precision(self, value, precision, expected_val): df = 
pd.DataFrame([dict(a_float=value)]) encoded = df.to_json(double_precision=precision) - assert encoded == '{{"a_float":{{"0":{}}}}}'.format(expected_val) + assert encoded == f'{{"a_float":{{"0":{expected_val}}}}}' def test_frame_to_json_except(self): df = DataFrame([1, 2, 3]) @@ -594,7 +584,7 @@ def __init__(self, hexed): self.hexed = hexed self.binary = bytes.fromhex(hexed) - def __str__(self): + def __str__(self) -> str: return self.hexed hexed = "574b4454ba8c5eb4f98a8f45" @@ -602,7 +592,7 @@ def __str__(self): # verify the proper conversion of printable content df_printable = DataFrame({"A": [binthing.hexed]}) - assert df_printable.to_json() == '{{"A":{{"0":"{hex}"}}}}'.format(hex=hexed) + assert df_printable.to_json() == f'{{"A":{{"0":"{hexed}"}}}}' # check if non-printable content throws appropriate Exception df_nonprintable = DataFrame({"A": [binthing]}) @@ -616,19 +606,19 @@ def __str__(self): df_mixed.to_json() # default_handler should resolve exceptions for non-string types - assert df_nonprintable.to_json( - default_handler=str - ) == '{{"A":{{"0":"{hex}"}}}}'.format(hex=hexed) - assert df_mixed.to_json( - default_handler=str - ) == '{{"A":{{"0":"{hex}"}},"B":{{"0":1}}}}'.format(hex=hexed) + result = df_nonprintable.to_json(default_handler=str) + expected = f'{{"A":{{"0":"{hexed}"}}}}' + assert result == expected + assert ( + df_mixed.to_json(default_handler=str) + == f'{{"A":{{"0":"{hexed}"}},"B":{{"0":1}}}}' + ) def test_label_overflow(self): # GH14256: buffer length not checked when writing label - df = pd.DataFrame({"bar" * 100000: [1], "foo": [1337]}) - assert df.to_json() == '{{"{bar}":{{"0":1}},"foo":{{"0":1337}}}}'.format( - bar=("bar" * 100000) - ) + result = pd.DataFrame({"bar" * 100000: [1], "foo": [1337]}).to_json() + expected = f'{{"{"bar" * 100000}":{{"0":1}},"foo":{{"0":1337}}}}' + assert result == expected def test_series_non_unique_index(self): s = Series(["a", "b"], index=[1, 1]) @@ -652,8 +642,6 @@ def test_series_roundtrip_simple(self, orient, numpy): result = pd.read_json(data, typ="series", orient=orient, numpy=numpy) expected = self.series.copy() - if not numpy and PY35 and orient in ("index", "columns"): - expected = expected.sort_index() if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -670,8 +658,6 @@ def test_series_roundtrip_object(self, orient, numpy, dtype): ) expected = self.objSeries.copy() - if not numpy and PY35 and orient in ("index", "columns"): - expected = expected.sort_index() if orient in ("values", "records"): expected = expected.reset_index(drop=True) if orient != "split": @@ -686,8 +672,6 @@ def test_series_roundtrip_empty(self, orient, numpy): expected = self.empty_series.copy() # TODO: see what causes inconsistency - if not numpy and PY35 and orient == "index": - expected = expected.sort_index() if orient in ("values", "records"): expected = expected.reset_index(drop=True) else: @@ -871,7 +855,7 @@ def test_date_format_frame(self, date, date_unit): json = df.to_json(date_format="iso") result = read_json(json) expected = df.copy() - # expected.index = expected.index.tz_localize("UTC") + expected.index = expected.index.tz_localize("UTC") expected["date"] = expected["date"].dt.tz_localize("UTC") tm.assert_frame_equal(result, expected) @@ -901,7 +885,7 @@ def test_date_format_series(self, date, date_unit): json = ts.to_json(date_format="iso") result = read_json(json, typ="series") expected = ts.copy() - # expected.index = expected.index.tz_localize("UTC") + expected.index = 
expected.index.tz_localize("UTC") expected = expected.dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -1261,7 +1245,7 @@ def test_to_jsonl(self): # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) - expected = '{"a\\\\":"foo\\\\","b":"bar"}\n' '{"a\\\\":"foo\\"","b":"bar"}' + expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}' assert result == expected tm.assert_frame_equal(pd.read_json(result, lines=True), df) @@ -1446,7 +1430,7 @@ def test_read_timezone_information(self): ) def test_timedelta_as_label(self, date_format, key): df = pd.DataFrame([[1]], columns=[pd.Timedelta("1D")]) - expected = '{{"{key}":{{"0":1}}}}'.format(key=key) + expected = f'{{"{key}":{{"0":1}}}}' result = df.to_json(date_format=date_format) assert result == expected @@ -1475,7 +1459,7 @@ def test_to_json_indent(self, indent): result = df.to_json(indent=indent) spaces = " " * indent - expected = """{{ + expected = f"""{{ {spaces}"a":{{ {spaces}{spaces}"0":"foo", {spaces}{spaces}"1":"baz" @@ -1484,9 +1468,7 @@ def test_to_json_indent(self, indent): {spaces}{spaces}"0":"bar", {spaces}{spaces}"1":"qux" {spaces}}} -}}""".format( - spaces=spaces - ) +}}""" assert result == expected @@ -1611,12 +1593,24 @@ def test_json_indent_all_orients(self, orient, expected): # GH 12004 df = pd.DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["a", "b"]) result = df.to_json(orient=orient, indent=4) - - if PY35: - assert json.loads(result) == json.loads(expected) - else: - assert result == expected + assert result == expected def test_json_negative_indent_raises(self): with pytest.raises(ValueError, match="must be a nonnegative integer"): pd.DataFrame().to_json(indent=-1) + + def test_emca_262_nan_inf_support(self): + # GH 12213 + data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' + result = pd.read_json(data) + expected = pd.DataFrame( + ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] + ) + tm.assert_frame_equal(result, expected) + + def test_deprecate_numpy_argument_read_json(self): + # GH 28512 + expected = DataFrame([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + result = read_json(expected.to_json(), numpy=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 05f97a1769205..e531457627342 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -4,7 +4,7 @@ import pandas as pd from pandas import DataFrame, read_json -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.json._json import JsonReader @@ -56,7 +56,7 @@ def test_to_jsonl(): # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) - expected = '{"a\\\\":"foo\\\\","b":"bar"}\n' '{"a\\\\":"foo\\"","b":"bar"}' + expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}' assert result == expected tm.assert_frame_equal(read_json(result, lines=True), df) @@ -134,10 +134,7 @@ def test_readjson_chunks_closes(chunksize): reader.read() assert ( reader.open_stream.closed - ), "didn't close stream with \ - chunksize = {chunksize}".format( - chunksize=chunksize - ) + ), f"didn't close stream with chunksize = {chunksize}" @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) @@ 
-170,6 +167,15 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): test = pd.read_json(j, lines=True, chunksize=chunksize) if chunksize is not None: test = pd.concat(test) - tm.assert_frame_equal( - orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize) - ) + tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") + + +def test_readjson_unicode(monkeypatch): + with tm.ensure_clean("test.json") as path: + monkeypatch.setattr("_bootlocale.getpreferredencoding", lambda l: "cp949") + with open(path, "w", encoding="utf-8") as f: + f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') + + result = read_json(path) + expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 20e2690084e2a..bedd60084124c 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -17,7 +17,7 @@ import pandas.compat as compat from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm def _clean_dict(d): @@ -111,9 +111,9 @@ def test_encode_decimal(self): @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): string_input = "A string \\ / \b \f \n \r \t &" - not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n ' '\\r \\t <\\/script> &"' + not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n \\r \\t <\\/script> &"' html_encoded = ( - '"A string \\\\ \\/ \\b \\f \\n \\r \\t ' '\\u003c\\/script\\u003e \\u0026"' + '"A string \\\\ \\/ \\b \\f \\n \\r \\t \\u003c\\/script\\u003e \\u0026"' ) def helper(expected_output, **encode_kwargs): @@ -362,21 +362,21 @@ def test_encode_date_conversion(self): ) def test_encode_time_conversion_basic(self, test): output = ujson.encode(test) - expected = '"{iso}"'.format(iso=test.isoformat()) + expected = f'"{test.isoformat()}"' assert expected == output def test_encode_time_conversion_pytz(self): # see gh-11473: to_json segfaults with timezone-aware datetimes test = datetime.time(10, 12, 15, 343243, pytz.utc) output = ujson.encode(test) - expected = '"{iso}"'.format(iso=test.isoformat()) + expected = f'"{test.isoformat()}"' assert expected == output def test_encode_time_conversion_dateutil(self): # see gh-11473: to_json segfaults with timezone-aware datetimes test = datetime.time(10, 12, 15, 343243, dateutil.tz.tzutc()) output = ujson.encode(test) - expected = '"{iso}"'.format(iso=test.isoformat()) + expected = f'"{test.isoformat()}"' assert expected == output @pytest.mark.parametrize( @@ -559,11 +559,6 @@ def test_loads_non_str_bytes_raises(self): with pytest.raises(TypeError, match=msg): ujson.loads(None) - def test_version(self): - assert re.match( - r"^\d+\.\d+(\.\d+)?$", ujson.__version__ - ), "ujson.__version__ must be a string like '1.4.0'" - def test_encode_numeric_overflow(self): with pytest.raises(OverflowError): ujson.encode(12839128391289382193812939) @@ -580,7 +575,7 @@ class Nested: def test_decode_number_with_32bit_sign_bit(self, val): # Test that numbers that fit within 32 bits but would have the # sign bit set (2**31 <= x < 2**32) are decoded properly. 
- doc = '{{"id": {val}}}'.format(val=val) + doc = f'{{"id": {val}}}' assert ujson.decode(doc)["id"] == val def test_encode_big_escape(self): @@ -621,7 +616,7 @@ def __init__(self, val): def recursive_attr(self): return _TestObject("recursive_attr") - def __str__(self): + def __str__(self) -> str: return str(self.val) msg = "Maximum recursion level reached" @@ -761,8 +756,9 @@ def test_array_list(self): ["a", "b"], {"key": "val"}, ] - arr = np.array(arr_list) - tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + arr = np.array(arr_list, dtype=object) + result = np.array(ujson.decode(ujson.encode(arr)), dtype=object) + tm.assert_numpy_array_equal(result, arr) def test_array_float(self): dtype = np.float32 @@ -815,7 +811,7 @@ def test_array_numpy_labelled(self): # see gh-10837: write out the dump explicitly # so there is no dependency on iteration order - input_dumps = '[{"a": 42, "b":31}, {"a": 24, "c": 99}, ' '{"a": 2.4, "b": 78}]' + input_dumps = '[{"a": 42, "b":31}, {"a": 24, "c": 99}, {"a": 2.4, "b": 78}]' output = ujson.loads(input_dumps, numpy=True, labelled=True) expected_vals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) assert (expected_vals == output[0]).all() diff --git a/pandas/tests/io/msgpack/common.py b/pandas/tests/io/msgpack/common.py deleted file mode 100644 index 60c1c0db18de8..0000000000000 --- a/pandas/tests/io/msgpack/common.py +++ /dev/null @@ -1,2 +0,0 @@ -frombytes = lambda obj, data: obj.frombytes(data) -tobytes = lambda obj: obj.tobytes() diff --git a/pandas/tests/io/msgpack/data/frame.mp b/pandas/tests/io/msgpack/data/frame.mp deleted file mode 100644 index 21e20d262b26c..0000000000000 Binary files a/pandas/tests/io/msgpack/data/frame.mp and /dev/null differ diff --git a/pandas/tests/io/msgpack/test_buffer.py b/pandas/tests/io/msgpack/test_buffer.py deleted file mode 100644 index fe1f4e73eba24..0000000000000 --- a/pandas/tests/io/msgpack/test_buffer.py +++ /dev/null @@ -1,22 +0,0 @@ -# coding: utf-8 - -from pandas.io.msgpack import packb, unpackb - -from .common import frombytes - - -def test_unpack_buffer(): - from array import array - - buf = array("b") - frombytes(buf, packb((b"foo", b"bar"))) - obj = unpackb(buf, use_list=1) - assert [b"foo", b"bar"] == obj - - -def test_unpack_bytearray(): - buf = bytearray(packb(("foo", "bar"))) - obj = unpackb(buf, use_list=1) - assert [b"foo", b"bar"] == obj - expected_type = bytes - assert all(type(s) == expected_type for s in obj) diff --git a/pandas/tests/io/msgpack/test_case.py b/pandas/tests/io/msgpack/test_case.py deleted file mode 100644 index a868da69d5459..0000000000000 --- a/pandas/tests/io/msgpack/test_case.py +++ /dev/null @@ -1,151 +0,0 @@ -# coding: utf-8 - -from pandas.io.msgpack import packb, unpackb - - -def check(length, obj): - v = packb(obj) - assert ( - len(v) == length - ), "{obj!r} length should be {length!r} but get {got:!r}".format( - obj=obj, length=length, got=len(v) - ) - assert unpackb(v, use_list=0) == obj - - -def test_1(): - for o in [ - None, - True, - False, - 0, - 1, - (1 << 6), - (1 << 7) - 1, - -1, - -((1 << 5) - 1), - -(1 << 5), - ]: - check(1, o) - - -def test_2(): - for o in [1 << 7, (1 << 8) - 1, -((1 << 5) + 1), -(1 << 7)]: - check(2, o) - - -def test_3(): - for o in [1 << 8, (1 << 16) - 1, -((1 << 7) + 1), -(1 << 15)]: - check(3, o) - - -def test_5(): - for o in [1 << 16, (1 << 32) - 1, -((1 << 15) + 1), -(1 << 31)]: - check(5, o) - - -def test_9(): - for o in [ - 1 << 32, - (1 << 64) - 1, - -((1 << 31) + 1), - -(1 << 63), - 1.0, - 0.1, 
- -0.1, - -1.0, - ]: - check(9, o) - - -def check_raw(overhead, num): - check(num + overhead, b" " * num) - - -def test_fixraw(): - check_raw(1, 0) - check_raw(1, (1 << 5) - 1) - - -def test_raw16(): - check_raw(3, 1 << 5) - check_raw(3, (1 << 16) - 1) - - -def test_raw32(): - check_raw(5, 1 << 16) - - -def check_array(overhead, num): - check(num + overhead, (None,) * num) - - -def test_fixarray(): - check_array(1, 0) - check_array(1, (1 << 4) - 1) - - -def test_array16(): - check_array(3, 1 << 4) - check_array(3, (1 << 16) - 1) - - -def test_array32(): - check_array(5, (1 << 16)) - - -def match(obj, buf): - assert packb(obj) == buf - assert unpackb(buf, use_list=0) == obj - - -def test_match(): - cases = [ - (None, b"\xc0"), - (False, b"\xc2"), - (True, b"\xc3"), - (0, b"\x00"), - (127, b"\x7f"), - (128, b"\xcc\x80"), - (256, b"\xcd\x01\x00"), - (-1, b"\xff"), - (-33, b"\xd0\xdf"), - (-129, b"\xd1\xff\x7f"), - ({1: 1}, b"\x81\x01\x01"), - (1.0, b"\xcb\x3f\xf0\x00\x00\x00\x00\x00\x00"), - ((), b"\x90"), - ( - tuple(range(15)), - (b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" b"\x0a\x0b\x0c\x0d\x0e"), - ), - ( - tuple(range(16)), - ( - b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07" - b"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" - ), - ), - ({}, b"\x80"), - ( - {x: x for x in range(15)}, - ( - b"\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07" - b"\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e" - ), - ), - ( - {x: x for x in range(16)}, - ( - b"\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06" - b"\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e" - b"\x0f\x0f" - ), - ), - ] - - for v, p in cases: - match(v, p) - - -def test_unicode(): - assert unpackb(packb("foobar"), use_list=1) == b"foobar" diff --git a/pandas/tests/io/msgpack/test_except.py b/pandas/tests/io/msgpack/test_except.py deleted file mode 100644 index 60c1dcca162a9..0000000000000 --- a/pandas/tests/io/msgpack/test_except.py +++ /dev/null @@ -1,38 +0,0 @@ -# coding: utf-8 - -from datetime import datetime - -import pytest - -from pandas.io.msgpack import packb, unpackb - - -class DummyException(Exception): - pass - - -class TestExceptions: - def test_raise_on_find_unsupported_value(self): - msg = "can't serialize datetime" - with pytest.raises(TypeError, match=msg): - packb(datetime.now()) - - def test_raise_from_object_hook(self): - def hook(_): - raise DummyException() - - with pytest.raises(DummyException): - unpackb(packb({}), object_hook=hook) - with pytest.raises(DummyException): - unpackb(packb({"fizz": "buzz"}), object_hook=hook) - with pytest.raises(DummyException): - unpackb(packb({"fizz": "buzz"}), object_pairs_hook=hook) - with pytest.raises(DummyException): - unpackb(packb({"fizz": {"buzz": "spam"}}), object_hook=hook) - with pytest.raises(DummyException): - unpackb(packb({"fizz": {"buzz": "spam"}}), object_pairs_hook=hook) - - def test_invalid_value(self): - msg = "Unpack failed: error" - with pytest.raises(ValueError, match=msg): - unpackb(b"\xd9\x97#DL_") diff --git a/pandas/tests/io/msgpack/test_extension.py b/pandas/tests/io/msgpack/test_extension.py deleted file mode 100644 index 85ed43fa01079..0000000000000 --- a/pandas/tests/io/msgpack/test_extension.py +++ /dev/null @@ -1,63 +0,0 @@ -import array - -import pandas.io.msgpack as msgpack -from pandas.io.msgpack import ExtType - -from .common import frombytes, tobytes - - -def test_pack_ext_type(): - def p(s): - packer = msgpack.Packer() - packer.pack_ext_type(0x42, s) - return packer.bytes() - - assert p(b"A") == b"\xd4\x42A" 
# fixext 1 - assert p(b"AB") == b"\xd5\x42AB" # fixext 2 - assert p(b"ABCD") == b"\xd6\x42ABCD" # fixext 4 - assert p(b"ABCDEFGH") == b"\xd7\x42ABCDEFGH" # fixext 8 - assert p(b"A" * 16) == b"\xd8\x42" + b"A" * 16 # fixext 16 - assert p(b"ABC") == b"\xc7\x03\x42ABC" # ext 8 - assert p(b"A" * 0x0123) == b"\xc8\x01\x23\x42" + b"A" * 0x0123 # ext 16 - assert ( - p(b"A" * 0x00012345) == b"\xc9\x00\x01\x23\x45\x42" + b"A" * 0x00012345 - ) # ext 32 - - -def test_unpack_ext_type(): - def check(b, expected): - assert msgpack.unpackb(b) == expected - - check(b"\xd4\x42A", ExtType(0x42, b"A")) # fixext 1 - check(b"\xd5\x42AB", ExtType(0x42, b"AB")) # fixext 2 - check(b"\xd6\x42ABCD", ExtType(0x42, b"ABCD")) # fixext 4 - check(b"\xd7\x42ABCDEFGH", ExtType(0x42, b"ABCDEFGH")) # fixext 8 - check(b"\xd8\x42" + b"A" * 16, ExtType(0x42, b"A" * 16)) # fixext 16 - check(b"\xc7\x03\x42ABC", ExtType(0x42, b"ABC")) # ext 8 - check(b"\xc8\x01\x23\x42" + b"A" * 0x0123, ExtType(0x42, b"A" * 0x0123)) # ext 16 - check( - b"\xc9\x00\x01\x23\x45\x42" + b"A" * 0x00012345, - ExtType(0x42, b"A" * 0x00012345), - ) # ext 32 - - -def test_extension_type(): - def default(obj): - print("default called", obj) - if isinstance(obj, array.array): - typecode = 123 # application specific typecode - data = tobytes(obj) - return ExtType(typecode, data) - raise TypeError("Unknown type object {obj!r}".format(obj)) - - def ext_hook(code, data): - print("ext_hook called", code, data) - assert code == 123 - obj = array.array("d") - frombytes(obj, data) - return obj - - obj = [42, b"hello", array.array("d", [1.1, 2.2, 3.3])] - s = msgpack.packb(obj, default=default) - obj2 = msgpack.unpackb(s, ext_hook=ext_hook) - assert obj == obj2 diff --git a/pandas/tests/io/msgpack/test_format.py b/pandas/tests/io/msgpack/test_format.py deleted file mode 100644 index 46d0116bc3926..0000000000000 --- a/pandas/tests/io/msgpack/test_format.py +++ /dev/null @@ -1,84 +0,0 @@ -# coding: utf-8 - -from pandas.io.msgpack import unpackb - - -def check(src, should, use_list=0): - assert unpackb(src, use_list=use_list) == should - - -def testSimpleValue(): - check(b"\x93\xc0\xc2\xc3", (None, False, True)) - - -def testFixnum(): - check(b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff", ((0, 64, 127), (-32, -16, -1))) - - -def testFixArray(): - check(b"\x92\x90\x91\x91\xc0", ((), ((None,),))) - - -def testFixRaw(): - check(b"\x94\xa0\xa1a\xa2bc\xa3def", (b"", b"a", b"bc", b"def")) - - -def testFixMap(): - check( - b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80", {False: {None: None}, True: {None: {}}} - ) - - -def testUnsignedInt(): - check( - b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00" - b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00" - b"\xce\xff\xff\xff\xff", - (0, 128, 255, 0, 32768, 65535, 0, 2147483648, 4294967295), - ) - - -def testSignedInt(): - check( - b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00" - b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00" - b"\xd2\xff\xff\xff\xff", - (0, -128, -1, 0, -32768, -1, 0, -2147483648, -1), - ) - - -def testRaw(): - check( - b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00" - b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab", - (b"", b"a", b"ab", b"", b"a", b"ab"), - ) - - -def testArray(): - check( - b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00" - b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02" - b"\xc2\xc3", - ((), (None,), (False, True), (), (None,), (False, True)), - ) - - -def testMap(): - check( - b"\x96" - b"\xde\x00\x00" - b"\xde\x00\x01\xc0\xc2" - 
b"\xde\x00\x02\xc0\xc2\xc3\xc2" - b"\xdf\x00\x00\x00\x00" - b"\xdf\x00\x00\x00\x01\xc0\xc2" - b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2", - ( - {}, - {None: False}, - {True: False, None: False}, - {}, - {None: False}, - {True: False, None: False}, - ), - ) diff --git a/pandas/tests/io/msgpack/test_limits.py b/pandas/tests/io/msgpack/test_limits.py deleted file mode 100644 index 4c0697f8faf64..0000000000000 --- a/pandas/tests/io/msgpack/test_limits.py +++ /dev/null @@ -1,107 +0,0 @@ -# coding: utf-8 -import pytest - -from pandas.io.msgpack import ExtType, Packer, Unpacker, packb, unpackb - - -class TestLimits: - def test_integer(self): - x = -(2 ** 63) - assert unpackb(packb(x)) == x - msg = ( - r"((long |Python )?(int )?too (big|large) to convert" - r"( to C (unsigned )?long))?" - ) - with pytest.raises((OverflowError, ValueError), match=msg): - packb(x - 1) - x = 2 ** 64 - 1 - assert unpackb(packb(x)) == x - with pytest.raises((OverflowError, ValueError), match=msg): - packb(x + 1) - - def test_array_header(self): - packer = Packer() - packer.pack_array_header(2 ** 32 - 1) - with pytest.raises((OverflowError, ValueError)): - packer.pack_array_header(2 ** 32) - - def test_map_header(self): - packer = Packer() - packer.pack_map_header(2 ** 32 - 1) - with pytest.raises((OverflowError, ValueError)): - packer.pack_array_header(2 ** 32) - - def test_max_str_len(self): - d = "x" * 3 - packed = packb(d) - - unpacker = Unpacker(max_str_len=3, encoding="utf-8") - unpacker.feed(packed) - assert unpacker.unpack() == d - - unpacker = Unpacker(max_str_len=2, encoding="utf-8") - unpacker.feed(packed) - - msg = "3 exceeds max_str_len" - with pytest.raises(ValueError, match=msg): - unpacker.unpack() - - def test_max_bin_len(self): - d = b"x" * 3 - packed = packb(d, use_bin_type=True) - - unpacker = Unpacker(max_bin_len=3) - unpacker.feed(packed) - assert unpacker.unpack() == d - - unpacker = Unpacker(max_bin_len=2) - unpacker.feed(packed) - - msg = "3 exceeds max_bin_len" - with pytest.raises(ValueError, match=msg): - unpacker.unpack() - - def test_max_array_len(self): - d = [1, 2, 3] - packed = packb(d) - - unpacker = Unpacker(max_array_len=3) - unpacker.feed(packed) - assert unpacker.unpack() == d - - unpacker = Unpacker(max_array_len=2) - unpacker.feed(packed) - - msg = "3 exceeds max_array_len" - with pytest.raises(ValueError, match=msg): - unpacker.unpack() - - def test_max_map_len(self): - d = {1: 2, 3: 4, 5: 6} - packed = packb(d) - - unpacker = Unpacker(max_map_len=3) - unpacker.feed(packed) - assert unpacker.unpack() == d - - unpacker = Unpacker(max_map_len=2) - unpacker.feed(packed) - - msg = "3 exceeds max_map_len" - with pytest.raises(ValueError, match=msg): - unpacker.unpack() - - def test_max_ext_len(self): - d = ExtType(42, b"abc") - packed = packb(d) - - unpacker = Unpacker(max_ext_len=3) - unpacker.feed(packed) - assert unpacker.unpack() == d - - unpacker = Unpacker(max_ext_len=2) - unpacker.feed(packed) - - msg = "4 exceeds max_ext_len" - with pytest.raises(ValueError, match=msg): - unpacker.unpack() diff --git a/pandas/tests/io/msgpack/test_newspec.py b/pandas/tests/io/msgpack/test_newspec.py deleted file mode 100644 index a1cf966b9d253..0000000000000 --- a/pandas/tests/io/msgpack/test_newspec.py +++ /dev/null @@ -1,90 +0,0 @@ -# coding: utf-8 - -from pandas.io.msgpack import ExtType, packb, unpackb - - -def test_str8(): - header = b"\xd9" - data = b"x" * 32 - b = packb(data.decode(), use_bin_type=True) - assert len(b) == len(data) + 2 - assert b[0:2] == header + b"\x20" - assert b[2:] == 
data - assert unpackb(b) == data - - data = b"x" * 255 - b = packb(data.decode(), use_bin_type=True) - assert len(b) == len(data) + 2 - assert b[0:2] == header + b"\xff" - assert b[2:] == data - assert unpackb(b) == data - - -def test_bin8(): - header = b"\xc4" - data = b"" - b = packb(data, use_bin_type=True) - assert len(b) == len(data) + 2 - assert b[0:2] == header + b"\x00" - assert b[2:] == data - assert unpackb(b) == data - - data = b"x" * 255 - b = packb(data, use_bin_type=True) - assert len(b) == len(data) + 2 - assert b[0:2] == header + b"\xff" - assert b[2:] == data - assert unpackb(b) == data - - -def test_bin16(): - header = b"\xc5" - data = b"x" * 256 - b = packb(data, use_bin_type=True) - assert len(b) == len(data) + 3 - assert b[0:1] == header - assert b[1:3] == b"\x01\x00" - assert b[3:] == data - assert unpackb(b) == data - - data = b"x" * 65535 - b = packb(data, use_bin_type=True) - assert len(b) == len(data) + 3 - assert b[0:1] == header - assert b[1:3] == b"\xff\xff" - assert b[3:] == data - assert unpackb(b) == data - - -def test_bin32(): - header = b"\xc6" - data = b"x" * 65536 - b = packb(data, use_bin_type=True) - assert len(b) == len(data) + 5 - assert b[0:1] == header - assert b[1:5] == b"\x00\x01\x00\x00" - assert b[5:] == data - assert unpackb(b) == data - - -def test_ext(): - def check(ext, packed): - assert packb(ext) == packed - assert unpackb(packed) == ext - - check(ExtType(0x42, b"Z"), b"\xd4\x42Z") # fixext 1 - check(ExtType(0x42, b"ZZ"), b"\xd5\x42ZZ") # fixext 2 - check(ExtType(0x42, b"Z" * 4), b"\xd6\x42" + b"Z" * 4) # fixext 4 - check(ExtType(0x42, b"Z" * 8), b"\xd7\x42" + b"Z" * 8) # fixext 8 - check(ExtType(0x42, b"Z" * 16), b"\xd8\x42" + b"Z" * 16) # fixext 16 - # ext 8 - check(ExtType(0x42, b""), b"\xc7\x00\x42") - check(ExtType(0x42, b"Z" * 255), b"\xc7\xff\x42" + b"Z" * 255) - # ext 16 - check(ExtType(0x42, b"Z" * 256), b"\xc8\x01\x00\x42" + b"Z" * 256) - check(ExtType(0x42, b"Z" * 0xFFFF), b"\xc8\xff\xff\x42" + b"Z" * 0xFFFF) - # ext 32 - check(ExtType(0x42, b"Z" * 0x10000), b"\xc9\x00\x01\x00\x00\x42" + b"Z" * 0x10000) - # needs large memory - # check(ExtType(0x42, b'Z'*0xffffffff), - # b'\xc9\xff\xff\xff\xff\x42' + b'Z'*0xffffffff) diff --git a/pandas/tests/io/msgpack/test_obj.py b/pandas/tests/io/msgpack/test_obj.py deleted file mode 100644 index 03d8807c0922c..0000000000000 --- a/pandas/tests/io/msgpack/test_obj.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding: utf-8 - -import pytest - -from pandas.io.msgpack import packb, unpackb - - -class DecodeError(Exception): - pass - - -class TestObj: - def _arr_to_str(self, arr): - return "".join(str(c) for c in arr) - - def bad_complex_decoder(self, o): - raise DecodeError("Ooops!") - - def _decode_complex(self, obj): - if b"__complex__" in obj: - return complex(obj[b"real"], obj[b"imag"]) - return obj - - def _encode_complex(self, obj): - if isinstance(obj, complex): - return {b"__complex__": True, b"real": 1, b"imag": 2} - return obj - - def test_encode_hook(self): - packed = packb([3, 1 + 2j], default=self._encode_complex) - unpacked = unpackb(packed, use_list=1) - assert unpacked[1] == {b"__complex__": True, b"real": 1, b"imag": 2} - - def test_decode_hook(self): - packed = packb([3, {b"__complex__": True, b"real": 1, b"imag": 2}]) - unpacked = unpackb(packed, object_hook=self._decode_complex, use_list=1) - assert unpacked[1] == 1 + 2j - - def test_decode_pairs_hook(self): - packed = packb([3, {1: 2, 3: 4}]) - prod_sum = 1 * 2 + 3 * 4 - unpacked = unpackb( - packed, object_pairs_hook=lambda l: sum(k 
* v for k, v in l), use_list=1 - ) - assert unpacked[1] == prod_sum - - def test_only_one_obj_hook(self): - msg = "object_pairs_hook and object_hook are mutually exclusive" - with pytest.raises(TypeError, match=msg): - unpackb(b"", object_hook=lambda x: x, object_pairs_hook=lambda x: x) - - def test_bad_hook(self): - msg = r"can't serialize \(1\+2j\)" - with pytest.raises(TypeError, match=msg): - packed = packb([3, 1 + 2j], default=lambda o: o) - unpacked = unpackb(packed, use_list=1) # noqa - - def test_array_hook(self): - packed = packb([1, 2, 3]) - unpacked = unpackb(packed, list_hook=self._arr_to_str, use_list=1) - assert unpacked == "123" - - def test_an_exception_in_objecthook1(self): - with pytest.raises(DecodeError, match="Ooops!"): - packed = packb({1: {"__complex__": True, "real": 1, "imag": 2}}) - unpackb(packed, object_hook=self.bad_complex_decoder) - - def test_an_exception_in_objecthook2(self): - with pytest.raises(DecodeError, match="Ooops!"): - packed = packb({1: [{"__complex__": True, "real": 1, "imag": 2}]}) - unpackb(packed, list_hook=self.bad_complex_decoder, use_list=1) diff --git a/pandas/tests/io/msgpack/test_pack.py b/pandas/tests/io/msgpack/test_pack.py deleted file mode 100644 index 5fc24027589cb..0000000000000 --- a/pandas/tests/io/msgpack/test_pack.py +++ /dev/null @@ -1,171 +0,0 @@ -# coding: utf-8 -from collections import OrderedDict -from io import BytesIO -import struct - -import pytest - -from pandas.io.msgpack import Packer, Unpacker, packb, unpackb - - -class TestPack: - def check(self, data, use_list=False): - re = unpackb(packb(data), use_list=use_list) - assert re == data - - def testPack(self): - test_data = [ - 0, - 1, - 127, - 128, - 255, - 256, - 65535, - 65536, - -1, - -32, - -33, - -128, - -129, - -32768, - -32769, - 1.0, - b"", - b"a", - b"a" * 31, - b"a" * 32, - None, - True, - False, - (), - ((),), - ((), None), - {None: 0}, - (1 << 23), - ] - for td in test_data: - self.check(td) - - def testPackUnicode(self): - test_data = ["", "abcd", ["defgh"], "Русский текст"] - for td in test_data: - re = unpackb(packb(td, encoding="utf-8"), use_list=1, encoding="utf-8") - assert re == td - packer = Packer(encoding="utf-8") - data = packer.pack(td) - re = Unpacker(BytesIO(data), encoding="utf-8", use_list=1).unpack() - assert re == td - - def testPackUTF32(self): - test_data = ["", "abcd", ["defgh"], "Русский текст"] - for td in test_data: - re = unpackb(packb(td, encoding="utf-32"), use_list=1, encoding="utf-32") - assert re == td - - def testPackBytes(self): - test_data = [b"", b"abcd", (b"defgh",)] - for td in test_data: - self.check(td) - - def testIgnoreUnicodeErrors(self): - re = unpackb( - packb(b"abc\xeddef"), encoding="utf-8", unicode_errors="ignore", use_list=1 - ) - assert re == "abcdef" - - def testStrictUnicodeUnpack(self): - msg = ( - r"'utf-*8' codec can't decode byte 0xed in position 3:" - " invalid continuation byte" - ) - with pytest.raises(UnicodeDecodeError, match=msg): - unpackb(packb(b"abc\xeddef"), encoding="utf-8", use_list=1) - - def testStrictUnicodePack(self): - msg = ( - r"'ascii' codec can't encode character '\\xed' in position 3:" - r" ordinal not in range\(128\)" - ) - with pytest.raises(UnicodeEncodeError, match=msg): - packb("abc\xeddef", encoding="ascii", unicode_errors="strict") - - def testIgnoreErrorsPack(self): - re = unpackb( - packb("abcФФФdef", encoding="ascii", unicode_errors="ignore"), - encoding="utf-8", - use_list=1, - ) - assert re == "abcdef" - - def testNoEncoding(self): - msg = "Can't encode unicode 
string: no encoding is specified" - with pytest.raises(TypeError, match=msg): - packb("abc", encoding=None) - - def testDecodeBinary(self): - re = unpackb(packb("abc"), encoding=None, use_list=1) - assert re == b"abc" - - def testPackFloat(self): - assert packb(1.0, use_single_float=True) == b"\xca" + struct.pack(">f", 1.0) - assert packb(1.0, use_single_float=False) == b"\xcb" + struct.pack(">d", 1.0) - - def testArraySize(self, sizes=[0, 5, 50, 1000]): - bio = BytesIO() - packer = Packer() - for size in sizes: - bio.write(packer.pack_array_header(size)) - for i in range(size): - bio.write(packer.pack(i)) - - bio.seek(0) - unpacker = Unpacker(bio, use_list=1) - for size in sizes: - assert unpacker.unpack() == list(range(size)) - - def test_manualreset(self, sizes=[0, 5, 50, 1000]): - packer = Packer(autoreset=False) - for size in sizes: - packer.pack_array_header(size) - for i in range(size): - packer.pack(i) - - bio = BytesIO(packer.bytes()) - unpacker = Unpacker(bio, use_list=1) - for size in sizes: - assert unpacker.unpack() == list(range(size)) - - packer.reset() - assert packer.bytes() == b"" - - def testMapSize(self, sizes=[0, 5, 50, 1000]): - bio = BytesIO() - packer = Packer() - for size in sizes: - bio.write(packer.pack_map_header(size)) - for i in range(size): - bio.write(packer.pack(i)) # key - bio.write(packer.pack(i * 2)) # value - - bio.seek(0) - unpacker = Unpacker(bio) - for size in sizes: - assert unpacker.unpack() == {i: i * 2 for i in range(size)} - - def test_odict(self): - seq = [(b"one", 1), (b"two", 2), (b"three", 3), (b"four", 4)] - od = OrderedDict(seq) - assert unpackb(packb(od), use_list=1) == dict(seq) - - def pair_hook(seq): - return list(seq) - - assert unpackb(packb(od), object_pairs_hook=pair_hook, use_list=1) == seq - - def test_pairlist(self): - pairlist = [(b"a", 1), (2, b"b"), (b"foo", b"bar")] - packer = Packer() - packed = packer.pack_map_pairs(pairlist) - unpacked = unpackb(packed, object_pairs_hook=list) - assert pairlist == unpacked diff --git a/pandas/tests/io/msgpack/test_read_size.py b/pandas/tests/io/msgpack/test_read_size.py deleted file mode 100644 index 7d2b539f12085..0000000000000 --- a/pandas/tests/io/msgpack/test_read_size.py +++ /dev/null @@ -1,71 +0,0 @@ -"""Test Unpacker's read_array_header and read_map_header methods""" -from pandas.io.msgpack import OutOfData, Unpacker, packb - -UnexpectedTypeException = ValueError - - -def test_read_array_header(): - unpacker = Unpacker() - unpacker.feed(packb(["a", "b", "c"])) - assert unpacker.read_array_header() == 3 - assert unpacker.unpack() == b"a" - assert unpacker.unpack() == b"b" - assert unpacker.unpack() == b"c" - try: - unpacker.unpack() - assert 0, "should raise exception" - except OutOfData: - assert 1, "okay" - - -def test_read_map_header(): - unpacker = Unpacker() - unpacker.feed(packb({"a": "A"})) - assert unpacker.read_map_header() == 1 - assert unpacker.unpack() == b"a" - assert unpacker.unpack() == b"A" - try: - unpacker.unpack() - assert 0, "should raise exception" - except OutOfData: - assert 1, "okay" - - -def test_incorrect_type_array(): - unpacker = Unpacker() - unpacker.feed(packb(1)) - try: - unpacker.read_array_header() - assert 0, "should raise exception" - except UnexpectedTypeException: - assert 1, "okay" - - -def test_incorrect_type_map(): - unpacker = Unpacker() - unpacker.feed(packb(1)) - try: - unpacker.read_map_header() - assert 0, "should raise exception" - except UnexpectedTypeException: - assert 1, "okay" - - -def test_correct_type_nested_array(): - unpacker = 
Unpacker() - unpacker.feed(packb({"a": ["b", "c", "d"]})) - try: - unpacker.read_array_header() - assert 0, "should raise exception" - except UnexpectedTypeException: - assert 1, "okay" - - -def test_incorrect_type_nested_map(): - unpacker = Unpacker() - unpacker.feed(packb([{"a": "b"}])) - try: - unpacker.read_map_header() - assert 0, "should raise exception" - except UnexpectedTypeException: - assert 1, "okay" diff --git a/pandas/tests/io/msgpack/test_seq.py b/pandas/tests/io/msgpack/test_seq.py deleted file mode 100644 index c4ac13980bc67..0000000000000 --- a/pandas/tests/io/msgpack/test_seq.py +++ /dev/null @@ -1,47 +0,0 @@ -# coding: utf-8 - -import io - -import pandas.io.msgpack as msgpack - -binarydata = bytes(bytearray(range(256))) - - -def gen_binary_data(idx): - return binarydata[: idx % 300] - - -def test_exceeding_unpacker_read_size(): - dumpf = io.BytesIO() - - packer = msgpack.Packer() - - NUMBER_OF_STRINGS = 6 - read_size = 16 - - # 5 ok for read_size=16, while 6 glibc detected *** python: double free or - # corruption (fasttop): - - # 20 ok for read_size=256, while 25 segfaults / glibc detected *** python: - # double free or corruption (!prev) - - # 40 ok for read_size=1024, while 50 introduces errors - # 7000 ok for read_size=1024*1024, while 8000 leads to glibc detected *** - # python: double free or corruption (!prev): - - for idx in range(NUMBER_OF_STRINGS): - data = gen_binary_data(idx) - dumpf.write(packer.pack(data)) - - f = io.BytesIO(dumpf.getvalue()) - dumpf.close() - - unpacker = msgpack.Unpacker(f, read_size=read_size, use_list=1) - - read_count = 0 - for idx, o in enumerate(unpacker): - assert type(o) == bytes - assert o == gen_binary_data(idx) - read_count += 1 - - assert read_count == NUMBER_OF_STRINGS diff --git a/pandas/tests/io/msgpack/test_sequnpack.py b/pandas/tests/io/msgpack/test_sequnpack.py deleted file mode 100644 index 79feb78b3b013..0000000000000 --- a/pandas/tests/io/msgpack/test_sequnpack.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding: utf-8 -from io import BytesIO - -import pytest - -from pandas.io.msgpack import BufferFull, OutOfData, Unpacker - - -class TestPack: - def test_partial_data(self): - unpacker = Unpacker() - msg = "No more data to unpack" - - for data in [b"\xa5", b"h", b"a", b"l", b"l"]: - unpacker.feed(data) - with pytest.raises(StopIteration, match=msg): - next(iter(unpacker)) - - unpacker.feed(b"o") - assert next(iter(unpacker)) == b"hallo" - - def test_foobar(self): - unpacker = Unpacker(read_size=3, use_list=1) - unpacker.feed(b"foobar") - assert unpacker.unpack() == ord(b"f") - assert unpacker.unpack() == ord(b"o") - assert unpacker.unpack() == ord(b"o") - assert unpacker.unpack() == ord(b"b") - assert unpacker.unpack() == ord(b"a") - assert unpacker.unpack() == ord(b"r") - msg = "No more data to unpack" - with pytest.raises(OutOfData, match=msg): - unpacker.unpack() - - unpacker.feed(b"foo") - unpacker.feed(b"bar") - - k = 0 - for o, e in zip(unpacker, "foobarbaz"): - assert o == ord(e) - k += 1 - assert k == len(b"foobar") - - def test_foobar_skip(self): - unpacker = Unpacker(read_size=3, use_list=1) - unpacker.feed(b"foobar") - assert unpacker.unpack() == ord(b"f") - unpacker.skip() - assert unpacker.unpack() == ord(b"o") - unpacker.skip() - assert unpacker.unpack() == ord(b"a") - unpacker.skip() - msg = "No more data to unpack" - with pytest.raises(OutOfData, match=msg): - unpacker.unpack() - - def test_maxbuffersize_read_size_exceeds_max_buffer_size(self): - msg = "read_size should be less or equal to max_buffer_size" - 
with pytest.raises(ValueError, match=msg): - Unpacker(read_size=5, max_buffer_size=3) - - def test_maxbuffersize_bufferfull(self): - unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1) - unpacker.feed(b"foo") - with pytest.raises(BufferFull, match=r"^$"): - unpacker.feed(b"b") - - def test_maxbuffersize(self): - unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1) - unpacker.feed(b"foo") - assert ord("f") == next(unpacker) - unpacker.feed(b"b") - assert ord("o") == next(unpacker) - assert ord("o") == next(unpacker) - assert ord("b") == next(unpacker) - - def test_readbytes(self): - unpacker = Unpacker(read_size=3) - unpacker.feed(b"foobar") - assert unpacker.unpack() == ord(b"f") - assert unpacker.read_bytes(3) == b"oob" - assert unpacker.unpack() == ord(b"a") - assert unpacker.unpack() == ord(b"r") - - # Test buffer refill - unpacker = Unpacker(BytesIO(b"foobar"), read_size=3) - assert unpacker.unpack() == ord(b"f") - assert unpacker.read_bytes(3) == b"oob" - assert unpacker.unpack() == ord(b"a") - assert unpacker.unpack() == ord(b"r") - - def test_issue124(self): - unpacker = Unpacker() - unpacker.feed(b"\xa1?\xa1!") - assert tuple(unpacker) == (b"?", b"!") - assert tuple(unpacker) == () - unpacker.feed(b"\xa1?\xa1") - assert tuple(unpacker) == (b"?",) - assert tuple(unpacker) == () - unpacker.feed(b"!") - assert tuple(unpacker) == (b"!",) - assert tuple(unpacker) == () diff --git a/pandas/tests/io/msgpack/test_subtype.py b/pandas/tests/io/msgpack/test_subtype.py deleted file mode 100644 index c82f6f6d3bf4e..0000000000000 --- a/pandas/tests/io/msgpack/test_subtype.py +++ /dev/null @@ -1,26 +0,0 @@ -# coding: utf-8 - -from collections import namedtuple - -from pandas.io.msgpack import packb - - -class MyList(list): - pass - - -class MyDict(dict): - pass - - -class MyTuple(tuple): - pass - - -MyNamedTuple = namedtuple("MyNamedTuple", "x y") - - -def test_types(): - assert packb(MyDict()) == packb(dict()) - assert packb(MyList()) == packb(list()) - assert packb(MyNamedTuple(1, 2)) == packb((1, 2)) diff --git a/pandas/tests/io/msgpack/test_unpack.py b/pandas/tests/io/msgpack/test_unpack.py deleted file mode 100644 index 483e09efe6bb8..0000000000000 --- a/pandas/tests/io/msgpack/test_unpack.py +++ /dev/null @@ -1,64 +0,0 @@ -from io import BytesIO -import sys - -import pytest - -from pandas.io.msgpack import ExtType, OutOfData, Unpacker, packb - - -class TestUnpack: - def test_unpack_array_header_from_file(self): - f = BytesIO(packb([1, 2, 3, 4])) - unpacker = Unpacker(f) - assert unpacker.read_array_header() == 4 - assert unpacker.unpack() == 1 - assert unpacker.unpack() == 2 - assert unpacker.unpack() == 3 - assert unpacker.unpack() == 4 - msg = "No more data to unpack" - with pytest.raises(OutOfData, match=msg): - unpacker.unpack() - - def test_unpacker_hook_refcnt(self): - if not hasattr(sys, "getrefcount"): - pytest.skip("no sys.getrefcount()") - result = [] - - def hook(x): - result.append(x) - return x - - basecnt = sys.getrefcount(hook) - - up = Unpacker(object_hook=hook, list_hook=hook) - - assert sys.getrefcount(hook) >= basecnt + 2 - - up.feed(packb([{}])) - up.feed(packb([{}])) - assert up.unpack() == [{}] - assert up.unpack() == [{}] - assert result == [{}, [{}], {}, [{}]] - - del up - - assert sys.getrefcount(hook) == basecnt - - def test_unpacker_ext_hook(self): - class MyUnpacker(Unpacker): - def __init__(self): - super().__init__(ext_hook=self._hook, encoding="utf-8") - - def _hook(self, code, data): - if code == 1: - return int(data) - else: - return 
ExtType(code, data) - - unpacker = MyUnpacker() - unpacker.feed(packb({"a": 1}, encoding="utf-8")) - assert unpacker.unpack() == {"a": 1} - unpacker.feed(packb({"a": ExtType(1, b"123")}, encoding="utf-8")) - assert unpacker.unpack() == {"a": 123} - unpacker.feed(packb({"a": ExtType(2, b"321")}, encoding="utf-8")) - assert unpacker.unpack() == {"a": ExtType(2, b"321")} diff --git a/pandas/tests/io/msgpack/test_unpack_raw.py b/pandas/tests/io/msgpack/test_unpack_raw.py deleted file mode 100644 index f844553bfc34a..0000000000000 --- a/pandas/tests/io/msgpack/test_unpack_raw.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Tests for cases where the user seeks to obtain packed msgpack objects""" - -import io - -from pandas.io.msgpack import Unpacker, packb - - -def test_write_bytes(): - unpacker = Unpacker() - unpacker.feed(b"abc") - f = io.BytesIO() - assert unpacker.unpack(f.write) == ord("a") - assert f.getvalue() == b"a" - f = io.BytesIO() - assert unpacker.skip(f.write) is None - assert f.getvalue() == b"b" - f = io.BytesIO() - assert unpacker.skip() is None - assert f.getvalue() == b"" - - -def test_write_bytes_multi_buffer(): - long_val = (5) * 100 - expected = packb(long_val) - unpacker = Unpacker(io.BytesIO(expected), read_size=3, max_buffer_size=3) - - f = io.BytesIO() - unpacked = unpacker.unpack(f.write) - assert unpacked == long_val - assert f.getvalue() == expected diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 183ad500b15f3..15967e3be176a 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -7,9 +7,9 @@ class BaseParser: - engine = None # type: Optional[str] + engine: Optional[str] = None low_memory = True - float_precision_choices = [] # type: List[Optional[str]] + float_precision_choices: List[Optional[str]] = [] def update_kwargs(self, kwargs): kwargs = kwargs.copy() @@ -46,11 +46,17 @@ class PythonParser(BaseParser): @pytest.fixture def csv_dir_path(datapath): + """ + The directory path to the data files needed for parser tests. + """ return datapath("io", "parser", "data") @pytest.fixture def csv1(csv_dir_path): + """ + The path to the data file "test1.csv" needed for parser tests. + """ return os.path.join(csv_dir_path, "test1.csv") @@ -69,14 +75,49 @@ def csv1(csv_dir_path): @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) def all_parsers(request): + """ + Fixture all of the CSV parsers. + """ return request.param @pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids) def c_parser_only(request): + """ + Fixture all of the CSV parsers using the C engine. + """ return request.param @pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids) def python_parser_only(request): + """ + Fixture all of the CSV parsers using the Python engine. + """ + return request.param + + +_utf_values = [8, 16, 32] + +_encoding_seps = ["", "-", "_"] +_encoding_prefixes = ["utf", "UTF"] + +_encoding_fmts = [ + f"{prefix}{sep}" + "{0}" for sep in _encoding_seps for prefix in _encoding_prefixes +] + + +@pytest.fixture(params=_utf_values) +def utf_value(request): + """ + Fixture for all possible integer values for a UTF encoding. + """ + return request.param + + +@pytest.fixture(params=_encoding_fmts) +def encoding_fmt(request): + """ + Fixture for all possible string formats of a UTF encoding. 
+ """ return request.param diff --git a/pandas/tests/io/parser/data/utf32_ex_small.zip b/pandas/tests/io/parser/data/utf32_ex_small.zip new file mode 100644 index 0000000000000..9a6d5c08da9db Binary files /dev/null and b/pandas/tests/io/parser/data/utf32_ex_small.zip differ diff --git a/pandas/tests/io/parser/data/utf8_ex_small.zip b/pandas/tests/io/parser/data/utf8_ex_small.zip new file mode 100644 index 0000000000000..a4c5440bdffa7 Binary files /dev/null and b/pandas/tests/io/parser/data/utf8_ex_small.zip differ diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 77b52eb90d61f..1737f14e7adf9 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,7 +17,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame, concat -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -597,3 +597,14 @@ def test_file_binary_mode(c_parser_only): with open(path, "rb") as f: result = parser.read_csv(f, header=None) tm.assert_frame_equal(result, expected) + + +def test_unix_style_breaks(c_parser_only): + # GH 11020 + parser = c_parser_only + with tm.ensure_clean() as path: + with open(path, "w", newline="\n") as f: + f.write("blah\n\ncol_1,col_2,col_3\n\n") + result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") + expected = DataFrame(columns=["col_1", "col_2", "col_3"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index e1d422142ab0b..60e32d7c27200 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -8,7 +8,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("na_values", [None, ["NaN"]]) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 6e6c31bc5b972..4c02a37b66455 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2,12 +2,10 @@ Tests that work on both the Python and C engines but do not have a specific classification into the other test modules. 
""" - import codecs -from collections import OrderedDict import csv from datetime import datetime -from io import BytesIO, StringIO +from io import StringIO import os import platform from tempfile import TemporaryFile @@ -20,7 +18,7 @@ from pandas.errors import DtypeWarning, EmptyDataError, ParserError from pandas import DataFrame, Index, MultiIndex, Series, compat, concat -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser @@ -71,17 +69,6 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) -def test_bytes_io_input(all_parsers): - encoding = "cp1255" - parser = all_parsers - - data = BytesIO("שלום:1234\n562:123".encode(encoding)) - result = parser.read_csv(data, sep=":", encoding=encoding) - - expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) - tm.assert_frame_equal(result, expected) - - def test_empty_decimal_marker(all_parsers): data = """A|B|C 1|2,334|5 @@ -318,15 +305,6 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_unicode(all_parsers): - parser = all_parsers - data = BytesIO("\u0141aski, Jan;1".encode("utf-8")) - - result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) - expected = DataFrame([["\u0141aski, Jan", 1]]) - tm.assert_frame_equal(result, expected) - - def test_read_csv_wrong_num_columns(all_parsers): # Too few columns. data = """A,B,C,D,E,F @@ -978,15 +956,15 @@ def test_path_local_path(all_parsers): def test_nonexistent_path(all_parsers): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError + # GH#29233 "File foo" instead of "File b'foo'" parser = all_parsers path = "{}.csv".format(tm.rands(10)) - msg = "does not exist" if parser.engine == "c" else r"\[Errno 2\]" + msg = f"File {path} does not exist" if parser.engine == "c" else r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) filename = e.value.filename - filename = filename.decode() if isinstance(filename, bytes) else filename assert path == filename @@ -1066,59 +1044,6 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("sep", [",", "\t"]) -@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) -def test_utf16_bom_skiprows(all_parsers, sep, encoding): - # see gh-2298 - parser = all_parsers - data = """skip this -skip this too -A,B,C -1,2,3 -4,5,6""".replace( - ",", sep - ) - path = "__{}__.csv".format(tm.rands(10)) - kwargs = dict(sep=sep, skiprows=2) - utf8 = "utf-8" - - with tm.ensure_clean(path) as path: - from io import TextIOWrapper - - bytes_data = data.encode(encoding) - - with open(path, "wb") as f: - f.write(bytes_data) - - bytes_buffer = BytesIO(data.encode(utf8)) - bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8) - - result = parser.read_csv(path, encoding=encoding, **kwargs) - expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) - - bytes_buffer.close() - tm.assert_frame_equal(result, expected) - - -def test_utf16_example(all_parsers, csv_dir_path): - path = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - result = parser.read_csv(path, encoding="utf-16", sep="\t") - assert len(result) == 50 - - -def test_unicode_encoding(all_parsers, csv_dir_path): - path = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - - result = parser.read_csv(path, header=None, encoding="latin-1") - result = 
result.set_index(0) - got = result[1][1632] - - expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" - assert got == expected - - def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1133,7 +1058,7 @@ def test_trailing_delimiters(all_parsers): def test_escapechar(all_parsers): - # http://stackoverflow.com/questions/13824840/feature-request-for- + # https://stackoverflow.com/questions/13824840/feature-request-for- # pandas-read-csv data = '''SEARCH_TERM,ACTUAL_URL "bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" @@ -1145,9 +1070,8 @@ def test_escapechar(all_parsers): StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" ) - assert result["SEARCH_TERM"][2] == ( - 'SLAGBORD, "Bergslagen", ' "IKEA:s 1700-tals serie" - ) + assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie' + tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) @@ -1318,9 +1242,7 @@ def test_float_parser(all_parsers): def test_scientific_no_exponent(all_parsers): # see gh-12215 - df = DataFrame.from_dict( - OrderedDict([("w", ["2e"]), ("x", ["3E"]), ("y", ["42e"]), ("z", ["632E"])]) - ) + df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) data = df.to_csv(index=False) parser = all_parsers @@ -1920,39 +1842,6 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - # Basic test - ("a\n1", dict(), DataFrame({"a": [1]})), - # "Regular" quoting - ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), - # Test in a data row instead of header - ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), - # Test in empty data row with skipping - ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), - # Test in empty data row without skipping - ( - "\n1", - dict(names=["a"], skip_blank_lines=False), - DataFrame({"a": [np.nan, 1]}), - ), - ], -) -def test_utf8_bom(all_parsers, data, kwargs, expected): - # see gh-4793 - parser = all_parsers - bom = "\ufeff" - utf8 = "utf-8" - - def _encode_data_with_bom(_data): - bom_data = (bom + _data).encode(utf8) - return BytesIO(bom_data) - - result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) - tm.assert_frame_equal(result, expected) - - def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers @@ -1970,20 +1859,6 @@ def test_temporary_file(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("byte", [8, 16]) -@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"]) -def test_read_csv_utf_aliases(all_parsers, byte, fmt): - # see gh-13549 - expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) - parser = all_parsers - - encoding = fmt.format(byte) - data = "mb_num,multibyte\n4.8,test".encode(encoding) - - result = parser.read_csv(BytesIO(data), encoding=encoding) - tm.assert_frame_equal(result, expected) - - def test_internal_eof_byte(all_parsers): # see gh-5500 parser = all_parsers @@ -2043,30 +1918,6 @@ def test_file_handles_with_open(all_parsers, csv1): assert not f.closed -@pytest.mark.parametrize( - "fname,encoding", - [ - ("test1.csv", "utf-8"), - ("unicode_series.csv", "latin-1"), - ("sauron.SHIFT_JIS.csv", "shiftjis"), - ], -) -def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): - # gh-23779: Python csv engine shouldn't error on files opened in 
binary. - parser = all_parsers - - fpath = os.path.join(csv_dir_path, fname) - expected = parser.read_csv(fpath, encoding=encoding) - - with open(fpath, mode="r", encoding=encoding) as fa: - result = parser.read_csv(fa) - tm.assert_frame_equal(expected, result) - - with open(fpath, mode="rb") as fb: - result = parser.read_csv(fb, encoding=encoding) - tm.assert_frame_equal(expected, result) - - def test_invalid_file_buffer_class(all_parsers): # see gh-15337 class InvalidBuffer: @@ -2160,10 +2011,6 @@ def test_suppress_error_output(all_parsers, capsys): assert captured.err == "" -@pytest.mark.skipif( - compat.is_platform_windows() and not compat.PY36, - reason="On Python < 3.6 won't pass on Windows", -) @pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"]) def test_filename_with_special_chars(all_parsers, filename): # see gh-15086. @@ -2213,3 +2060,13 @@ def test_first_row_bom(all_parsers): result = parser.read_csv(StringIO(data), delimiter="\t") expected = DataFrame(columns=["Head1", "Head2", "Head3"]) tm.assert_frame_equal(result, expected) + + +def test_integer_precision(all_parsers): + # Gh 7072 + s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 +5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" + parser = all_parsers + result = parser.read_csv(StringIO(s), header=None)[4] + expected = Series([4321583677327450765, 4321113141090630389], name=4) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 9d0eab0b9a907..dc03370daa1e2 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -9,7 +9,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[True, False]) @@ -123,12 +123,13 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf16_encoding(all_parsers, csv_dir_path): - # see gh-18071 +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): + # see gh-18071, gh-24130 parser = all_parsers - path = os.path.join(csv_dir_path, "utf16_ex_small.zip") + encoding = encoding_fmt.format(utf_value) + path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip") - result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t") + result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t") expected = pd.DataFrame( { "Country": ["Venezuela", "Venezuela"], diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 2a3b1dc82fc59..88b400d9a11df 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index -import pandas.util.testing as tm +import pandas._testing as tm def test_converters_type_must_be_dict(all_parsers): diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index dc10352bc6460..cc65def0fd096 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -11,7 +11,7 @@ from pandas.errors import ParserWarning from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 
a68d46e8a6c15..2133f8116a95e 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("dtype", [str, object]) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py new file mode 100644 index 0000000000000..33abf4bb7d9ee --- /dev/null +++ b/pandas/tests/io/parser/test_encoding.py @@ -0,0 +1,172 @@ +""" +Tests encoding functionality during parsing +for all of the parsers defined in parsers.py +""" + +from io import BytesIO +import os +import tempfile + +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +def test_bytes_io_input(all_parsers): + encoding = "cp1255" + parser = all_parsers + + data = BytesIO("שלום:1234\n562:123".encode(encoding)) + result = parser.read_csv(data, sep=":", encoding=encoding) + + expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_unicode(all_parsers): + parser = all_parsers + data = BytesIO("\u0141aski, Jan;1".encode("utf-8")) + + result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) + expected = DataFrame([["\u0141aski, Jan", 1]]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("sep", [",", "\t"]) +@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) +def test_utf16_bom_skiprows(all_parsers, sep, encoding): + # see gh-2298 + parser = all_parsers + data = """skip this +skip this too +A,B,C +1,2,3 +4,5,6""".replace( + ",", sep + ) + path = "__{}__.csv".format(tm.rands(10)) + kwargs = dict(sep=sep, skiprows=2) + utf8 = "utf-8" + + with tm.ensure_clean(path) as path: + from io import TextIOWrapper + + bytes_data = data.encode(encoding) + + with open(path, "wb") as f: + f.write(bytes_data) + + bytes_buffer = BytesIO(data.encode(utf8)) + bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8) + + result = parser.read_csv(path, encoding=encoding, **kwargs) + expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) + + bytes_buffer.close() + tm.assert_frame_equal(result, expected) + + +def test_utf16_example(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + result = parser.read_csv(path, encoding="utf-16", sep="\t") + assert len(result) == 50 + + +def test_unicode_encoding(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + + result = parser.read_csv(path, header=None, encoding="latin-1") + result = result.set_index(0) + got = result[1][1632] + + expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" + assert got == expected + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # Basic test + ("a\n1", dict(), DataFrame({"a": [1]})), + # "Regular" quoting + ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), + # Test in a data row instead of header + ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), + # Test in empty data row with skipping + ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), + # Test in empty data row without skipping + ( + "\n1", + dict(names=["a"], skip_blank_lines=False), + DataFrame({"a": [np.nan, 1]}), + ), + ], +) +def test_utf8_bom(all_parsers, data, kwargs, expected): + # see gh-4793 + parser = all_parsers + bom 
= "\ufeff" + utf8 = "utf-8" + + def _encode_data_with_bom(_data): + bom_data = (bom + _data).encode(utf8) + return BytesIO(bom_data) + + result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): + # see gh-13549 + expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) + parser = all_parsers + + encoding = encoding_fmt.format(utf_value) + data = "mb_num,multibyte\n4.8,test".encode(encoding) + + result = parser.read_csv(BytesIO(data), encoding=encoding) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "fname,encoding", + [ + ("test1.csv", "utf-8"), + ("unicode_series.csv", "latin-1"), + ("sauron.SHIFT_JIS.csv", "shiftjis"), + ], +) +def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): + # gh-23779: Python csv engine shouldn't error on files opened in binary. + parser = all_parsers + + fpath = os.path.join(csv_dir_path, fname) + expected = parser.read_csv(fpath, encoding=encoding) + + with open(fpath, mode="r", encoding=encoding) as fa: + result = parser.read_csv(fa) + tm.assert_frame_equal(expected, result) + + with open(fpath, mode="rb") as fb: + result = parser.read_csv(fb, encoding=encoding) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("pass_encoding", [True, False]) +def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): + # see gh-24130 + parser = all_parsers + encoding = encoding_fmt.format(utf_value) + + expected = DataFrame({"foo": ["bar"]}) + + with tempfile.TemporaryFile(mode="w+", encoding=encoding) as f: + f.write("foo\nbar") + f.seek(0) + + result = parser.read_csv(f, encoding=encoding if pass_encoding else None) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 0ecd8be7ddc78..7dc106ef0c186 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -12,7 +12,7 @@ from pandas.errors import ParserError from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_read_with_bad_header(all_parsers): @@ -540,3 +540,34 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) expected = DataFrame([[2, 3], [4, 5]], columns=columns) tm.assert_frame_equal(result, expected) + + +def test_read_csv_multiindex_columns(all_parsers): + # GH#6051 + parser = all_parsers + + s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81" + s2 = ( + "Male, Male, Male, Female, Female\n" + "R, R, L, R, R\n" + ".86, .67, .88, .78, .81\n" + ".86, .67, .88, .78, .82" + ) + + mi = MultiIndex.from_tuples( + [ + ("Male", "R"), + (" Male", " R"), + (" Male", " L"), + (" Female", " R"), + (" Female", " R.1"), + ] + ) + expected = DataFrame( + [[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi + ) + + df1 = parser.read_csv(StringIO(s1), header=[0, 1]) + tm.assert_frame_equal(df1, expected.iloc[:1]) + df2 = parser.read_csv(StringIO(s2), header=[0, 1]) + tm.assert_frame_equal(df2, expected) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 4dfb8d3bd2dc8..f67a658cadfa2 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -5,10 +5,11 @@ """ from io import StringIO 
+import numpy as np import pytest from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("with_header", [True, False]) @@ -21,9 +22,7 @@ def test_index_col_named(all_parsers, with_header): KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa - header = ( - "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" - ) # noqa + header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" if with_header: data = header + no_header @@ -174,3 +173,14 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): ), ) tm.assert_frame_equal(result, expected) + + +def test_no_multi_index_level_names_empty(all_parsers): + # GH 10984 + parser = all_parsers + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + expected = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"]) + with tm.ensure_clean() as path: + expected.to_csv(path) + result = parser.read_csv(path, index_col=[0, 1, 2]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index d144421090274..5c4e642115798 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -8,7 +8,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index c94adf9da0bf3..64ccaf60ec230 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm def _construct_dataframe(num_rows): diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index f154d09358dc1..f9a083d7f5d22 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -7,10 +7,10 @@ import numpy as np import pytest -from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +from pandas._libs.parsers import STR_NA_VALUES -import pandas.io.common as com +from pandas import DataFrame, Index, MultiIndex +import pandas._testing as tm def test_string_nas(all_parsers): @@ -89,6 +89,7 @@ def test_default_na_values(all_parsers): "N/A", "n/a", "NA", + "", "#NA", "NULL", "null", @@ -99,7 +100,7 @@ def test_default_na_values(all_parsers): "#N/A N/A", "", } - assert _NA_VALUES == com._NA_VALUES + assert _NA_VALUES == STR_NA_VALUES parser = all_parsers nv = len(_NA_VALUES) @@ -536,3 +537,31 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): dtype={"a": "bool"}, na_values=na_values, ) + + +def test_str_nan_dropped(all_parsers): + # see gh-21131 + parser = all_parsers + + data = """File: small.csv,, +10010010233,0123,654 +foo,,bar +01001000155,4530,898""" + + result = parser.read_csv( + StringIO(data), + header=None, + names=["col1", "col2", "col3"], + dtype={"col1": str, "col2": str, "col3": str}, + ).dropna() + + expected = DataFrame( + { + "col1": ["10010010233", "01001000155"], + "col2": ["0123", "4530"], + 
"col3": ["654", "898"], + }, + index=[1, 3], + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index dbe721b10a3ce..b8d66874bc660 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -11,7 +11,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import read_csv @@ -166,7 +166,7 @@ def test_s3_fails(self): # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. with pytest.raises(IOError): - read_csv("s3://cant_get_it/") + read_csv("s3://cant_get_it/file.csv") def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -184,6 +184,8 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): def test_read_csv_chunked_download(self, s3_resource, caplog): # 8 MB, S3FS usees 5MB chunks + import s3fs + df = DataFrame(np.random.randn(100000, 4), columns=list("abcd")) buf = BytesIO() str_buf = StringIO() @@ -194,7 +196,13 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf) - with caplog.at_level(logging.DEBUG, logger="s3fs.core"): + # Possibly some state leaking in between tests. + # If we don't clear this cache, we saw `GetObject operation: Forbidden`. + # Presumably the s3fs instance is being cached, with the directory listing + # from *before* we add the large-file.csv in the pandas-test bucket. + s3fs.S3FileSystem.clear_instance_cache() + + with caplog.at_level(logging.DEBUG, logger="s3fs"): read_csv("s3://pandas-test/large-file.csv", nrows=5) # log of fetch_range (start, stop) assert (0, 5505024) in {x.args[-2:] for x in caplog.records} diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 36391e19a102e..b01b22e811ee3 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -20,8 +20,8 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm import pandas.io.date_converters as conv diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 5b381e43e3e19..7367b19b40dc3 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -13,7 +13,7 @@ from pandas.errors import ParserError from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_default_separator(python_parser_only): diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 94858226d0b44..14773dfbea20e 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -11,7 +11,7 @@ from pandas.errors import ParserError from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 9ddaccc4d38b7..27aef2376e87d 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -12,7 +12,7 @@ 
import pandas as pd from pandas import DataFrame, DatetimeIndex -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import EmptyDataError, read_csv, read_fwf diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index d4f219d13ac53..fdccef1127c7e 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -12,7 +12,7 @@ from pandas.errors import EmptyDataError from pandas import DataFrame, Index -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 75a5b7cd53ddb..8d5af85c20d33 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -12,7 +12,7 @@ from pandas._libs.parsers import TextReader from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import TextFileReader, read_csv @@ -179,7 +179,7 @@ def test_header_not_enough_lines(self): assert_array_dicts_equal(recs, expected) def test_escapechar(self): - data = '\\"hello world"\n' '\\"hello world"\n' '\\"hello world"' + data = '\\"hello world"\n\\"hello world"\n\\"hello world"' reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\") result = reader.read() diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index b23ddf5bd9292..267fae760398a 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -12,7 +12,7 @@ from pandas.errors import ParserError -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.parsers as parsers from pandas.io.parsers import read_csv @@ -96,9 +96,9 @@ def test_python_engine(self, python_engine): for default in py_unsupported: msg = ( - "The {default!r} option is not supported with the {python_engine!r}" - " engine" - ).format(default=default, python_engine=python_engine) + f"The {repr(default)} option is not " + f"supported with the {repr(python_engine)} engine" + ) kwargs = {default: object()} with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index afe19608ea5c6..979eb4702cc84 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -10,7 +10,7 @@ from pandas._libs.tslib import Timestamp from pandas import DataFrame, Index -import pandas.util.testing as tm +import pandas._testing as tm _msg_validate_usecols_arg = ( "'usecols' must either be list-like " @@ -199,7 +199,7 @@ def test_usecols_with_whitespace(all_parsers): # Column selection by index. ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), # Column selection by name. 
- (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), + (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),), ], ) def test_usecols_with_integer_like_header(all_parsers, usecols, expected): diff --git a/pandas/tests/io/pytables/conftest.py b/pandas/tests/io/pytables/conftest.py index 6164f5d0722cc..214f95c6fb441 100644 --- a/pandas/tests/io/pytables/conftest.py +++ b/pandas/tests/io/pytables/conftest.py @@ -1,6 +1,6 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index 1e320e12a4a53..c7200385aa998 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -1,29 +1,30 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.io.pytables.common import ensure_clean_path -import pandas.util.testing as tm tables = pytest.importorskip("tables") @pytest.fixture def pytables_hdf5_file(): - """Use PyTables to create a simple HDF5 file.""" - + """ + Use PyTables to create a simple HDF5 file. + """ table_schema = { "c0": tables.Time64Col(pos=0), "c1": tables.StringCol(5, pos=1), "c2": tables.Int64Col(pos=2), } - t0 = 1561105000.0 + t0 = 1_561_105_000.0 testsamples = [ {"c0": t0, "c1": "aaaaa", "c2": 1}, {"c0": t0 + 1, "c1": "bbbbb", "c2": 2}, {"c0": t0 + 2, "c1": "ccccc", "c2": 10 ** 5}, - {"c0": t0 + 3, "c1": "ddddd", "c2": 4294967295}, + {"c0": t0 + 3, "c1": "ddddd", "c2": 4_294_967_295}, ] objname = "pandas_test_timeseries" diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 91ee1061a5ef1..543940e674dba 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.io.pytables.common import ensure_clean_path, ensure_clean_store -import pandas.util.testing as tm from pandas.io.pytables import read_hdf diff --git a/pandas/tests/io/pytables/test_pytables_missing.py b/pandas/tests/io/pytables/test_pytables_missing.py index 4ceb80889c989..9adb0a6d227da 100644 --- a/pandas/tests/io/pytables/test_pytables_missing.py +++ b/pandas/tests/io/pytables/test_pytables_missing.py @@ -3,7 +3,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @td.skip_if_installed("tables") diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index f9d525399bde3..64c4ad800f49d 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -3,13 +3,14 @@ from distutils.version import LooseVersion from io import BytesIO import os +from pathlib import Path import re from warnings import catch_warnings, simplefilter import numpy as np import pytest -from pandas.compat import PY36, is_platform_little_endian, is_platform_windows +from pandas.compat import is_platform_little_endian, is_platform_windows import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_categorical_dtype @@ -32,6 +33,7 @@ isna, timedelta_range, ) +import pandas._testing as tm from pandas.tests.io.pytables.common import ( _maybe_remove, create_tempfile, @@ -41,7 +43,6 @@ safe_remove, tables, ) -import pandas.util.testing as tm from pandas.io.pytables import ( ClosedFileError, @@ -65,8 +66,11 @@ class TestHDFStore: def 
test_format_kwarg_in_constructor(self, setup_path): # GH 13291 + + msg = "format is not a defined argument for HDFStore" + with ensure_clean_path(setup_path) as path: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): HDFStore(path, format="table") def test_context(self, setup_path): @@ -202,21 +206,27 @@ def test_api(self, setup_path): # Invalid. df = tm.makeDataFrame() - with pytest.raises(ValueError): + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", append=True, format="f") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", append=True, format="fixed") - with pytest.raises(TypeError): + msg = r"invalid HDFStore format specified \[foo\]" + + with pytest.raises(TypeError, match=msg): df.to_hdf(path, "df", append=True, format="foo") - with pytest.raises(TypeError): - df.to_hdf(path, "df", append=False, format="bar") + with pytest.raises(TypeError, match=msg): + df.to_hdf(path, "df", append=False, format="foo") # File path doesn't exist path = "" - with pytest.raises(FileNotFoundError): + msg = f"File {path} does not exist" + + with pytest.raises(FileNotFoundError, match=msg): read_hdf(path, "df") def test_api_default_format(self, setup_path): @@ -229,7 +239,10 @@ def test_api_default_format(self, setup_path): _maybe_remove(store, "df") store.put("df", df) assert not store.get_storer("df").is_table - with pytest.raises(ValueError): + + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): store.append("df2", df) pd.set_option("io.hdf.default_format", "table") @@ -250,7 +263,7 @@ def test_api_default_format(self, setup_path): df.to_hdf(path, "df") with HDFStore(path) as store: assert not store.get_storer("df").is_table - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df2", append=True) pd.set_option("io.hdf.default_format", "table") @@ -383,7 +396,10 @@ def test_versioning(self, setup_path): # this is an error because its table_type is appendable, but no # version info store.get_node("df2")._v_attrs.pandas_version = None - with pytest.raises(Exception): + + msg = "'NoneType' object has no attribute 'startswith'" + + with pytest.raises(Exception, match=msg): store.select("df2") def test_mode(self, setup_path): @@ -427,7 +443,11 @@ def check(mode): # conv read if mode in ["w"]: - with pytest.raises(ValueError): + msg = ( + "mode w is not allowed while performing a read. " + r"Allowed modes are r, r\+ and a." 
+ ) + with pytest.raises(ValueError, match=msg): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) @@ -1272,7 +1292,7 @@ def test_append_with_different_block_ordering(self, setup_path): with pytest.raises(ValueError): store.append("df", df) - # store multile additional fields in different blocks + # store multiple additional fields in different blocks df["float_3"] = Series([1.0] * len(df), dtype="float64") with pytest.raises(ValueError): store.append("df", df) @@ -2376,8 +2396,8 @@ def test_frame(self, compression, setup_path): @td.xfail_non_writeable def test_empty_series_frame(self, setup_path): - s0 = Series() - s1 = Series(name="myseries") + s0 = Series(dtype=object) + s1 = Series(name="myseries", dtype=object) df0 = DataFrame() df1 = DataFrame(index=["a", "b", "c"]) df2 = DataFrame(columns=["d", "e", "f"]) @@ -2806,16 +2826,16 @@ def test_select_iterator(self, setup_path): expected = store.select("df") - results = [s for s in store.select("df", iterator=True)] + results = list(store.select("df", iterator=True)) result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select("df", chunksize=100)] + results = list(store.select("df", chunksize=100)) assert len(results) == 5 result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select("df", chunksize=150)] + results = list(store.select("df", chunksize=150)) result = concat(results) tm.assert_frame_equal(result, expected) @@ -2835,7 +2855,7 @@ def test_select_iterator(self, setup_path): df = tm.makeTimeDataFrame(500) df.to_hdf(path, "df", format="table") - results = [s for s in read_hdf(path, "df", chunksize=100)] + results = list(read_hdf(path, "df", chunksize=100)) result = concat(results) assert len(results) == 5 @@ -2856,12 +2876,9 @@ def test_select_iterator(self, setup_path): # full selection expected = store.select_as_multiple(["df1", "df2"], selector="df1") - results = [ - s - for s in store.select_as_multiple( - ["df1", "df2"], selector="df1", chunksize=150 - ) - ] + results = list( + store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150) + ) result = concat(results) tm.assert_frame_equal(expected, result) @@ -2916,19 +2933,19 @@ def test_select_iterator_complete_8014(self, setup_path): end_dt = expected.index[-1] # select w/iterator and no where clause works - results = [s for s in store.select("df", chunksize=chunksize)] + results = list(store.select("df", chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, begin of range where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, end of range where = "index <= '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) @@ -2936,7 +2953,7 @@ def test_select_iterator_complete_8014(self, setup_path): where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) 
result = concat(results) tm.assert_frame_equal(expected, result) @@ -2958,14 +2975,14 @@ def test_select_iterator_non_complete_8014(self, setup_path): # select w/iterator and where clause, single term, begin of range where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) @@ -2974,7 +2991,7 @@ def test_select_iterator_non_complete_8014(self, setup_path): where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[ (expected.index >= beg_dt) & (expected.index <= end_dt) @@ -2992,7 +3009,7 @@ def test_select_iterator_non_complete_8014(self, setup_path): # select w/iterator and where clause, single term, begin of range where = "index > '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) assert 0 == len(results) def test_select_iterator_many_empty_frames(self, setup_path): @@ -3014,14 +3031,14 @@ def test_select_iterator_many_empty_frames(self, setup_path): # select w/iterator and where clause, single term, begin of range where = "index >= '{beg_dt}'".format(beg_dt=beg_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '{end_dt}'".format(end_dt=end_dt) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) assert len(results) == 1 result = concat(results) @@ -3032,7 +3049,7 @@ def test_select_iterator_many_empty_frames(self, setup_path): where = "index >= '{beg_dt}' & index <= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) # should be 1, is 10 assert len(results) == 1 @@ -3052,7 +3069,7 @@ def test_select_iterator_many_empty_frames(self, setup_path): where = "index <= '{beg_dt}' & index >= '{end_dt}'".format( beg_dt=beg_dt, end_dt=end_dt ) - results = [s for s in store.select("df", where=where, chunksize=chunksize)] + results = list(store.select("df", where=where, chunksize=chunksize)) # should be [] assert len(results) == 0 @@ -3216,7 +3233,7 @@ def test_frame_select_complex(self, setup_path): tm.assert_frame_equal(result, expected) result = store.select( - "df", "(index>df.index[3] & " 'index<=df.index[6]) | string="bar"' + "df", '(index>df.index[3] & index<=df.index[6]) | 
string="bar"' ) expected = df.loc[ ((df.index > df.index[3]) & (df.index <= df.index[6])) @@ -4597,12 +4614,9 @@ def test_read_nokey_empty(self, setup_path): with pytest.raises(ValueError): read_hdf(path) - @td.skip_if_no("pathlib") def test_read_from_pathlib_path(self, setup_path): # GH11773 - from pathlib import Path - expected = DataFrame( np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") ) @@ -4711,7 +4725,6 @@ def test_read_hdf_series_mode_r(self, format, setup_path): result = pd.read_hdf(path, key="data", mode="r") tm.assert_series_equal(result, series) - @pytest.mark.skipif(not PY36, reason="Need python 3.6") def test_fspath(self): with tm.ensure_clean("foo.h5") as path: with pd.HDFStore(path) as store: diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 1acb0ac6e06d2..2bf22d982e5fe 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -7,12 +7,12 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range +import pandas._testing as tm from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_path, ensure_clean_store, ) -import pandas.util.testing as tm def _compare_with_tz(a, b): diff --git a/pandas/tests/io/sas/test_sas.py b/pandas/tests/io/sas/test_sas.py index fcd2e0e35ad9e..5d2643c20ceb2 100644 --- a/pandas/tests/io/sas/test_sas.py +++ b/pandas/tests/io/sas/test_sas.py @@ -3,7 +3,7 @@ import pytest from pandas import read_sas -import pandas.util.testing as tm +import pandas._testing as tm class TestSas: diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index e37561c865c7a..62e9ac6929c8e 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -1,5 +1,7 @@ +from datetime import datetime import io import os +from pathlib import Path import numpy as np import pytest @@ -8,7 +10,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm # https://github.com/cython/cython/issues/1720 @@ -20,9 +22,9 @@ def setup_method(self, datapath): self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: - fname = os.path.join(self.dirpath, "test_sas7bdat_{j}.csv".format(j=j)) + fname = os.path.join(self.dirpath, f"test_sas7bdat_{j}.csv") df = pd.read_csv(fname) - epoch = pd.datetime(1960, 1, 1) + epoch = datetime(1960, 1, 1) t1 = pd.to_timedelta(df["Column4"], unit="d") df["Column4"] = epoch + t1 t2 = pd.to_timedelta(df["Column12"], unit="d") @@ -37,7 +39,7 @@ def test_from_file(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) @@ -45,7 +47,7 @@ def test_from_buffer(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") with open(fname, "rb") as f: byts = f.read() buf = io.BytesIO(byts) @@ -60,7 +62,7 @@ def test_from_iterator(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") df = rdr.read(2) 
tm.assert_frame_equal(df, df0.iloc[0:2, :]) @@ -68,14 +70,11 @@ def test_from_iterator(self): tm.assert_frame_equal(df, df0.iloc[2:5, :]) rdr.close() - @td.skip_if_no("pathlib") def test_path_pathlib(self): - from pathlib import Path - for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = Path(os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))) + fname = Path(os.path.join(self.dirpath, f"test{k}.sas7bdat")) df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) @@ -86,9 +85,7 @@ def test_path_localpath(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = LocalPath( - os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) - ) + fname = LocalPath(os.path.join(self.dirpath, f"test{k}.sas7bdat")) df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) @@ -97,7 +94,7 @@ def test_iterator_loop(self): for j in 0, 1: for k in self.test_ix[j]: for chunksize in 3, 5, 10, 11: - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") rdr = pd.read_sas(fname, chunksize=10, encoding="utf-8") y = 0 for x in rdr: @@ -108,7 +105,7 @@ def test_iterator_loop(self): def test_iterator_read_too_much(self): # github #14734 k = self.test_ix[0][0] - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding="utf-8") d1 = rdr.read(rdr.row_count + 20) rdr.close() diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 7893877be2033..ee97f08ef9400 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.sas.sasreader import read_sas @@ -104,7 +104,7 @@ def test1_incremental(self): reader = read_sas(self.file01, index="SEQN", chunksize=1000) - all_data = [x for x in reader] + all_data = list(reader) data = pd.concat(all_data, axis=0) tm.assert_frame_equal(data, data_csv, check_index_type=False) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 33e6d3b05100e..a69e5556f3e85 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -6,10 +6,9 @@ import pandas as pd from pandas import DataFrame, get_option, read_clipboard -import pandas.util.testing as tm +import pandas._testing as tm -from pandas.io.clipboard import clipboard_get, clipboard_set -from pandas.io.clipboard.exceptions import PyperclipException +from pandas.io.clipboard import PyperclipException, clipboard_get, clipboard_set try: DataFrame({"A": [1, 2]}).to_clipboard() @@ -259,6 +258,7 @@ def test_round_trip_valid_encodings(self, enc, df): @pytest.mark.clipboard @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."]) +@pytest.mark.xfail(reason="flaky in CI", strict=False) def test_raw_roundtrip(data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows clipboard_set(data) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 655fd9d01c1c0..a126f83164ce5 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -4,6 +4,7 @@ from io import StringIO import mmap import os +from pathlib import Path import pytest @@ -11,7 +12,7 @@ import 
pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.common as icom @@ -27,14 +28,7 @@ def __fspath__(self): # Functions that consume a string path and return a string or path-like object -path_types = [str, CustomFSPath] - -try: - from pathlib import Path - - path_types.append(Path) -except ImportError: - pass +path_types = [str, CustomFSPath, Path] try: from py.path import local as LocalPath @@ -48,7 +42,6 @@ def __fspath__(self): # https://github.com/cython/cython/issues/1720 @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestCommonIOCapabilities: data1 = """index,A,B,C,D foo,2,3,4,5 @@ -74,11 +67,10 @@ def test_expand_user_normal_path(self): assert expanded_name == filename assert os.path.expanduser(filename) == expanded_name - @td.skip_if_no("pathlib") def test_stringify_path_pathlib(self): - rel_path = icom._stringify_path(Path(".")) + rel_path = icom.stringify_path(Path(".")) assert rel_path == "." - redundant_path = icom._stringify_path(Path("foo//bar")) + redundant_path = icom.stringify_path(Path("foo//bar")) assert redundant_path == os.path.join("foo", "bar") @td.skip_if_no("py.path") @@ -86,11 +78,11 @@ def test_stringify_path_localpath(self): path = os.path.join("foo", "bar") abs_path = os.path.abspath(path) lpath = LocalPath(path) - assert icom._stringify_path(lpath) == abs_path + assert icom.stringify_path(lpath) == abs_path def test_stringify_path_fspath(self): p = CustomFSPath("foo/bar.csv") - result = icom._stringify_path(p) + result = icom.stringify_path(p) assert result == "foo/bar.csv" @pytest.mark.parametrize( @@ -100,7 +92,7 @@ def test_stringify_path_fspath(self): @pytest.mark.parametrize("path_type", path_types) def test_infer_compression_from_path(self, extension, expected, path_type): path = path_type("foo/bar.csv" + extension) - compression = icom._infer_compression(path, compression="infer") + compression = icom.infer_compression(path, compression="infer") assert compression == expected def test_get_filepath_or_buffer_with_path(self): @@ -142,7 +134,6 @@ def test_iterator(self): (pd.read_stata, "os", FileNotFoundError, "dta"), (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), (pd.read_json, "os", ValueError, "json"), - (pd.read_msgpack, "os", FileNotFoundError, "mp"), (pd.read_pickle, "os", FileNotFoundError, "pickle"), ], ) @@ -151,17 +142,19 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) msg1 = r"File (b')?.+does_not_exist\.{}'? 
does not exist".format(fn_ext) - msg2 = ( - r"\[Errno 2\] No such file or directory: '.+does_not_exist" r"\.{}'" - ).format(fn_ext) + msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'" - ).format(fn_ext, fn_ext) + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" + fr" '.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'" + ) with pytest.raises( - error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" ): reader(path) @@ -177,7 +170,6 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): (pd.read_stata, "os", FileNotFoundError, "dta"), (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), (pd.read_json, "os", ValueError, "json"), - (pd.read_msgpack, "os", FileNotFoundError, "mp"), (pd.read_pickle, "os", FileNotFoundError, "pickle"), ], ) @@ -189,40 +181,53 @@ def test_read_expands_user_home_dir( path = os.path.join("~", "does_not_exist." + fn_ext) monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) - msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) - msg2 = ( - r"\[Errno 2\] No such file or directory:" r" '.+does_not_exist\.{}'" - ).format(fn_ext) + msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? does not exist" + msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Unexpected character found when decoding 'false'" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'" - ).format(fn_ext, fn_ext) + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" + fr" '.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'" + ) with pytest.raises( - error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" ): reader(path) @pytest.mark.parametrize( "reader, module, path", [ - (pd.read_csv, "os", ("io", "data", "iris.csv")), - (pd.read_table, "os", ("io", "data", "iris.csv")), - (pd.read_fwf, "os", ("io", "data", "fixed_width_format.txt")), - (pd.read_excel, "xlrd", ("io", "data", "test1.xlsx")), - (pd.read_feather, "feather", ("io", "data", "feather-0_3_1.feather")), + (pd.read_csv, "os", ("data", "iris.csv")), + (pd.read_table, "os", ("data", "iris.csv")), + ( + pd.read_fwf, + "os", + ("io", "data", "fixed_width", "fixed_width_format.txt"), + ), + (pd.read_excel, "xlrd", ("io", "data", "excel", "test1.xlsx")), + ( + pd.read_feather, + "feather", + ("io", "data", "feather", "feather-0_3_1.feather"), + ), ( pd.read_hdf, "tables", ("io", "data", "legacy_hdf", "datetimetz_object.h5"), ), - (pd.read_stata, "os", ("io", "data", "stata10_115.dta")), + (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")), (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")), - (pd.read_msgpack, "os", ("io", "msgpack", "data", "frame.mp")), - 
(pd.read_pickle, "os", ("io", "data", "categorical.0.25.0.pickle")), + ( + pd.read_pickle, + "os", + ("io", "data", "pickle", "categorical.0.25.0.pickle"), + ), ], ) def test_read_fspath_all(self, reader, module, path, datapath): @@ -248,7 +253,6 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_html", {}, "os"), ("to_json", {}, "os"), ("to_latex", {}, "os"), - ("to_msgpack", {}, "os"), ("to_pickle", {}, "os"), ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), ], @@ -296,7 +300,7 @@ def test_write_fspath_hdf5(self): @pytest.fixture def mmap_file(datapath): - return datapath("io", "data", "test_mmap.csv") + return datapath("io", "data", "csv", "test_mmap.csv") class TestMMapWrapper: @@ -313,18 +317,18 @@ def test_constructor_bad_file(self, mmap_file): err = mmap.error with pytest.raises(err, match=msg): - icom.MMapWrapper(non_file) + icom._MMapWrapper(non_file) target = open(mmap_file, "r") target.close() msg = "I/O operation on closed file" with pytest.raises(ValueError, match=msg): - icom.MMapWrapper(target) + icom._MMapWrapper(target) def test_get_attr(self, mmap_file): with open(mmap_file, "r") as target: - wrapper = icom.MMapWrapper(target) + wrapper = icom._MMapWrapper(target) attrs = dir(wrapper.mmap) attrs = [attr for attr in attrs if not attr.startswith("__")] @@ -337,7 +341,7 @@ def test_get_attr(self, mmap_file): def test_next(self, mmap_file): with open(mmap_file, "r") as target: - wrapper = icom.MMapWrapper(target) + wrapper = icom._MMapWrapper(target) lines = target.readlines() for line in lines: diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index d68b6a1effaa0..fb81e57912dac 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,29 +1,16 @@ -import contextlib import os import subprocess import sys import textwrap -import warnings import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.common as icom -@contextlib.contextmanager -def catch_to_csv_depr(): - # Catching warnings because Series.to_csv has - # been deprecated. Remove this context when - # Series.to_csv has been aligned. 
- - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - yield - - @pytest.mark.parametrize( "obj", [ @@ -37,12 +24,11 @@ def catch_to_csv_depr(): @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_compression_size(obj, method, compression_only): with tm.ensure_clean() as path: - with catch_to_csv_depr(): - getattr(obj, method)(path, compression=compression_only) - compressed_size = os.path.getsize(path) - getattr(obj, method)(path, compression=None) - uncompressed_size = os.path.getsize(path) - assert uncompressed_size > compressed_size + getattr(obj, method)(path, compression=compression_only) + compressed_size = os.path.getsize(path) + getattr(obj, method)(path, compression=None) + uncompressed_size = os.path.getsize(path) + assert uncompressed_size > compressed_size @pytest.mark.parametrize( @@ -58,19 +44,17 @@ def test_compression_size(obj, method, compression_only): @pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - f, handles = icom._get_handle(path, "w", compression=compression_only) - with catch_to_csv_depr(): - with f: - getattr(obj, method)(f) - assert not f.closed - assert f.closed - compressed_size = os.path.getsize(path) + f, handles = icom.get_handle(path, "w", compression=compression_only) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed + compressed_size = os.path.getsize(path) with tm.ensure_clean() as path: - f, handles = icom._get_handle(path, "w", compression=None) - with catch_to_csv_depr(): - with f: - getattr(obj, method)(f) - assert not f.closed + f, handles = icom.get_handle(path, "w", compression=None) + with f: + getattr(obj, method)(f) + assert not f.closed assert f.closed uncompressed_size = os.path.getsize(path) assert uncompressed_size > compressed_size @@ -124,7 +108,7 @@ def test_compression_warning(compression_only): columns=["X", "Y", "Z"], ) with tm.ensure_clean() as path: - f, handles = icom._get_handle(path, "w", compression=compression_only) + f, handles = icom.get_handle(path, "w", compression=compression_only) with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): with f: df.to_csv(f, compression=compression_only) @@ -140,7 +124,7 @@ def test_with_missing_lzma(): import pandas """ ) - subprocess.check_output([sys.executable, "-c", code]) + subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) def test_with_missing_lzma_runtime(): @@ -157,4 +141,4 @@ def test_with_missing_lzma_runtime(): df.to_csv('foo.csv', compression='xz') """ ) - subprocess.check_output([sys.executable, "-c", code]) + subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) diff --git a/pandas/tests/io/test_date_converters.py b/pandas/tests/io/test_date_converters.py index 2fa5e3b30d6af..cdb8eca02a3e5 100644 --- a/pandas/tests/io/test_date_converters.py +++ b/pandas/tests/io/test_date_converters.py @@ -2,7 +2,7 @@ import numpy as np -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.date_converters as conv diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 0f68a6534dad1..0038df78dd866 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -5,7 +5,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.feather_format import read_feather, to_feather # noqa: E402 
isort:skip @@ -107,23 +107,6 @@ def test_unsupported_other(self): # Some versions raise ValueError, others raise ArrowInvalid. self.check_error_on_write(df, Exception) - def test_rw_nthreads(self): - df = pd.DataFrame({"A": np.arange(100000)}) - expected_warning = ( - "the 'nthreads' keyword is deprecated, use 'use_threads' instead" - ) - # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: - self.check_round_trip(df, nthreads=2) - # we have an extra FutureWarning because of #GH23752 - assert any(expected_warning in str(x) for x in w) - - # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: - self.check_round_trip(df, nthreads=1) - # we have an extra FutureWarnings because of #GH23752 - assert any(expected_warning in str(x) for x in w) - def test_rw_use_threads(self): df = pd.DataFrame({"A": np.arange(100000)}) self.check_round_trip(df, use_threads=True) @@ -153,7 +136,7 @@ def test_write_with_index(self): # column multi-index df.index = [0, 1, 2] - df.columns = (pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]),) + df.columns = pd.MultiIndex.from_tuples([("a", 1)]) self.check_error_on_write(df, ValueError) def test_path_pathlib(self): diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 52147f4e1afc7..7a5eba5264421 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -1,6 +1,9 @@ +from contextlib import ExitStack as does_not_raise from datetime import datetime import os import platform +import random +import string import numpy as np import pytest @@ -18,11 +21,6 @@ PRIVATE_KEY_JSON_PATH = None PRIVATE_KEY_JSON_CONTENTS = None -DATASET_ID = "pydata_pandas_bq_testing_py3" - -TABLE_ID = "new_test" -DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID) - VERSION = platform.python_version() @@ -70,6 +68,10 @@ def _get_client(): return bigquery.Client(project=project_id, credentials=credentials) +def generate_rand_str(length: int = 10) -> str: + return "".join(random.choices(string.ascii_lowercase, k=length)) + + def make_mixed_dataframe_v2(test_size): # create df to test for all BQ datatypes except RECORD bools = np.random.randint(2, size=(1, test_size)).astype(bool) @@ -89,21 +91,6 @@ def make_mixed_dataframe_v2(test_size): ) -def test_read_gbq_with_deprecated_kwargs(monkeypatch): - captured_kwargs = {} - - def mock_read_gbq(sql, **kwargs): - captured_kwargs.update(kwargs) - return DataFrame([[1.0]]) - - monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) - private_key = object() - pd.read_gbq("SELECT 1", verbose=True, private_key=private_key) - - assert captured_kwargs["verbose"] - assert captured_kwargs["private_key"] is private_key - - def test_read_gbq_without_deprecated_kwargs(monkeypatch): captured_kwargs = {} @@ -144,36 +131,49 @@ def mock_read_gbq(sql, **kwargs): assert "use_bqstorage_api" not in captured_kwargs +@pytest.mark.parametrize("progress_bar", [None, "foo"]) +def test_read_gbq_progress_bar_type_kwarg(monkeypatch, progress_bar): + # GH 29857 + captured_kwargs = {} + + def mock_read_gbq(sql, **kwargs): + captured_kwargs.update(kwargs) + return DataFrame([[1.0]]) + + monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) + pd.read_gbq("SELECT 1", progress_bar_type=progress_bar) + + if progress_bar: + assert "progress_bar_type" in captured_kwargs + else: + assert "progress_bar_type" not in captured_kwargs + + @pytest.mark.single class 
TestToGBQIntegrationWithServiceAccountKeyPath: - @classmethod - def setup_class(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *BEFORE* - # executing *ALL* tests described below. - + @pytest.fixture() + def gbq_dataset(self): + # Setup Dataset _skip_if_no_project_id() _skip_if_no_private_key_path() - cls.client = _get_client() - cls.dataset = cls.client.dataset(DATASET_ID + "1") - try: - # Clean-up previous test runs. - cls.client.delete_dataset(cls.dataset, delete_contents=True) - except api_exceptions.NotFound: - pass # It's OK if the dataset doesn't already exist. + dataset_id = "pydata_pandas_bq_testing_" + generate_rand_str() + + self.client = _get_client() + self.dataset = self.client.dataset(dataset_id) - cls.client.create_dataset(bigquery.Dataset(cls.dataset)) + # Create the dataset + self.client.create_dataset(bigquery.Dataset(self.dataset)) - @classmethod - def teardown_class(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *AFTER* - # executing all tests. - cls.client.delete_dataset(cls.dataset, delete_contents=True) + table_name = generate_rand_str() + destination_table = f"{dataset_id}.{table_name}" + yield destination_table - def test_roundtrip(self): - destination_table = DESTINATION_TABLE + "1" + # Teardown Dataset + self.client.delete_dataset(self.dataset, delete_contents=True) + + def test_roundtrip(self, gbq_dataset): + destination_table = gbq_dataset test_size = 20001 df = make_mixed_dataframe_v2(test_size) @@ -186,9 +186,50 @@ def test_roundtrip(self): ) result = pd.read_gbq( - "SELECT COUNT(*) AS num_rows FROM {0}".format(destination_table), + f"SELECT COUNT(*) AS num_rows FROM {destination_table}", project_id=_get_project_id(), credentials=_get_credentials(), dialect="standard", ) assert result["num_rows"][0] == test_size + + @pytest.mark.parametrize( + "if_exists, expected_num_rows, expectation", + [ + ("append", 300, does_not_raise()), + ("fail", 200, pytest.raises(pandas_gbq.gbq.TableCreationError)), + ("replace", 100, does_not_raise()), + ], + ) + def test_gbq_if_exists( + self, if_exists, expected_num_rows, expectation, gbq_dataset + ): + # GH 29598 + destination_table = gbq_dataset + + test_size = 200 + df = make_mixed_dataframe_v2(test_size) + + df.to_gbq( + destination_table, + _get_project_id(), + chunksize=None, + credentials=_get_credentials(), + ) + + with expectation: + df.iloc[:100].to_gbq( + destination_table, + _get_project_id(), + if_exists=if_exists, + chunksize=None, + credentials=_get_credentials(), + ) + + result = pd.read_gbq( + f"SELECT COUNT(*) AS num_rows FROM {destination_table}", + project_id=_get_project_id(), + credentials=_get_credentials(), + dialect="standard", + ) + assert result["num_rows"][0] == expected_num_rows diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 85ac56c8193a6..557a9d5c13987 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -5,8 +5,8 @@ import pytest from pandas import DataFrame, date_range, read_csv +import pandas._testing as tm from pandas.util import _test_decorators as td -import pandas.util.testing as tm from pandas.io.common import is_gcs_url diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 3d855a12d5481..626df839363cb 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -15,7 +15,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, 
read_csv -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.common import file_path_to_url import pandas.io.html @@ -63,7 +63,7 @@ def test_bs4_version_fails(monkeypatch, datapath): monkeypatch.setattr(bs4, "__version__", "4.2") with pytest.raises(ImportError, match="Pandas requires version"): - read_html(datapath("io", "data", "spam.html"), flavor="bs4") + read_html(datapath("io", "data", "html", "spam.html"), flavor="bs4") def test_invalid_flavor(): @@ -78,7 +78,7 @@ def test_invalid_flavor(): @td.skip_if_no("bs4") @td.skip_if_no("lxml") def test_same_ordering(datapath): - filename = datapath("io", "data", "valid_markup.html") + filename = datapath("io", "data", "html", "valid_markup.html") dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"]) dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"]) assert_framelist_equal(dfs_lxml, dfs_bs4) @@ -87,7 +87,7 @@ def test_same_ordering(datapath): @pytest.mark.parametrize( "flavor", [ - pytest.param("bs4", marks=td.skip_if_no("lxml")), + pytest.param("bs4", marks=td.skip_if_no("bs4")), pytest.param("lxml", marks=td.skip_if_no("lxml")), ], scope="class", @@ -95,10 +95,10 @@ def test_same_ordering(datapath): class TestReadHtml: @pytest.fixture(autouse=True) def set_files(self, datapath): - self.spam_data = datapath("io", "data", "spam.html") + self.spam_data = datapath("io", "data", "html", "spam.html") self.spam_data_kwargs = {} self.spam_data_kwargs["encoding"] = "UTF-8" - self.banklist_data = datapath("io", "data", "banklist.html") + self.banklist_data = datapath("io", "data", "html", "banklist.html") @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor, request): @@ -135,7 +135,7 @@ def test_banklist_url(self): def test_spam_url(self): url = ( "https://raw.githubusercontent.com/pandas-dev/pandas/master/" - "pandas/tests/io/data/spam.html" + "pandas/tests/io/data/html/spam.html" ) df1 = self.read_html(url, ".*Water.*") df2 = self.read_html(url, "Unit") @@ -178,7 +178,7 @@ def test_skiprows_int(self): assert_framelist_equal(df1, df2) - def test_skiprows_xrange(self): + def test_skiprows_range(self): df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=range(2))[0] df2 = self.read_html(self.spam_data, "Unit", skiprows=range(2))[0] tm.assert_frame_equal(df1, df2) @@ -376,16 +376,24 @@ def test_python_docs_table(self): @pytest.mark.slow def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 - macau_data = datapath("io", "data", "macau.html") + macau_data = datapath("io", "data", "html", "macau.html") dfs = self.read_html(macau_data, index_col=0, attrs={"class": "style1"}) df = dfs[all_non_nan_table_index] assert not any(s.isna().any() for _, s in df.items()) @pytest.mark.slow - def test_thousands_macau_index_col(self, datapath): + def test_thousands_macau_index_col(self, datapath, request): + # https://github.com/pandas-dev/pandas/issues/29622 + # This tests fails for bs4 >= 4.8.0 - so handle xfail accordingly + if self.read_html.keywords.get("flavor") == "bs4" and td.safe_import( + "bs4", "4.8.0" + ): + reason = "fails for bs4 version >= 4.8.0" + request.node.add_marker(pytest.mark.xfail(reason=reason)) + all_non_nan_table_index = -2 - macau_data = datapath("io", "data", "macau.html") + macau_data = datapath("io", "data", "html", "macau.html") dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] @@ -395,8 +403,7 @@ def test_empty_tables(self): """ Make sure that read_html ignores empty tables. 
""" - result = self.read_html( - """ + html = """
@@ -416,8 +423,7 @@ def test_empty_tables(self):
""" - ) - + result = self.read_html(html) assert len(result) == 1 def test_multiple_tbody(self): @@ -566,7 +572,7 @@ def test_parse_header_of_non_string_column(self): tm.assert_frame_equal(result, expected) def test_nyse_wsj_commas_table(self, datapath): - data = datapath("io", "data", "nyse_wsj.html") + data = datapath("io", "data", "html", "nyse_wsj.html") df = self.read_html(data, index_col=0, header=0, attrs={"class": "mdcTable"})[0] expected = Index( @@ -594,7 +600,7 @@ def try_remove_ws(x): df = self.read_html(self.banklist_data, "Metcalf", attrs={"id": "table"})[0] ground_truth = read_csv( - datapath("io", "data", "banklist.csv"), + datapath("io", "data", "csv", "banklist.csv"), converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, ) assert df.shape == ground_truth.shape @@ -889,7 +895,7 @@ def test_parse_dates_combine(self): tm.assert_frame_equal(newdf, res[0]) def test_computer_sales_page(self, datapath): - data = datapath("io", "data", "computer_sales_page.html") + data = datapath("io", "data", "html", "computer_sales_page.html") msg = ( r"Passed header=\[0,1\] are too many " r"rows for this multi_index of columns" @@ -897,13 +903,13 @@ def test_computer_sales_page(self, datapath): with pytest.raises(ParserError, match=msg): self.read_html(data, header=[0, 1]) - data = datapath("io", "data", "computer_sales_page.html") + data = datapath("io", "data", "html", "computer_sales_page.html") assert self.read_html(data, header=[1, 2]) def test_wikipedia_states_table(self, datapath): - data = datapath("io", "data", "wikipedia_states.html") - assert os.path.isfile(data), "{data!r} is not a file".format(data=data) - assert os.path.getsize(data), "{data!r} is an empty file".format(data=data) + data = datapath("io", "data", "html", "wikipedia_states.html") + assert os.path.isfile(data), f"{repr(data)} is not a file" + assert os.path.getsize(data), f"{repr(data)} is an empty file" result = self.read_html(data, "Arizona", header=1)[0] assert result["sq mi"].dtype == np.dtype("float64") @@ -1095,14 +1101,14 @@ def test_multiple_header_rows(self): tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath): - filename = datapath("io", "data", "valid_markup.html") + filename = datapath("io", "data", "html", "valid_markup.html") dfs = self.read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow def test_fallback_success(self, datapath): - banklist_data = datapath("io", "data", "banklist.html") + banklist_data = datapath("io", "data", "html", "banklist.html") self.read_html(banklist_data, ".*Water.*", flavor=["lxml", "html5lib"]) def test_to_html_timestamp(self): @@ -1240,7 +1246,7 @@ def run(self): # force import check by reinitalising global vars in html.py reload(pandas.io.html) - filename = datapath("io", "data", "valid_markup.html") + filename = datapath("io", "data", "html", "valid_markup.html") helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py new file mode 100644 index 0000000000000..a1f9c6f6af51a --- /dev/null +++ b/pandas/tests/io/test_orc.py @@ -0,0 +1,227 @@ +""" test orc compat """ +import datetime +import os + +import numpy as np +import pytest + +import pandas as pd +from pandas import read_orc +import pandas._testing as tm + +pytest.importorskip("pyarrow", minversion="0.13.0") +pytest.importorskip("pyarrow.orc") + 
+pytestmark = pytest.mark.filterwarnings( + "ignore:RangeIndex.* is deprecated:DeprecationWarning" +) + + +@pytest.fixture +def dirpath(datapath): + return datapath("io", "data", "orc") + + +def test_orc_reader_empty(dirpath): + columns = [ + "boolean1", + "byte1", + "short1", + "int1", + "long1", + "float1", + "double1", + "bytes1", + "string1", + ] + dtypes = [ + "bool", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "object", + "object", + ] + expected = pd.DataFrame(index=pd.RangeIndex(0)) + for colname, dtype in zip(columns, dtypes): + expected[colname] = pd.Series(dtype=dtype) + + inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") + got = read_orc(inputfile, columns=columns) + + tm.assert_equal(expected, got) + + +def test_orc_reader_basic(dirpath): + data = { + "boolean1": np.array([False, True], dtype="bool"), + "byte1": np.array([1, 100], dtype="int8"), + "short1": np.array([1024, 2048], dtype="int16"), + "int1": np.array([65536, 65536], dtype="int32"), + "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"), + "float1": np.array([1.0, 2.0], dtype="float32"), + "double1": np.array([-15.0, -5.0], dtype="float64"), + "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"), + "string1": np.array(["hi", "bye"], dtype="object"), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc") + got = read_orc(inputfile, columns=data.keys()) + + tm.assert_equal(expected, got) + + +def test_orc_reader_decimal(dirpath): + from decimal import Decimal + + # Only testing the first 10 rows of data + data = { + "_col0": np.array( + [ + Decimal("-1000.50000"), + Decimal("-999.60000"), + Decimal("-998.70000"), + Decimal("-997.80000"), + Decimal("-996.90000"), + Decimal("-995.10000"), + Decimal("-994.11000"), + Decimal("-993.12000"), + Decimal("-992.13000"), + Decimal("-991.14000"), + ], + dtype="object", + ) + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) + + +def test_orc_reader_date_low(dirpath): + data = { + "time": np.array( + [ + "1900-05-05 12:34:56.100000", + "1900-05-05 12:34:56.100100", + "1900-05-05 12:34:56.100200", + "1900-05-05 12:34:56.100300", + "1900-05-05 12:34:56.100400", + "1900-05-05 12:34:56.100500", + "1900-05-05 12:34:56.100600", + "1900-05-05 12:34:56.100700", + "1900-05-05 12:34:56.100800", + "1900-05-05 12:34:56.100900", + ], + dtype="datetime64[ns]", + ), + "date": np.array( + [ + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + datetime.date(1900, 12, 25), + ], + dtype="object", + ), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) + + +def test_orc_reader_date_high(dirpath): + data = { + "time": np.array( + [ + "2038-05-05 12:34:56.100000", + "2038-05-05 12:34:56.100100", + "2038-05-05 12:34:56.100200", + "2038-05-05 12:34:56.100300", + "2038-05-05 12:34:56.100400", + "2038-05-05 12:34:56.100500", + "2038-05-05 12:34:56.100600", + "2038-05-05 12:34:56.100700", + "2038-05-05 12:34:56.100800", + "2038-05-05 12:34:56.100900", + ], + dtype="datetime64[ns]", + ), 
+ "date": np.array( + [ + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + datetime.date(2038, 12, 25), + ], + dtype="object", + ), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) + + +def test_orc_reader_snappy_compressed(dirpath): + data = { + "int1": np.array( + [ + -1160101563, + 1181413113, + 2065821249, + -267157795, + 172111193, + 1752363137, + 1406072123, + 1911809390, + -1308542224, + -467100286, + ], + dtype="int32", + ), + "string1": np.array( + [ + "f50dcb8", + "382fdaaa", + "90758c6", + "9e8caf3f", + "ee97332b", + "d634da1", + "2bea4396", + "d67d89e8", + "ad71007e", + "e8c82066", + ], + dtype="object", + ), + } + expected = pd.DataFrame.from_dict(data) + + inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc") + got = read_orc(inputfile).iloc[:10] + + tm.assert_equal(expected, got) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py deleted file mode 100644 index f8005273319e0..0000000000000 --- a/pandas/tests/io/test_packers.py +++ /dev/null @@ -1,911 +0,0 @@ -import datetime -import glob -from io import BytesIO -import os -from warnings import catch_warnings, filterwarnings - -import numpy as np -import pytest - -from pandas._libs.tslib import iNaT -from pandas.errors import PerformanceWarning - -import pandas -from pandas import ( - Categorical, - DataFrame, - Index, - Interval, - MultiIndex, - NaT, - Period, - Series, - Timestamp, - bdate_range, - date_range, - period_range, -) -import pandas.util.testing as tm - -from pandas.io.packers import read_msgpack, to_msgpack - -nan = np.nan - -try: - import blosc # NOQA -except ImportError: - _BLOSC_INSTALLED = False -else: - _BLOSC_INSTALLED = True - -try: - import zlib # NOQA -except ImportError: - _ZLIB_INSTALLED = False -else: - _ZLIB_INSTALLED = True - - -@pytest.fixture(scope="module") -def current_packers_data(): - # our current version packers data - from pandas.tests.io.generate_legacy_storage_files import create_msgpack_data - - return create_msgpack_data() - - -@pytest.fixture(scope="module") -def all_packers_data(): - # our all of our current version packers data - from pandas.tests.io.generate_legacy_storage_files import create_data - - return create_data() - - -def check_arbitrary(a, b): - - if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)): - assert len(a) == len(b) - for a_, b_ in zip(a, b): - check_arbitrary(a_, b_) - elif isinstance(a, DataFrame): - tm.assert_frame_equal(a, b) - elif isinstance(a, Series): - tm.assert_series_equal(a, b) - elif isinstance(a, Index): - tm.assert_index_equal(a, b) - elif isinstance(a, Categorical): - # Temp, - # Categorical.categories is changed from str to bytes in PY3 - # maybe the same as GH 13591 - if b.categories.inferred_type == "string": - pass - else: - tm.assert_categorical_equal(a, b) - elif a is NaT: - assert b is NaT - elif isinstance(a, Timestamp): - assert a == b - assert a.freq == b.freq - else: - assert a == b - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestPackers: - def setup_method(self, method): - self.path = "__{}__.msg".format(tm.rands(10)) - - def teardown_method(self, method): - pass - - def encode_decode(self, x, 
compress=None, **kwargs): - with tm.ensure_clean(self.path) as p: - to_msgpack(p, x, compress=compress, **kwargs) - return read_msgpack(p, **kwargs) - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestAPI(TestPackers): - def test_string_io(self): - - df = DataFrame(np.random.randn(10, 2)) - s = df.to_msgpack(None) - result = read_msgpack(s) - tm.assert_frame_equal(result, df) - - s = df.to_msgpack() - result = read_msgpack(s) - tm.assert_frame_equal(result, df) - - s = df.to_msgpack() - result = read_msgpack(BytesIO(s)) - tm.assert_frame_equal(result, df) - - s = to_msgpack(None, df) - result = read_msgpack(s) - tm.assert_frame_equal(result, df) - - with tm.ensure_clean(self.path) as p: - - s = df.to_msgpack() - with open(p, "wb") as fh: - fh.write(s) - result = read_msgpack(p) - tm.assert_frame_equal(result, df) - - def test_path_pathlib(self): - df = tm.makeDataFrame() - result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack) - tm.assert_frame_equal(df, result) - - def test_path_localpath(self): - df = tm.makeDataFrame() - result = tm.round_trip_localpath(df.to_msgpack, read_msgpack) - tm.assert_frame_equal(df, result) - - def test_iterator_with_string_io(self): - - dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)] - s = to_msgpack(None, *dfs) - for i, result in enumerate(read_msgpack(s, iterator=True)): - tm.assert_frame_equal(result, dfs[i]) - - def test_invalid_arg(self): - # GH10369 - class A: - def __init__(self): - self.read = 0 - - msg = "Invalid file path or buffer object type: " - invalid_path = os.path.join("nonexistent_dir", "df.msgpack") - with pytest.raises(ValueError, match=msg.format("NoneType")): - read_msgpack(path_or_buf=None) - with pytest.raises(ValueError, match=msg.format("dict")): - read_msgpack(path_or_buf={}) - with pytest.raises(ValueError, match=msg.format(r".*\.A")): - read_msgpack(path_or_buf=A()) - with pytest.raises(FileNotFoundError, match="does not exist"): - read_msgpack(path_or_buf=invalid_path) - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestNumpy(TestPackers): - def test_numpy_scalar_float(self): - x = np.float32(np.random.rand()) - x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) - - def test_numpy_scalar_complex(self): - x = np.complex64(np.random.rand() + 1j * np.random.rand()) - x_rec = self.encode_decode(x) - assert np.allclose(x, x_rec) - - def test_scalar_float(self): - x = np.random.rand() - x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) - - def test_scalar_bool(self): - x = np.bool_(1) - x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) - - x = np.bool_(0) - x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) - - def test_scalar_complex(self): - x = np.random.rand() + 1j * np.random.rand() - x_rec = self.encode_decode(x) - assert np.allclose(x, x_rec) - - def test_list_numpy_float(self): - x = [np.float32(np.random.rand()) for i in range(5)] - x_rec = self.encode_decode(x) - # current msgpack cannot distinguish list/tuple - tm.assert_almost_equal(tuple(x), x_rec) - - x_rec = self.encode_decode(tuple(x)) - tm.assert_almost_equal(tuple(x), x_rec) - - def test_list_numpy_float_complex(self): - if not hasattr(np, "complex128"): - pytest.skip("numpy can not handle complex128") - - x = [np.float32(np.random.rand()) for i in range(5)] + [ - np.complex128(np.random.rand() + 1j * np.random.rand()) for i in range(5) - ] - x_rec = self.encode_decode(x) - assert np.allclose(x, x_rec) - - def test_list_float(self): - x = 
[np.random.rand() for i in range(5)] - x_rec = self.encode_decode(x) - # current msgpack cannot distinguish list/tuple - tm.assert_almost_equal(tuple(x), x_rec) - - x_rec = self.encode_decode(tuple(x)) - tm.assert_almost_equal(tuple(x), x_rec) - - def test_list_float_complex(self): - x = [np.random.rand() for i in range(5)] + [ - (np.random.rand() + 1j * np.random.rand()) for i in range(5) - ] - x_rec = self.encode_decode(x) - assert np.allclose(x, x_rec) - - def test_dict_float(self): - x = {"foo": 1.0, "bar": 2.0} - x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) - - def test_dict_complex(self): - x = {"foo": 1.0 + 1.0j, "bar": 2.0 + 2.0j} - x_rec = self.encode_decode(x) - tm.assert_dict_equal(x, x_rec) - - for key in x: - tm.assert_class_equal(x[key], x_rec[key], obj="complex value") - - def test_dict_numpy_float(self): - x = {"foo": np.float32(1.0), "bar": np.float32(2.0)} - x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) - - def test_dict_numpy_complex(self): - x = {"foo": np.complex128(1.0 + 1.0j), "bar": np.complex128(2.0 + 2.0j)} - x_rec = self.encode_decode(x) - tm.assert_dict_equal(x, x_rec) - - for key in x: - tm.assert_class_equal(x[key], x_rec[key], obj="numpy complex128") - - def test_numpy_array_float(self): - - # run multiple times - for n in range(10): - x = np.random.rand(10) - for dtype in ["float32", "float64"]: - x = x.astype(dtype) - x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) - - def test_numpy_array_complex(self): - x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128) - x_rec = self.encode_decode(x) - assert all(map(lambda x, y: x == y, x, x_rec)) and x.dtype == x_rec.dtype - - def test_list_mixed(self): - x = [1.0, np.float32(3.5), np.complex128(4.25), "foo", np.bool_(1)] - x_rec = self.encode_decode(x) - # current msgpack cannot distinguish list/tuple - tm.assert_almost_equal(tuple(x), x_rec) - - x_rec = self.encode_decode(tuple(x)) - tm.assert_almost_equal(tuple(x), x_rec) - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestBasic(TestPackers): - def test_timestamp(self): - - for i in [ - Timestamp("20130101"), - Timestamp("20130101", tz="US/Eastern"), - Timestamp("201301010501"), - ]: - i_rec = self.encode_decode(i) - assert i == i_rec - - def test_nat(self): - nat_rec = self.encode_decode(NaT) - assert NaT is nat_rec - - def test_datetimes(self): - - for i in [ - datetime.datetime(2013, 1, 1), - datetime.datetime(2013, 1, 1, 5, 1), - datetime.date(2013, 1, 1), - np.datetime64(datetime.datetime(2013, 1, 5, 2, 15)), - ]: - i_rec = self.encode_decode(i) - assert i == i_rec - - def test_timedeltas(self): - - for i in [ - datetime.timedelta(days=1), - datetime.timedelta(days=1, seconds=10), - np.timedelta64(1000000), - ]: - i_rec = self.encode_decode(i) - assert i == i_rec - - def test_periods(self): - # 13463 - for i in [Period("2010-09", "M"), Period("2014-Q1", "Q")]: - i_rec = self.encode_decode(i) - assert i == i_rec - - def test_intervals(self): - # 19967 - for i in [Interval(0, 1), Interval(0, 1, "left"), Interval(10, 25.0, "right")]: - i_rec = self.encode_decode(i) - assert i == i_rec - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestIndex(TestPackers): - def setup_method(self, method): - super().setup_method(method) - - self.d = { - "string": tm.makeStringIndex(100), - "date": tm.makeDateIndex(100), - "int": tm.makeIntIndex(100), - "rng": tm.makeRangeIndex(100), - "float": tm.makeFloatIndex(100), - "empty": Index([]), - "tuple": 
Index(zip(["foo", "bar", "baz"], [1, 2, 3])), - "period": Index(period_range("2012-1-1", freq="M", periods=3)), - "date2": Index(date_range("2013-01-1", periods=10)), - "bdate": Index(bdate_range("2013-01-02", periods=10)), - "cat": tm.makeCategoricalIndex(100), - "interval": tm.makeIntervalIndex(100), - "timedelta": tm.makeTimedeltaIndex(100, "H"), - } - - self.mi = { - "reg": MultiIndex.from_tuples( - [ - ("bar", "one"), - ("baz", "two"), - ("foo", "two"), - ("qux", "one"), - ("qux", "two"), - ], - names=["first", "second"], - ) - } - - def test_basic_index(self): - - for s, i in self.d.items(): - i_rec = self.encode_decode(i) - tm.assert_index_equal(i, i_rec) - - # datetime with no freq (GH5506) - i = Index([Timestamp("20130101"), Timestamp("20130103")]) - i_rec = self.encode_decode(i) - tm.assert_index_equal(i, i_rec) - - # datetime with timezone - i = Index( - [Timestamp("20130101 9:00:00"), Timestamp("20130103 11:00:00")] - ).tz_localize("US/Eastern") - i_rec = self.encode_decode(i) - tm.assert_index_equal(i, i_rec) - - def test_multi_index(self): - - for s, i in self.mi.items(): - i_rec = self.encode_decode(i) - tm.assert_index_equal(i, i_rec) - - def test_unicode(self): - i = tm.makeUnicodeIndex(100) - - i_rec = self.encode_decode(i) - tm.assert_index_equal(i, i_rec) - - def categorical_index(self): - # GH15487 - df = DataFrame(np.random.randn(10, 2)) - df = df.astype({0: "category"}).set_index(0) - result = self.encode_decode(df) - tm.assert_frame_equal(result, df) - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestSeries(TestPackers): - def setup_method(self, method): - super().setup_method(method) - - self.d = {} - - s = tm.makeStringSeries() - s.name = "string" - self.d["string"] = s - - s = tm.makeObjectSeries() - s.name = "object" - self.d["object"] = s - - s = Series(iNaT, dtype="M8[ns]", index=range(5)) - self.d["date"] = s - - data = { - "A": [0.0, 1.0, 2.0, 3.0, np.nan], - "B": [0, 1, 0, 1, 0], - "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], - "D": date_range("1/1/2009", periods=5), - "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], - "F": [Timestamp("20130102", tz="US/Eastern")] * 2 - + [Timestamp("20130603", tz="CET")] * 3, - "G": [Timestamp("20130102", tz="US/Eastern")] * 5, - "H": Categorical([1, 2, 3, 4, 5]), - "I": Categorical([1, 2, 3, 4, 5], ordered=True), - "J": (np.bool_(1), 2, 3, 4, 5), - } - - self.d["float"] = Series(data["A"]) - self.d["int"] = Series(data["B"]) - self.d["mixed"] = Series(data["E"]) - self.d["dt_tz_mixed"] = Series(data["F"]) - self.d["dt_tz"] = Series(data["G"]) - self.d["cat_ordered"] = Series(data["H"]) - self.d["cat_unordered"] = Series(data["I"]) - self.d["numpy_bool_mixed"] = Series(data["J"]) - - def test_basic(self): - - # run multiple times here - for n in range(10): - for s, i in self.d.items(): - i_rec = self.encode_decode(i) - tm.assert_series_equal(i, i_rec) - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestCategorical(TestPackers): - def setup_method(self, method): - super().setup_method(method) - - self.d = {} - - self.d["plain_str"] = Categorical(["a", "b", "c", "d", "e"]) - self.d["plain_str_ordered"] = Categorical( - ["a", "b", "c", "d", "e"], ordered=True - ) - - self.d["plain_int"] = Categorical([5, 6, 7, 8]) - self.d["plain_int_ordered"] = Categorical([5, 6, 7, 8], ordered=True) - - def test_basic(self): - - # run multiple times here - for n in range(10): - for s, i in self.d.items(): - i_rec = self.encode_decode(i) - tm.assert_categorical_equal(i, i_rec) - - 
-@pytest.mark.filterwarnings("ignore:msgpack:FutureWarning") -class TestNDFrame(TestPackers): - def setup_method(self, method): - super().setup_method(method) - - data = { - "A": [0.0, 1.0, 2.0, 3.0, np.nan], - "B": [0, 1, 0, 1, 0], - "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], - "D": date_range("1/1/2009", periods=5), - "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], - "F": [Timestamp("20130102", tz="US/Eastern")] * 5, - "G": [Timestamp("20130603", tz="CET")] * 5, - "H": Categorical(["a", "b", "c", "d", "e"]), - "I": Categorical(["a", "b", "c", "d", "e"], ordered=True), - } - - self.frame = { - "float": DataFrame(dict(A=data["A"], B=Series(data["A"]) + 1)), - "int": DataFrame(dict(A=data["B"], B=Series(data["B"]) + 1)), - "mixed": DataFrame(data), - } - - def test_basic_frame(self): - - for s, i in self.frame.items(): - i_rec = self.encode_decode(i) - tm.assert_frame_equal(i, i_rec) - - def test_multi(self): - - i_rec = self.encode_decode(self.frame) - for k in self.frame.keys(): - tm.assert_frame_equal(self.frame[k], i_rec[k]) - - packed_items = tuple( - [self.frame["float"], self.frame["float"].A, self.frame["float"].B, None] - ) - l_rec = self.encode_decode(packed_items) - check_arbitrary(packed_items, l_rec) - - # this is an oddity in that packed lists will be returned as tuples - packed_items = [ - self.frame["float"], - self.frame["float"].A, - self.frame["float"].B, - None, - ] - l_rec = self.encode_decode(packed_items) - assert isinstance(l_rec, tuple) - check_arbitrary(packed_items, l_rec) - - def test_iterator(self): - - packed_items = [ - self.frame["float"], - self.frame["float"].A, - self.frame["float"].B, - None, - ] - - with tm.ensure_clean(self.path) as path: - to_msgpack(path, *packed_items) - for i, packed in enumerate(read_msgpack(path, iterator=True)): - check_arbitrary(packed, packed_items[i]) - - def tests_datetimeindex_freq_issue(self): - - # GH 5947 - # inferring freq on the datetimeindex - df = DataFrame([1, 2, 3], index=date_range("1/1/2013", "1/3/2013")) - result = self.encode_decode(df) - tm.assert_frame_equal(result, df) - - df = DataFrame([1, 2], index=date_range("1/1/2013", "1/2/2013")) - result = self.encode_decode(df) - tm.assert_frame_equal(result, df) - - def test_dataframe_duplicate_column_names(self): - - # GH 9618 - expected_1 = DataFrame(columns=["a", "a"]) - expected_2 = DataFrame(columns=[1] * 100) - expected_2.loc[0] = np.random.randn(100) - expected_3 = DataFrame(columns=[1, 1]) - expected_3.loc[0] = ["abc", np.nan] - - result_1 = self.encode_decode(expected_1) - result_2 = self.encode_decode(expected_2) - result_3 = self.encode_decode(expected_3) - - tm.assert_frame_equal(result_1, expected_1) - tm.assert_frame_equal(result_2, expected_2) - tm.assert_frame_equal(result_3, expected_3) - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestCompression(TestPackers): - """See https://github.com/pandas-dev/pandas/pull/9783 - """ - - def setup_method(self, method): - try: - from sqlalchemy import create_engine - - self._create_sql_engine = create_engine - except ImportError: - self._SQLALCHEMY_INSTALLED = False - else: - self._SQLALCHEMY_INSTALLED = True - - super().setup_method(method) - data = { - "A": np.arange(1000, dtype=np.float64), - "B": np.arange(1000, dtype=np.int32), - "C": list(100 * "abcdefghij"), - "D": date_range(datetime.datetime(2015, 4, 1), periods=1000), - "E": [datetime.timedelta(days=x) for x in range(1000)], - } - self.frame = { - "float": DataFrame({k: data[k] for k in ["A", "A"]}), - "int": 
DataFrame({k: data[k] for k in ["B", "B"]}), - "mixed": DataFrame(data), - } - - def test_plain(self): - i_rec = self.encode_decode(self.frame) - for k in self.frame.keys(): - tm.assert_frame_equal(self.frame[k], i_rec[k]) - - def _test_compression(self, compress): - i_rec = self.encode_decode(self.frame, compress=compress) - for k in self.frame.keys(): - value = i_rec[k] - expected = self.frame[k] - tm.assert_frame_equal(value, expected) - # make sure that we can write to the new frames - for block in value._data.blocks: - assert block.values.flags.writeable - - def test_compression_zlib(self): - if not _ZLIB_INSTALLED: - pytest.skip("no zlib") - self._test_compression("zlib") - - def test_compression_blosc(self): - if not _BLOSC_INSTALLED: - pytest.skip("no blosc") - self._test_compression("blosc") - - def _test_compression_warns_when_decompress_caches(self, monkeypatch, compress): - not_garbage = [] - control = [] # copied data - - compress_module = globals()[compress] - real_decompress = compress_module.decompress - - def decompress(ob): - """mock decompress function that delegates to the real - decompress but caches the result and a copy of the result. - """ - res = real_decompress(ob) - not_garbage.append(res) # hold a reference to this bytes object - control.append(bytearray(res)) # copy the data here to check later - return res - - # types mapped to values to add in place. - rhs = { - np.dtype("float64"): 1.0, - np.dtype("int32"): 1, - np.dtype("object"): "a", - np.dtype("datetime64[ns]"): np.timedelta64(1, "ns"), - np.dtype("timedelta64[ns]"): np.timedelta64(1, "ns"), - } - - with monkeypatch.context() as m, tm.assert_produces_warning( - PerformanceWarning - ) as ws: - m.setattr(compress_module, "decompress", decompress) - - with catch_warnings(): - filterwarnings("ignore", category=FutureWarning) - i_rec = self.encode_decode(self.frame, compress=compress) - for k in self.frame.keys(): - - value = i_rec[k] - expected = self.frame[k] - tm.assert_frame_equal(value, expected) - # make sure that we can write to the new frames even though - # we needed to copy the data - for block in value._data.blocks: - assert block.values.flags.writeable - # mutate the data in some way - block.values[0] += rhs[block.dtype] - - for w in ws: - # check the messages from our warnings - assert str(w.message) == ( - "copying data after decompressing; " - "this may mean that decompress is " - "caching its result" - ) - - for buf, control_buf in zip(not_garbage, control): - # make sure none of our mutations above affected the - # original buffers - assert buf == control_buf - - def test_compression_warns_when_decompress_caches_zlib(self, monkeypatch): - if not _ZLIB_INSTALLED: - pytest.skip("no zlib") - self._test_compression_warns_when_decompress_caches(monkeypatch, "zlib") - - def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch): - if not _BLOSC_INSTALLED: - pytest.skip("no blosc") - self._test_compression_warns_when_decompress_caches(monkeypatch, "blosc") - - def _test_small_strings_no_warn(self, compress): - empty = np.array([], dtype="uint8") - with tm.assert_produces_warning(None): - with catch_warnings(): - filterwarnings("ignore", category=FutureWarning) - empty_unpacked = self.encode_decode(empty, compress=compress) - - tm.assert_numpy_array_equal(empty_unpacked, empty) - assert empty_unpacked.flags.writeable - - char = np.array([ord(b"a")], dtype="uint8") - with tm.assert_produces_warning(None): - with catch_warnings(): - filterwarnings("ignore", category=FutureWarning) - 
char_unpacked = self.encode_decode(char, compress=compress) - - tm.assert_numpy_array_equal(char_unpacked, char) - assert char_unpacked.flags.writeable - # if this test fails I am sorry because the interpreter is now in a - # bad state where b'a' points to 98 == ord(b'b'). - char_unpacked[0] = ord(b"b") - - # we compare the ord of bytes b'a' with unicode 'a' because the should - # always be the same (unless we were able to mutate the shared - # character singleton in which case ord(b'a') == ord(b'b'). - assert ord(b"a") == ord("a") - tm.assert_numpy_array_equal(char_unpacked, np.array([ord(b"b")], dtype="uint8")) - - def test_small_strings_no_warn_zlib(self): - if not _ZLIB_INSTALLED: - pytest.skip("no zlib") - self._test_small_strings_no_warn("zlib") - - def test_small_strings_no_warn_blosc(self): - if not _BLOSC_INSTALLED: - pytest.skip("no blosc") - self._test_small_strings_no_warn("blosc") - - def test_readonly_axis_blosc(self): - # GH11880 - if not _BLOSC_INSTALLED: - pytest.skip("no blosc") - df1 = DataFrame({"A": list("abcd")}) - df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0]) - assert 1 in self.encode_decode(df1["A"], compress="blosc") - assert 1.0 in self.encode_decode(df2["A"], compress="blosc") - - def test_readonly_axis_zlib(self): - # GH11880 - df1 = DataFrame({"A": list("abcd")}) - df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0]) - assert 1 in self.encode_decode(df1["A"], compress="zlib") - assert 1.0 in self.encode_decode(df2["A"], compress="zlib") - - def test_readonly_axis_blosc_to_sql(self): - # GH11880 - if not _BLOSC_INSTALLED: - pytest.skip("no blosc") - if not self._SQLALCHEMY_INSTALLED: - pytest.skip("no sqlalchemy") - expected = DataFrame({"A": list("abcd")}) - df = self.encode_decode(expected, compress="blosc") - eng = self._create_sql_engine("sqlite:///:memory:") - df.to_sql("test", eng, if_exists="append") - result = pandas.read_sql_table("test", eng, index_col="index") - result.index.names = [None] - tm.assert_frame_equal(expected, result) - - def test_readonly_axis_zlib_to_sql(self): - # GH11880 - if not _ZLIB_INSTALLED: - pytest.skip("no zlib") - if not self._SQLALCHEMY_INSTALLED: - pytest.skip("no sqlalchemy") - expected = DataFrame({"A": list("abcd")}) - df = self.encode_decode(expected, compress="zlib") - eng = self._create_sql_engine("sqlite:///:memory:") - df.to_sql("test", eng, if_exists="append") - result = pandas.read_sql_table("test", eng, index_col="index") - result.index.names = [None] - tm.assert_frame_equal(expected, result) - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestEncoding(TestPackers): - def setup_method(self, method): - super().setup_method(method) - data = { - "A": ["\u2019"] * 1000, - "B": np.arange(1000, dtype=np.int32), - "C": list(100 * "abcdefghij"), - "D": date_range(datetime.datetime(2015, 4, 1), periods=1000), - "E": [datetime.timedelta(days=x) for x in range(1000)], - "G": [400] * 1000, - } - self.frame = { - "float": DataFrame({k: data[k] for k in ["A", "A"]}), - "int": DataFrame({k: data[k] for k in ["B", "B"]}), - "mixed": DataFrame(data), - } - self.utf_encodings = ["utf8", "utf16", "utf32"] - - def test_utf(self): - # GH10581 - for encoding in self.utf_encodings: - for frame in self.frame.values(): - result = self.encode_decode(frame, encoding=encoding) - tm.assert_frame_equal(result, frame) - - def test_default_encoding(self): - for frame in self.frame.values(): - result = frame.to_msgpack() - expected = frame.to_msgpack(encoding="utf8") - assert result == expected - result = 
self.encode_decode(frame) - tm.assert_frame_equal(result, frame) - - -files = glob.glob( - os.path.join(os.path.dirname(__file__), "data", "legacy_msgpack", "*", "*.msgpack") -) - - -@pytest.fixture(params=files) -def legacy_packer(request, datapath): - return datapath(request.param) - - -@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") -class TestMsgpack: - """ - How to add msgpack tests: - - 1. Install pandas version intended to output the msgpack. - 2. Execute "generate_legacy_storage_files.py" to create the msgpack. - $ python generate_legacy_storage_files.py msgpack - - 3. Move the created pickle to "data/legacy_msgpack/" directory. - """ - - minimum_structure = { - "series": ["float", "int", "mixed", "ts", "mi", "dup"], - "frame": ["float", "int", "mixed", "mi"], - "index": ["int", "date", "period"], - "mi": ["reg2"], - } - - def check_min_structure(self, data, version): - for typ, v in self.minimum_structure.items(): - - assert typ in data, '"{0}" not found in unpacked data'.format(typ) - for kind in v: - msg = '"{0}" not found in data["{1}"]'.format(kind, typ) - assert kind in data[typ], msg - - def compare(self, current_data, all_data, vf, version): - data = read_msgpack(vf) - - self.check_min_structure(data, version) - for typ, dv in data.items(): - assert typ in all_data, "unpacked data contains " 'extra key "{0}"'.format( - typ - ) - for dt, result in dv.items(): - assert ( - dt in current_data[typ] - ), 'data["{0}"] contains extra ' 'key "{1}"'.format(typ, dt) - try: - expected = current_data[typ][dt] - except KeyError: - continue - - # use a specific comparator - # if available - comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt) - comparator = getattr(self, comp_method, None) - if comparator is not None: - comparator(result, expected, typ, version) - else: - check_arbitrary(result, expected) - - return data - - def compare_series_dt_tz(self, result, expected, typ, version): - tm.assert_series_equal(result, expected) - - def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): - tm.assert_frame_equal(result, expected) - - def test_msgpacks_legacy( - self, current_packers_data, all_packers_data, legacy_packer, datapath - ): - - version = os.path.basename(os.path.dirname(legacy_packer)) - - try: - with catch_warnings(record=True): - self.compare( - current_packers_data, all_packers_data, legacy_packer, version - ) - except ImportError: - # blosc not installed - pass - - def test_msgpack_period_freq(self): - # https://github.com/pandas-dev/pandas/issues/24135 - s = Series(np.random.rand(5), index=date_range("20130101", periods=5)) - r = read_msgpack(s.to_msgpack()) - repr(r) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index debc797fe6e88..d51c712ed5abd 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -10,7 +10,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parquet import ( FastParquetImpl, @@ -405,7 +405,7 @@ def test_write_ignoring_index(self, engine): ["one", "two", "one", "two", "one", "two", "one", "two"], ] df = pd.DataFrame( - {"one": [i for i in range(8)], "two": [-i for i in range(8)]}, index=arrays + {"one": list(range(8)), "two": [-i for i in range(8)]}, index=arrays ) expected = df.reset_index(drop=True) @@ -443,11 +443,12 @@ def test_duplicate_columns(self, pa): self.check_error_on_write(df, pa, ValueError) def test_unsupported(self, pa): - # period - df = 
pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) - # pyarrow 0.11 raises ArrowTypeError - # older pyarrows raise ArrowInvalid - self.check_error_on_write(df, pa, Exception) + if LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"): + # period - will be supported using an extension type with pyarrow 1.0 + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # pyarrow 0.11 raises ArrowTypeError + # older pyarrows raise ArrowInvalid + self.check_error_on_write(df, pa, Exception) # timedelta df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) @@ -499,11 +500,32 @@ def test_partition_cols_supported(self, pa, df_full): assert len(dataset.partitions.partition_names) == 2 assert dataset.partitions.partition_names == set(partition_cols) + def test_partition_cols_string(self, pa, df_full): + # GH #27117 + partition_cols = "bool" + partition_cols_list = [partition_cols] + df = df_full + with tm.ensure_clean_dir() as path: + df.to_parquet(path, partition_cols=partition_cols, compression=None) + import pyarrow.parquet as pq + + dataset = pq.ParquetDataset(path, validate_schema=False) + assert len(dataset.partitions.partition_names) == 1 + assert dataset.partitions.partition_names == set(partition_cols_list) + def test_empty_dataframe(self, pa): # GH #27339 df = pd.DataFrame() check_round_trip(df, pa) + def test_write_with_schema(self, pa): + import pyarrow + + df = pd.DataFrame({"x": [0, 1]}) + schema = pyarrow.schema([pyarrow.field("x", type=pyarrow.bool_())]) + out_df = df.astype(bool) + check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) + @td.skip_if_no("pyarrow", min_version="0.15.0") def test_additional_extension_arrays(self, pa): # test additional ExtensionArrays that are supported through the @@ -514,18 +536,37 @@ def test_additional_extension_arrays(self, pa): "b": pd.Series(["a", None, "c"], dtype="string"), } ) - # currently de-serialized as plain int / object - expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): + expected = df + else: + # de-serialized as plain int / object + expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) check_round_trip(df, pa, expected=expected) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) - # if missing values in integer, currently de-serialized as float - expected = df.assign(a=df.a.astype("float64")) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.1.dev"): + expected = df + else: + # if missing values in integer, currently de-serialized as float + expected = df.assign(a=df.a.astype("float64")) check_round_trip(df, pa, expected=expected) + @td.skip_if_no("pyarrow", min_version="0.15.1.dev") + def test_additional_extension_types(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ protocol + by defining a custom ExtensionType + df = pd.DataFrame( + { + # Arrow does not yet support struct in writing to Parquet (ARROW-1644) + # "c": pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (3, 4)]), + "d": pd.period_range("2012-01-01", periods=3, freq="D"), + } + ) + check_round_trip(df, pa) + class TestParquetFastParquet(Base): - @td.skip_if_no("fastparquet", min_version="0.2.1") + @td.skip_if_no("fastparquet", min_version="0.3.2") def test_basic(self, fp, df_full): df = df_full @@ -588,6 +629,23 @@ def test_partition_cols_supported(self, fp, df_full): actual_partition_cols = fastparquet.ParquetFile(path, 
False).cats assert len(actual_partition_cols) == 2 + def test_partition_cols_string(self, fp, df_full): + # GH #27117 + partition_cols = "bool" + df = df_full + with tm.ensure_clean_dir() as path: + df.to_parquet( + path, + engine="fastparquet", + partition_cols=partition_cols, + compression=None, + ) + assert os.path.exists(path) + import fastparquet # noqa: F811 + + actual_partition_cols = fastparquet.ParquetFile(path, False).cats + assert len(actual_partition_cols) == 1 + def test_partition_on_supported(self, fp, df_full): # GH #23283 partition_cols = ["bool", "int"] diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index edd0b09185e71..3d427dde573af 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -22,10 +22,11 @@ import pytest from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian +import pandas.util._test_decorators as td import pandas as pd from pandas import Index -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Day, MonthEnd @@ -202,23 +203,25 @@ def test_legacy_sparse_warning(datapath): Generated with >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [0, 0, 1, 1]}).to_sparse() - >>> df.to_pickle("pandas/tests/io/data/sparseframe-0.20.3.pickle.gz", + >>> df.to_pickle("pandas/tests/io/data/pickle/sparseframe-0.20.3.pickle.gz", ... compression="gzip") >>> s = df['B'] - >>> s.to_pickle("pandas/tests/io/data/sparseseries-0.20.3.pickle.gz", + >>> s.to_pickle("pandas/tests/io/data/pickle/sparseseries-0.20.3.pickle.gz", ... compression="gzip") """ with tm.assert_produces_warning(FutureWarning): simplefilter("ignore", DeprecationWarning) # from boto pd.read_pickle( - datapath("io", "data", "sparseseries-0.20.3.pickle.gz"), compression="gzip" + datapath("io", "data", "pickle", "sparseseries-0.20.3.pickle.gz"), + compression="gzip", ) with tm.assert_produces_warning(FutureWarning): simplefilter("ignore", DeprecationWarning) # from boto pd.read_pickle( - datapath("io", "data", "sparseframe-0.20.3.pickle.gz"), compression="gzip" + datapath("io", "data", "pickle", "sparseframe-0.20.3.pickle.gz"), + compression="gzip", ) @@ -377,3 +380,110 @@ def test_read(self, protocol, get_random_path): df.to_pickle(path, protocol=protocol) df2 = pd.read_pickle(path) tm.assert_frame_equal(df, df2) + + +def test_unicode_decode_error(datapath): + # pickle file written with py27, should be readable without raising + # UnicodeDecodeError, see GH#28645 + path = datapath("io", "data", "pickle", "test_py27.pkl") + df = pd.read_pickle(path) + + # just test the columns are correct since the values are random + excols = pd.Index(["a", "b", "c"]) + tm.assert_index_equal(df.columns, excols) + + +# --------------------- +# tests for buffer I/O +# --------------------- + + +def test_pickle_buffer_roundtrip(): + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + with open(path, "wb") as fh: + df.to_pickle(fh) + with open(path, "rb") as fh: + result = pd.read_pickle(fh) + tm.assert_frame_equal(df, result) + + +# --------------------- +# tests for URL I/O +# --------------------- + + +@pytest.mark.parametrize( + "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"] +) +def test_pickle_generalurl_read(monkeypatch, mockurl): + def python_pickler(obj, path): + with open(path, "wb") as fh: + pickle.dump(obj, fh, protocol=-1) + + class MockReadResponse: + def __init__(self, path): + self.file = open(path, "rb") + if "gzip" in path: + self.headers = {"Content-Encoding": 
"gzip"} + else: + self.headers = {"Content-Encoding": None} + + def read(self): + return self.file.read() + + def close(self): + return self.file.close() + + with tm.ensure_clean() as path: + + def mock_urlopen_read(*args, **kwargs): + return MockReadResponse(path) + + df = tm.makeDataFrame() + python_pickler(df, path) + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("gcsfs") +@pytest.mark.parametrize("mockurl", ["gs://gcs.com", "gcs://gcs.com"]) +def test_pickle_gcsurl_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockGCSFileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or None + f = open(path, mode) + return f + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("s3fs") +@pytest.mark.parametrize("mockurl", ["s3://s3.com", "s3n://s3.com", "s3a://s3.com"]) +def test_pickle_s3url_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockS3FileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or None + f = open(path, mode) + return f + + monkeypatch.setattr("s3fs.S3FileSystem", MockS3FileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index ca84156d104fc..013f56f83c5ec 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -2,14 +2,14 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm pyreadstat = pytest.importorskip("pyreadstat") def test_spss_labelled_num(datapath): # test file from the Haven project (https://haven.tidyverse.org/) - fname = datapath("io", "data", "labelled-num.sav") + fname = datapath("io", "data", "spss", "labelled-num.sav") df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) @@ -23,7 +23,7 @@ def test_spss_labelled_num(datapath): def test_spss_labelled_num_na(datapath): # test file from the Haven project (https://haven.tidyverse.org/) - fname = datapath("io", "data", "labelled-num-na.sav") + fname = datapath("io", "data", "spss", "labelled-num-na.sav") df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"VAR00002": ["This is one", None]}) @@ -37,7 +37,7 @@ def test_spss_labelled_num_na(datapath): def test_spss_labelled_str(datapath): # test file from the Haven project (https://haven.tidyverse.org/) - fname = datapath("io", "data", "labelled-str.sav") + fname = datapath("io", "data", "spss", "labelled-str.sav") df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"gender": ["Male", "Female"]}) @@ -51,7 +51,7 @@ def test_spss_labelled_str(datapath): def test_spss_umlauts(datapath): # test file from the Haven project (https://haven.tidyverse.org/) - fname = datapath("io", "data", "umlauts.sav") + fname = datapath("io", "data", "spss", "umlauts.sav") df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame( @@ -67,7 +67,7 @@ def test_spss_umlauts(datapath): def test_spss_usecols(datapath): # usecols must be list-like - fname = datapath("io", "data", "labelled-num.sav") + fname = datapath("io", "data", "spss", 
"labelled-num.sav") with pytest.raises(TypeError, match="usecols must be list-like."): pd.read_spss(fname, usecols="VAR00002") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 183a47c6039ec..45b3e839a08d1 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -41,7 +41,7 @@ to_datetime, to_timedelta, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.sql as sql from pandas.io.sql import read_sql_query, read_sql_table @@ -215,9 +215,7 @@ def teardown_method(self, method): class MySQLMixIn(MixInBase): def drop_table(self, table_name): cur = self.conn.cursor() - cur.execute( - "DROP TABLE IF EXISTS {}".format(sql._get_valid_mysql_name(table_name)) - ) + cur.execute(f"DROP TABLE IF EXISTS {sql._get_valid_mysql_name(table_name)}") self.conn.commit() def _get_all_tables(self): @@ -237,7 +235,7 @@ def _close_conn(self): class SQLiteMixIn(MixInBase): def drop_table(self, table_name): self.conn.execute( - "DROP TABLE IF EXISTS {}".format(sql._get_valid_sqlite_name(table_name)) + f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}" ) self.conn.commit() @@ -275,7 +273,7 @@ def _get_exec(self): else: return self.conn.cursor() - @pytest.fixture(params=[("io", "data", "iris.csv")]) + @pytest.fixture(params=[("data", "iris.csv")]) def load_iris_data(self, datapath, request): import io @@ -405,11 +403,7 @@ def _load_raw_sql(self): def _count_rows(self, table_name): result = ( self._get_exec() - .execute( - "SELECT count(*) AS count_1 FROM {table_name}".format( - table_name=table_name - ) - ) + .execute(f"SELECT count(*) AS count_1 FROM {table_name}") .fetchone() ) return result[0] @@ -583,7 +577,7 @@ class _TestSQLApi(PandasSQLTest): """ flavor = "sqlite" - mode = None # type: str + mode: str def setup_connect(self): self.conn = self.connect() @@ -1207,7 +1201,7 @@ def _get_sqlite_column_type(self, schema, column): for col in schema.split("\n"): if col.split()[0].strip('""') == column: return col.split()[1] - raise ValueError("Column {column} not found".format(column=column)) + raise ValueError(f"Column {column} not found") def test_sqlite_type_mapping(self): @@ -1234,7 +1228,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): """ - flavor = None # type: str + flavor: str @pytest.fixture(autouse=True, scope="class") def setup_class(cls): @@ -1272,7 +1266,7 @@ def setup_connect(self): # to test if connection can be made: self.conn.connect() except sqlalchemy.exc.OperationalError: - pytest.skip("Can't connect to {0} server".format(self.flavor)) + pytest.skip(f"Can't connect to {self.flavor} server") def test_read_sql(self): self._read_sql_iris() @@ -1414,7 +1408,7 @@ def check(col): else: raise AssertionError( - "DateCol loaded with incorrect type -> {0}".format(col.dtype) + f"DateCol loaded with incorrect type -> {col.dtype}" ) # GH11216 @@ -2051,15 +2045,13 @@ def psql_insert_copy(table, conn, keys, data_iter): writer.writerows(data_iter) s_buf.seek(0) - columns = ", ".join('"{}"'.format(k) for k in keys) + columns = ", ".join(f'"{k}"' for k in keys) if table.schema: - table_name = "{}.{}".format(table.schema, table.name) + table_name = f"{table.schema}.{table.name}" else: table_name = table.name - sql_query = "COPY {} ({}) FROM STDIN WITH CSV".format( - table_name, columns - ) + sql_query = f"COPY {table_name} ({columns}) FROM STDIN WITH CSV" cur.copy_expert(sql=sql_query, file=s_buf) expected = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) @@ -2199,14 +2191,12 @@ def 
test_datetime_time(self): def _get_index_columns(self, tbl_name): ixs = sql.read_sql_query( "SELECT * FROM sqlite_master WHERE type = 'index' " - + "AND tbl_name = '{tbl_name}'".format(tbl_name=tbl_name), + + f"AND tbl_name = '{tbl_name}'", self.conn, ) ix_cols = [] for ix_name in ixs.name: - ix_info = sql.read_sql_query( - "PRAGMA index_info({ix_name})".format(ix_name=ix_name), self.conn - ) + ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", self.conn) ix_cols.append(ix_info.name.tolist()) return ix_cols @@ -2217,15 +2207,11 @@ def test_transactions(self): self._transaction_test() def _get_sqlite_column_type(self, table, column): - recs = self.conn.execute("PRAGMA table_info({table})".format(table=table)) + recs = self.conn.execute(f"PRAGMA table_info({table})") for cid, name, ctype, not_null, default, pk in recs: if name == column: return ctype - raise ValueError( - "Table {table}, column {column} not found".format( - table=table, column=column - ) - ) + raise ValueError(f"Table {table}, column {column} not found") def test_dtype(self): if self.flavor == "mysql": @@ -2295,7 +2281,7 @@ def test_illegal_names(self): sql.table_exists(weird_name, self.conn) df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) - c_tbl = "test_weird_col_name{ndx:d}".format(ndx=ndx) + c_tbl = f"test_weird_col_name{ndx:d}" df2.to_sql(c_tbl, self.conn) sql.table_exists(c_tbl, self.conn) @@ -2500,7 +2486,7 @@ def test_if_exists(self): df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) table_name = "table_if_exists" - sql_select = "SELECT * FROM {table_name}".format(table_name=table_name) + sql_select = f"SELECT * FROM {table_name}" def clean_up(test_table_to_drop): """ @@ -2788,7 +2774,7 @@ def test_if_exists(self): df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) table_name = "table_if_exists" - sql_select = "SELECT * FROM {table_name}".format(table_name=table_name) + sql_select = f"SELECT * FROM {table_name}" def clean_up(test_table_to_drop): """ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a0ec06a2197ae..1d3cddbf01738 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1,4 +1,3 @@ -from collections import OrderedDict import datetime as dt from datetime import datetime import gzip @@ -13,8 +12,8 @@ from pandas.core.dtypes.common import is_categorical_dtype import pandas as pd +import pandas._testing as tm from pandas.core.frame import DataFrame, Series -import pandas.util.testing as tm from pandas.io.parsers import read_csv from pandas.io.stata import ( @@ -22,13 +21,14 @@ PossiblePrecisionLoss, StataMissingValue, StataReader, + StataWriter118, read_stata, ) @pytest.fixture def dirpath(datapath): - return datapath("io", "data") + return datapath("io", "data", "stata") @pytest.fixture @@ -42,7 +42,7 @@ def parsed_114(dirpath): class TestStata: @pytest.fixture(autouse=True) def setup_method(self, datapath): - self.dirpath = datapath("io", "data") + self.dirpath = datapath("io", "data", "stata") self.dta1_114 = os.path.join(self.dirpath, "stata1_114.dta") self.dta1_117 = os.path.join(self.dirpath, "stata1_117.dta") @@ -121,16 +121,6 @@ def test_read_empty_dta(self, version): empty_ds2 = read_stata(path) tm.assert_frame_equal(empty_ds, empty_ds2) - def test_data_method(self): - # Minimal testing of legacy data method - with StataReader(self.dta1_114) as 
rdr: - with tm.assert_produces_warning(UserWarning): - parsed_114_data = rdr.data() - - with StataReader(self.dta1_114) as rdr: - parsed_114_read = rdr.read() - tm.assert_frame_equal(parsed_114_data, parsed_114_read) - @pytest.mark.parametrize("file", ["dta1_114", "dta1_117"]) def test_read_dta1(self, file): @@ -383,8 +373,7 @@ def test_encoding(self, version): # GH 4626, proper encoding handling raw = read_stata(self.dta_encoding) - with tm.assert_produces_warning(FutureWarning): - encoded = read_stata(self.dta_encoding, encoding="latin-1") + encoded = read_stata(self.dta_encoding) result = encoded.kreis1849[0] expected = raw.kreis1849[0] @@ -392,10 +381,7 @@ def test_encoding(self, version): assert isinstance(result, str) with tm.ensure_clean() as path: - with tm.assert_produces_warning(FutureWarning): - encoded.to_stata( - path, write_index=False, version=version, encoding="latin-1" - ) + encoded.to_stata(path, write_index=False, version=version) reread_encoded = read_stata(path) tm.assert_frame_equal(encoded, reread_encoded) @@ -1033,7 +1019,7 @@ def test_categorical_order(self, file): cols.append((col, pd.Categorical.from_codes(codes, labels))) else: cols.append((col, pd.Series(labels, dtype=np.float32))) - expected = DataFrame.from_dict(OrderedDict(cols)) + expected = DataFrame.from_dict(dict(cols)) # Read with and with out categoricals, ensure order is identical file = getattr(self, file) @@ -1286,11 +1272,9 @@ def test_invalid_variable_labels(self, version): variable_labels["a"] = "invalid character Œ" with tm.ensure_clean() as path: - msg = ( - "Variable labels must contain only characters that can be" - " encoded in Latin-1" - ) - with pytest.raises(ValueError, match=msg): + with pytest.raises( + ValueError, match="Variable labels must contain only characters" + ): original.to_stata( path, variable_labels=variable_labels, version=version ) @@ -1312,8 +1296,8 @@ def test_write_variable_label_errors(self): } msg = ( - "Variable labels must contain only characters that can be" - " encoded in Latin-1" + "Variable labels must contain only characters that can be " + "encoded in Latin-1" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1440,8 +1424,8 @@ def test_out_of_range_double(self): } ) msg = ( - r"Column ColumnTooBig has a maximum value \(.+\)" - r" outside the range supported by Stata \(.+\)" + r"Column ColumnTooBig has a maximum value \(.+\) outside the range " + r"supported by Stata \(.+\)" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1449,8 +1433,8 @@ def test_out_of_range_double(self): df.loc[2, "ColumnTooBig"] = np.inf msg = ( - "Column ColumnTooBig has a maximum value of infinity which" - " is outside the range supported by Stata" + "Column ColumnTooBig has a maximum value of infinity which is outside " + "the range supported by Stata" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1483,8 +1467,8 @@ def test_out_of_range_float(self): original.loc[2, "ColumnTooBig"] = np.inf msg = ( - "Column ColumnTooBig has a maximum value of infinity which" - " is outside the range supported by Stata" + "Column ColumnTooBig has a maximum value of infinity which " + "is outside the range supported by Stata" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1721,15 +1705,7 @@ def test_all_none_exception(self, version): output = pd.DataFrame(output) output.loc[:, "none"] = None with tm.ensure_clean() as path: - msg = ( - r"Column `none` cannot be 
exported\.\n\n" - "Only string-like object arrays containing all strings or a" - r" mix of strings and None can be exported\. Object arrays" - r" containing only null values are prohibited\. Other" - " object typescannot be exported and must first be" - r" converted to one of the supported types\." - ) - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match="Column `none` cannot be exported"): output.to_stata(path, version=version) @pytest.mark.parametrize("version", [114, 117]) @@ -1793,3 +1769,41 @@ def test_stata_119(self): assert df.iloc[0, 7] == 3.14 assert df.iloc[0, -1] == 1 assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21)) + + def test_118_writer(self): + cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) + data = pd.DataFrame( + [ + [1.0, 1, "ᴬ", "ᴀ relatively long ŝtring"], + [2.0, 2, "ᴮ", ""], + [3.0, 3, "ᴰ", None], + ], + columns=["a", "β", "ĉ", "strls"], + ) + data["ᴐᴬᵀ"] = cat + variable_labels = { + "a": "apple", + "β": "ᵈᵉᵊ", + "ĉ": "ᴎტჄႲႳႴႶႺ", + "strls": "Long Strings", + "ᴐᴬᵀ": "", + } + data_label = "ᴅaᵀa-label" + data["β"] = data["β"].astype(np.int32) + with tm.ensure_clean() as path: + writer = StataWriter118( + path, + data, + data_label=data_label, + convert_strl=["strls"], + variable_labels=variable_labels, + write_index=False, + ) + writer.write_file() + reread_encoded = read_stata(path) + # Missing is intentionally converted to empty strl + data["strls"] = data["strls"].fillna("") + tm.assert_frame_equal(data, reread_encoded) + reader = StataReader(path) + assert reader.data_label == data_label + assert reader.variable_labels() == variable_labels diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 86cb7fc57b225..9f43027836eb4 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm """ diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index 41b1a88b15acb..9025f8c361a82 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -9,7 +9,7 @@ import pandas dummy_backend = types.ModuleType("pandas_dummy_backend") -setattr(dummy_backend, "plot", lambda *args, **kwargs: None) +setattr(dummy_backend, "plot", lambda *args, **kwargs: "used_dummy") @pytest.fixture @@ -38,6 +38,14 @@ def test_backend_is_correct(monkeypatch, restore_backend): ) +def test_backend_can_be_set_in_plot_call(monkeypatch, restore_backend): + monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend) + df = pandas.DataFrame([1, 2, 3]) + + assert pandas.get_option("plotting.backend") == "matplotlib" + assert df.plot(backend="pandas_dummy_backend") == "used_dummy" + + @td.skip_if_no_mpl def test_register_entrypoint(restore_backend): @@ -86,3 +94,11 @@ def test_setting_backend_without_plot_raises(): def test_no_matplotlib_ok(): with pytest.raises(ImportError): pandas.plotting._core._get_plot_backend("matplotlib") + + +def test_extra_kinds_ok(monkeypatch, restore_backend): + # https://github.com/pandas-dev/pandas/pull/28647 + monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend) + pandas.set_option("plotting.backend", "pandas_dummy_backend") + df = pandas.DataFrame({"A": [1, 2, 3]}) + df.plot(kind="not a real kind") diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 
116d924f5a596..8ee279f0e1f38 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -10,8 +10,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, MultiIndex, Series, date_range, timedelta_range +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm import pandas.plotting as plotting diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index ccc2afbb8b824..9cd3ccbf9214e 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -10,7 +10,7 @@ from pandas.compat.numpy import np_datetime64_compat from pandas import Index, Period, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm from pandas.plotting import ( deregister_matplotlib_converters, @@ -22,7 +22,7 @@ from pandas.plotting._matplotlib import converter except ImportError: # try / except, rather than skip, to avoid internal refactoring - # causing an improprer skip + # causing an improper skip pass pytest.importorskip("matplotlib.pyplot") @@ -66,11 +66,10 @@ def test_registering_no_warning(self): # Set to the "warn" state, in case this isn't the first test run register_matplotlib_converters() - with tm.assert_produces_warning(None) as w: + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + # GH#30588 DeprecationWarning from 2D indexing ax.plot(s.index, s.values) - assert len(w) == 0 - def test_pandas_plots_register(self): pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) @@ -84,7 +83,7 @@ def test_matplotlib_formatters(self): units = pytest.importorskip("matplotlib.units") # Can't make any assertion about the start state. - # We we check that toggling converters off remvoes it, and toggling it + # We we check that toggling converters off removes it, and toggling it # on restores it. 
with cf.option_context("plotting.matplotlib.register_converters", True): @@ -101,19 +100,16 @@ def test_option_no_warning(self): # Test without registering first, no warning with ctx: - with tm.assert_produces_warning(None) as w: + # GH#30588 DeprecationWarning from 2D indexing on Index + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): ax.plot(s.index, s.values) - assert len(w) == 0 - # Now test with registering register_matplotlib_converters() with ctx: - with tm.assert_produces_warning(None) as w: + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): ax.plot(s.index, s.values) - assert len(w) == 0 - def test_registry_resets(self): units = pytest.importorskip("matplotlib.units") dates = pytest.importorskip("matplotlib.dates") @@ -139,15 +135,6 @@ def test_registry_resets(self): for k, v in original.items(): units.registry[k] = v - def test_old_import_warns(self): - with tm.assert_produces_warning(FutureWarning) as w: - from pandas.tseries import converter - - converter.register() - - assert len(w) - assert "pandas.plotting.register_matplotlib_converters" in str(w[0].message) - class TestDateTimeConverter: def setup_method(self, method): @@ -277,7 +264,7 @@ def _assert_less(ts1, ts2): val1 = self.dtc.convert(ts1, None, None) val2 = self.dtc.convert(ts2, None, None) if not val1 < val2: - raise AssertionError("{0} is not less than {1}.".format(val1, val2)) + raise AssertionError(f"{val1} is not less than {val2}.") # Matplotlib's time representation using floats cannot distinguish # intervals smaller than ~10 microsecond in the common range of years. diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 973bda8292b2a..8f855fd0c6cff 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -9,12 +9,12 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Index, NaT, Series, isna +import pandas._testing as tm from pandas.core.indexes.datetimes import bdate_range, date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.resample import DatetimeIndex from pandas.tests.plotting.common import TestPlotBase -import pandas.util.testing as tm from pandas.tseries.offsets import DateOffset @@ -99,33 +99,12 @@ def test_nonnumeric_exclude(self): with pytest.raises(TypeError, match=msg): df["A"].plot() - def test_tsplot_deprecated(self): - from pandas.tseries.plotting import tsplot - - _, ax = self.plt.subplots() - ts = tm.makeTimeSeries() - - with tm.assert_produces_warning(FutureWarning): - tsplot(ts, self.plt.Axes.plot, ax=ax) - @pytest.mark.slow def test_tsplot(self): - from pandas.tseries.plotting import tsplot - _, ax = self.plt.subplots() ts = tm.makeTimeSeries() - def f(*args, **kwds): - with tm.assert_produces_warning(FutureWarning): - return tsplot(s, self.plt.Axes.plot, *args, **kwds) - - for s in self.period_ser: - _check_plot_works(f, s.index.freq, ax=ax, series=s) - - for s in self.datetime_ser: - _check_plot_works(f, s.index.freq.rule_code, ax=ax, series=s) - for s in self.period_ser: _check_plot_works(s.plot, ax=ax) @@ -194,17 +173,6 @@ def check_format_of_first_point(ax, expected_string): check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") tm.close() - # tsplot - from pandas.tseries.plotting import tsplot - - _, ax = self.plt.subplots() - with tm.assert_produces_warning(FutureWarning): - tsplot(annual, 
self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, "t = 2014 y = 1.000000") - with tm.assert_produces_warning(FutureWarning): - tsplot(daily, self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") - @pytest.mark.slow def test_line_plot_period_series(self): for s in self.period_ser: @@ -892,16 +860,6 @@ def test_to_weekly_resampling(self): for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - _, ax = self.plt.subplots() - from pandas.tseries.plotting import tsplot - - with tm.assert_produces_warning(FutureWarning): - tsplot(high, self.plt.Axes.plot, ax=ax) - with tm.assert_produces_warning(FutureWarning): - lines = tsplot(low, self.plt.Axes.plot, ax=ax) - for l in lines: - assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - @pytest.mark.slow def test_from_weekly_resampling(self): idxh = date_range("1/1/1999", periods=52, freq="W") @@ -926,21 +884,6 @@ def test_from_weekly_resampling(self): tm.assert_numpy_array_equal(xdata, expected_h) tm.close() - _, ax = self.plt.subplots() - from pandas.tseries.plotting import tsplot - - with tm.assert_produces_warning(FutureWarning): - tsplot(low, self.plt.Axes.plot, ax=ax) - with tm.assert_produces_warning(FutureWarning): - lines = tsplot(high, self.plt.Axes.plot, ax=ax) - for l in lines: - assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq - xdata = l.get_xdata(orig=False) - if len(xdata) == 12: # idxl lines - tm.assert_numpy_array_equal(xdata, expected_l) - else: - tm.assert_numpy_array_equal(xdata, expected_h) - @pytest.mark.slow def test_from_resampling_area_line_mixed(self): idxh = date_range("1/1/1999", periods=52, freq="W") @@ -1409,7 +1352,7 @@ def test_plot_outofbounds_datetime(self): def test_format_timedelta_ticks_narrow(self): - expected_labels = ["00:00:00.0000000{:0>2d}".format(i) for i in np.arange(10)] + expected_labels = [f"00:00:00.0000000{i:0>2d}" for i in np.arange(10)] rng = timedelta_range("0", periods=10, freq="ns") df = DataFrame(np.random.randn(len(rng), 3), rng) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index fd66888fc30e4..1c429bafa9a19 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -17,9 +17,9 @@ import pandas as pd from pandas import DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range +import pandas._testing as tm from pandas.core.arrays import integer_array from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm from pandas.io.formats.printing import pprint_thing import pandas.plotting as plotting @@ -555,14 +555,14 @@ def test_subplots_timeseries_y_axis_not_supported(self): period: since period isn't yet implemented in ``select_dtypes`` and because it will need a custom value converter + - tick formater (as was done for x-axis plots) + tick formatter (as was done for x-axis plots) categorical: because it will need a custom value converter + - tick formater (also doesn't work for x-axis, as of now) + tick formatter (also doesn't work for x-axis, as of now) datetime_mixed_tz: - because of the way how pandas handels ``Series`` of + because of the way how pandas handles ``Series`` of ``datetime`` objects with different timezone, generally converting ``datetime`` objects in a tz-aware form could help with this problem @@ -1162,6 +1162,36 @@ def test_plot_scatter(self): axes = df.plot(x="x", y="y", kind="scatter", subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) 
+ def test_raise_error_on_datetime_time_data(self): + # GH 8113, datetime.time type is not supported by matplotlib in scatter + df = pd.DataFrame(np.random.randn(10), columns=["a"]) + df["dtime"] = pd.date_range(start="2014-01-01", freq="h", periods=10).time + msg = "must be a string or a number, not 'datetime.time'" + + with pytest.raises(TypeError, match=msg): + df.plot(kind="scatter", x="dtime", y="a") + + def test_scatterplot_datetime_data(self): + # GH 30391 + dates = pd.date_range(start=date(2019, 1, 1), periods=12, freq="W") + vals = np.random.normal(0, 1, len(dates)) + df = pd.DataFrame({"dates": dates, "vals": vals}) + + _check_plot_works(df.plot.scatter, x="dates", y="vals") + _check_plot_works(df.plot.scatter, x=0, y=1) + + def test_scatterplot_object_data(self): + # GH 18755 + df = pd.DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + + df = pd.DataFrame(dict(a=["A", "B", "C"], b=["a", "b", "c"])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + @pytest.mark.slow def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # addressing issue #10611, to ensure colobar does not @@ -1216,24 +1246,15 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() + @pytest.mark.parametrize("x, y", [("x", "y"), ("y", "x"), ("y", "y")]) @pytest.mark.slow - def test_plot_scatter_with_categorical_data(self): - # GH 16199 + def test_plot_scatter_with_categorical_data(self, x, y): + # after fixing GH 18755, should be able to plot categorical data df = pd.DataFrame( {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} ) - with pytest.raises(ValueError) as ve: - df.plot(x="x", y="y", kind="scatter") - ve.match("requires y column to be numeric") - - with pytest.raises(ValueError) as ve: - df.plot(x="y", y="x", kind="scatter") - ve.match("requires x column to be numeric") - - with pytest.raises(ValueError) as ve: - df.plot(x="y", y="y", kind="scatter") - ve.match("requires x column to be numeric") + _check_plot_works(df.plot.scatter, x=x, y=y) @pytest.mark.slow def test_plot_scatter_with_c(self): @@ -3250,6 +3271,34 @@ def test_plot_no_numeric_data(self): with pytest.raises(TypeError): df.plot() + def test_missing_markers_legend(self): + # 14958 + df = pd.DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"]) + ax = df.plot(y=["A"], marker="x", linestyle="solid") + df.plot(y=["B"], marker="o", linestyle="dotted", ax=ax) + df.plot(y=["C"], marker="<", linestyle="dotted", ax=ax) + + self._check_legend_labels(ax, labels=["A", "B", "C"]) + self._check_legend_marker(ax, expected_markers=["x", "o", "<"]) + + def test_missing_markers_legend_using_style(self): + # 14563 + df = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5, 6], + "B": [2, 4, 1, 3, 2, 4], + "C": [3, 3, 2, 6, 4, 2], + "X": [1, 2, 3, 4, 5, 6], + } + ) + + fig, ax = self.plt.subplots() + for kind in "ABC": + df.plot("X", kind, label=kind, ax=ax, style=".") + + self._check_legend_labels(ax, labels=["A", "B", "C"]) + self._check_legend_marker(ax, expected_markers=[".", ".", "."]) + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index bb1747710fe18..8fec4bb134cb4 100644 --- a/pandas/tests/plotting/test_groupby.py +++ 
b/pandas/tests/plotting/test_groupby.py @@ -8,8 +8,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase -import pandas.util.testing as tm @td.skip_if_no_mpl diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 14cb2bc9d7b62..50ebbc22f2739 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -9,8 +9,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm @td.skip_if_no_mpl @@ -253,6 +253,24 @@ def test_tight_layout(self): tm.close() + def test_hist_subplot_xrot(self): + # GH 30288 + df = DataFrame( + { + "length": [1.5, 0.5, 1.2, 0.9, 3], + "animal": ["pig", "rabbit", "pig", "pig", "rabbit"], + } + ) + axes = _check_plot_works( + df.hist, + filterwarnings="always", + column="length", + by="animal", + bins=5, + xrot=0, + ) + self._check_ticks_props(axes, xrot=0) + @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): @@ -313,7 +331,8 @@ def test_grouped_hist_legacy(self): with pytest.raises(AttributeError): _grouped_hist(df.A, by=df.C, foo="bar") - with tm.assert_produces_warning(FutureWarning): + msg = "Specify figure size by tuple instead" + with pytest.raises(ValueError, match=msg): df.hist(by="C", figsize="default") @pytest.mark.slow diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 940cfef4058e0..c8aa1f23ccf1f 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -10,8 +10,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm import pandas.plotting as plotting @@ -32,14 +32,9 @@ def test_get_accessor_args(): with pytest.raises(TypeError, match=msg): func(backend_name="", data=[], args=[], kwargs={}) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - x, y, kind, kwargs = func( - backend_name="", data=Series(), args=["line", None], kwargs={} - ) - assert x is None - assert y is None - assert kind == "line" - assert kwargs == {"ax": None} + msg = "should not be called with positional arguments" + with pytest.raises(TypeError, match=msg): + func(backend_name="", data=Series(dtype=object), args=["line", None], kwargs={}) x, y, kind, kwargs = func( backend_name="", @@ -53,7 +48,10 @@ def test_get_accessor_args(): assert kwargs == {"grid": False} x, y, kind, kwargs = func( - backend_name="pandas.plotting._matplotlib", data=Series(), args=[], kwargs={} + backend_name="pandas.plotting._matplotlib", + data=Series(dtype=object), + args=[], + kwargs={}, ) assert x is None assert y is None @@ -204,9 +202,6 @@ def test_andrews_curves(self, iris): handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, linecolors=colors) - with tm.assert_produces_warning(FutureWarning): - andrews_curves(data=df, class_column="Name") - @pytest.mark.slow def test_parallel_coordinates(self, iris): from pandas.plotting import parallel_coordinates @@ -253,11 +248,6 @@ def test_parallel_coordinates(self, iris): handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, linecolors=colors) - with 
tm.assert_produces_warning(FutureWarning): - parallel_coordinates(data=df, class_column="Name") - with tm.assert_produces_warning(FutureWarning): - parallel_coordinates(df, "Name", colors=colors) - # not sure if this is indicative of a problem @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") def test_parallel_coordinates_with_sorted_labels(self): @@ -266,7 +256,7 @@ def test_parallel_coordinates_with_sorted_labels(self): df = DataFrame( { - "feat": [i for i in range(30)], + "feat": list(range(30)), "class": [2 for _ in range(10)] + [3 for _ in range(10)] + [1 for _ in range(10)], @@ -279,8 +269,7 @@ def test_parallel_coordinates_with_sorted_labels(self): ) ordered_color_label_tuples = sorted(color_label_tuples, key=lambda x: x[1]) prev_next_tupels = zip( - [i for i in ordered_color_label_tuples[0:-1]], - [i for i in ordered_color_label_tuples[1:]], + list(ordered_color_label_tuples[0:-1]), list(ordered_color_label_tuples[1:]) ) for prev, nxt in prev_next_tupels: # labels and colors are ordered strictly increasing diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 89259cbb6c62d..8463f30bee8f0 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -14,8 +14,8 @@ import pandas as pd from pandas import DataFrame, Series, date_range +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm import pandas.plotting as plotting @@ -330,7 +330,7 @@ def test_pie_series(self): ax = _check_plot_works( series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7 ) - pcts = ["{0:.2f}".format(s * 100) for s in series.values / float(series.sum())] + pcts = [f"{s*100:.2f}" for s in series.values / float(series.sum())] expected_texts = list(chain.from_iterable(zip(series.index, pcts))) self._check_text_labels(ax.texts, expected_texts) for t in ax.texts: @@ -865,15 +865,15 @@ def test_time_series_plot_color_with_empty_kwargs(self): def test_xticklabels(self): # GH11529 - s = Series(np.arange(10), index=["P{i:02d}".format(i=i) for i in range(10)]) + s = Series(np.arange(10), index=[f"P{i:02d}" for i in range(10)]) _, ax = self.plt.subplots() ax = s.plot(xticks=[0, 3, 5, 9], ax=ax) - exp = ["P{i:02d}".format(i=i) for i in [0, 3, 5, 9]] + exp = [f"P{i:02d}" for i in [0, 3, 5, 9]] self._check_text_labels(ax.get_xticklabels(), exp) def test_xtick_barPlot(self): # GH28172 - s = pd.Series(range(10), index=["P{i:02d}".format(i=i) for i in range(10)]) + s = pd.Series(range(10), index=[f"P{i:02d}" for i in range(10)]) ax = s.plot.bar(xticks=range(0, 11, 2)) exp = np.array(list(range(0, 11, 2))) tm.assert_numpy_array_equal(exp, ax.get_xticks()) @@ -931,3 +931,8 @@ def test_plot_no_numeric_data(self): df = pd.Series(["a", "b", "c"]) with pytest.raises(TypeError): df.plot() + + def test_style_single_ok(self): + s = pd.Series([1, 2]) + ax = s.plot(style="s", color="C3") + assert ax.lines[0].get_color() == ["C3"] diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 05ebff4387908..7400b049961d5 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -21,8 +21,8 @@ timedelta_range, to_timedelta, ) +import pandas._testing as tm from pandas.core import nanops -import pandas.util.testing as tm def get_objs(): @@ -79,7 +79,7 @@ def test_nanops(self): assert pd.isna(getattr(obj, opname)()) assert pd.isna(getattr(obj, opname)(skipna=False)) - obj 
= klass([]) + obj = klass([], dtype=object) assert pd.isna(getattr(obj, opname)()) assert pd.isna(getattr(obj, opname)(skipna=False)) @@ -179,8 +179,8 @@ class TestIndexReductions: [ (0, 400, 3), (500, 0, -6), - (-10 ** 6, 10 ** 6, 4), - (10 ** 6, -10 ** 6, -4), + (-(10 ** 6), 10 ** 6, 4), + (10 ** 6, -(10 ** 6), -4), (0, 10, 20), ], ) @@ -299,12 +299,6 @@ def test_timedelta_ops(self): result = td.to_frame().std() assert result[0] == expected - # invalid ops - for op in ["skew", "kurt", "sem", "prod"]: - msg = "reduction operation '{}' not allowed for this dtype" - with pytest.raises(TypeError, match=msg.format(op)): - getattr(td, op)() - # GH#10040 # make sure NaT is properly handled by median() s = Series([Timestamp("2015-02-03"), Timestamp("2015-02-07")]) @@ -315,6 +309,22 @@ def test_timedelta_ops(self): ) assert s.diff().median() == timedelta(days=6) + @pytest.mark.parametrize("opname", ["skew", "kurt", "sem", "prod", "var"]) + def test_invalid_td64_reductions(self, opname): + s = Series( + [Timestamp("20130101") + timedelta(seconds=i * i) for i in range(10)] + ) + td = s.diff() + + msg = "reduction operation '{op}' not allowed for this dtype" + msg = msg.format(op=opname) + + with pytest.raises(TypeError, match=msg): + getattr(td, opname)() + + with pytest.raises(TypeError, match=msg): + getattr(td.to_frame(), opname)(numeric_only=False) + def test_minmax_tz(self, tz_naive_fixture): tz = tz_naive_fixture # monotonic @@ -518,7 +528,7 @@ def test_empty(self, method, unit, use_bottleneck): with pd.option_context("use_bottleneck", use_bottleneck): # GH#9422 / GH#18921 # Entirely empty - s = Series([]) + s = Series([], dtype=object) # NA by default result = getattr(s, method)() assert result == unit @@ -636,8 +646,13 @@ def test_ops_consistency_on_empty(self, method): assert pd.isna(result) # timedelta64[ns] - result = getattr(Series(dtype="m8[ns]"), method)() - assert result is pd.NaT + tdser = Series([], dtype="m8[ns]") + if method == "var": + with pytest.raises(TypeError, match="operation 'var' not allowed"): + getattr(tdser, method)() + else: + result = getattr(tdser, method)() + assert result is pd.NaT def test_nansum_buglet(self): ser = Series([1.0, np.nan], index=[0, 1]) @@ -680,52 +695,40 @@ def test_empty_timeseries_reductions_return_nat(self): assert Series([], dtype=dtype).min(skipna=False) is pd.NaT assert Series([], dtype=dtype).max(skipna=False) is pd.NaT - def test_numpy_argmin_deprecated(self): + def test_numpy_argmin(self): # See GH#16830 data = np.arange(1, 11) s = Series(data, index=data) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # The deprecation of Series.argmin also causes a deprecation - # warning when calling np.argmin. This behavior is temporary - # until the implementation of Series.argmin is corrected. 
- result = np.argmin(s) + result = np.argmin(s) - assert result == 1 + expected = np.argmin(data) + assert result == expected - with tm.assert_produces_warning(FutureWarning): - # argmin is aliased to idxmin - result = s.argmin() + result = s.argmin() - assert result == 1 + assert result == expected - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.argmin(s, out=data) + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argmin(s, out=data) - def test_numpy_argmax_deprecated(self): + def test_numpy_argmax(self): # See GH#16830 data = np.arange(1, 11) s = Series(data, index=data) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # The deprecation of Series.argmax also causes a deprecation - # warning when calling np.argmax. This behavior is temporary - # until the implementation of Series.argmax is corrected. - result = np.argmax(s) - assert result == 10 + result = np.argmax(s) + expected = np.argmax(data) + assert result == expected - with tm.assert_produces_warning(FutureWarning): - # argmax is aliased to idxmax - result = s.argmax() + result = s.argmax() - assert result == 10 + assert result == expected - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.argmax(s, out=data) + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.argmax(s, out=data) def test_idxmin(self): # test idxmin @@ -885,7 +888,7 @@ def test_timedelta64_analytics(self): @pytest.mark.parametrize( "test_input,error_type", [ - (pd.Series([]), ValueError), + (pd.Series([], dtype="float64"), ValueError), # For strings, or any Series with dtype 'O' (pd.Series(["foo", "bar", "baz"]), TypeError), (pd.Series([(1,), (2,)]), TypeError), @@ -1028,7 +1031,7 @@ def test_min_max(self): ) _min = cat.min() _max = cat.max() - assert np.isnan(_min) + assert _min == "c" assert _max == "b" cat = Series( @@ -1038,30 +1041,24 @@ def test_min_max(self): ) _min = cat.min() _max = cat.max() - assert np.isnan(_min) + assert _min == 2 assert _max == 1 - def test_min_max_numeric_only(self): - # TODO deprecate numeric_only argument for Categorical and use - # skipna as well, see GH25303 + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_skipna(self, skipna): + # GH 25303 cat = Series( Categorical(["a", "b", np.nan, "a"], categories=["b", "a"], ordered=True) ) + _min = cat.min(skipna=skipna) + _max = cat.max(skipna=skipna) - _min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == "a" - - _min = cat.min(numeric_only=True) - _max = cat.max(numeric_only=True) - assert _min == "b" - assert _max == "a" - - _min = cat.min(numeric_only=False) - _max = cat.max(numeric_only=False) - assert np.isnan(_min) - assert _max == "a" + if skipna is True: + assert _min == "b" + assert _max == "a" + else: + assert np.isnan(_min) + assert np.isnan(_max) class TestSeriesMode: diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 432811b5a8264..59dbcb9ab9fa0 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -10,8 +10,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, 
PeriodArray, TimedeltaArray -import pandas.util.testing as tm class TestDatetimeLikeStatReductions: diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index dc72800227c0e..f8a1810e66219 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -5,12 +5,12 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.groupby.groupby import DataError from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import PeriodIndex, period_range from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range -import pandas.util.testing as tm # a fixture value can be overridden by the test parameter value. Note that the # value of the fixture can be overridden this way even if the test doesn't use @@ -84,8 +84,8 @@ def test_raises_on_non_datetimelike_index(): # this is a non datetimelike index xp = DataFrame() msg = ( - "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Index'" + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, " + "but got an instance of 'Index'" ) with pytest.raises(TypeError, match=msg): xp.resample("A").mean() @@ -112,6 +112,22 @@ def test_resample_empty_series(freq, empty_series, resample_method): tm.assert_series_equal(result, expected, check_dtype=False) +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +@pytest.mark.parametrize("resample_method", ["count", "size"]) +def test_resample_count_empty_series(freq, empty_series, resample_method): + # GH28427 + result = getattr(empty_series.resample(freq), resample_method)() + + if isinstance(empty_series.index, PeriodIndex): + index = empty_series.index.asfreq(freq=freq) + else: + index = empty_series.index._shallow_copy(freq=freq) + expected = pd.Series([], dtype="int64", index=index, name=empty_series.name) + + tm.assert_series_equal(result, expected) + + @all_ts @pytest.mark.parametrize("freq", ["M", "D", "H"]) def test_resample_empty_dataframe(empty_frame, freq, resample_method): @@ -123,7 +139,7 @@ def test_resample_empty_dataframe(empty_frame, freq, resample_method): expected = df.copy() else: # GH14962 - expected = Series([]) + expected = Series([], dtype=object) if isinstance(df.index, PeriodIndex): expected.index = df.index.asfreq(freq=freq) @@ -136,6 +152,44 @@ def test_resample_empty_dataframe(empty_frame, freq, resample_method): # test size for GH13212 (currently stays as df) +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_count_empty_dataframe(freq, empty_frame): + # GH28427 + + empty_frame = empty_frame.copy() + empty_frame["a"] = [] + + result = empty_frame.resample(freq).count() + + if isinstance(empty_frame.index, PeriodIndex): + index = empty_frame.index.asfreq(freq=freq) + else: + index = empty_frame.index._shallow_copy(freq=freq) + expected = pd.DataFrame({"a": []}, dtype="int64", index=index) + + tm.assert_frame_equal(result, expected) + + +@all_ts +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_resample_size_empty_dataframe(freq, empty_frame): + # GH28427 + + empty_frame = empty_frame.copy() + empty_frame["a"] = [] + + result = empty_frame.resample(freq).size() + + if isinstance(empty_frame.index, PeriodIndex): + index = empty_frame.index.asfreq(freq=freq) + else: + index = empty_frame.index._shallow_copy(freq=freq) + expected = pd.Series([], dtype="int64", index=index) + + tm.assert_series_equal(result, 
expected) + + @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) @pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"]) def test_resample_empty_dtypes(index, dtype, resample_method): @@ -153,7 +207,8 @@ def test_resample_empty_dtypes(index, dtype, resample_method): @all_ts -def test_resample_loffset_arg_type(frame, create_index): +@pytest.mark.parametrize("arg", ["mean", {"value": "mean"}, ["mean"]]) +def test_resample_loffset_arg_type(frame, create_index, arg): # GH 13218, 15002 df = frame expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] @@ -166,26 +221,18 @@ def test_resample_loffset_arg_type(frame, create_index): expected_index += timedelta(hours=2) expected = DataFrame({"value": expected_means}, index=expected_index) - for arg in ["mean", {"value": "mean"}, ["mean"]]: + result_agg = df.resample("2D", loffset="2H").agg(arg) - result_agg = df.resample("2D", loffset="2H").agg(arg) + if isinstance(arg, list): + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result_how = df.resample("2D", how=arg, loffset="2H") - - if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) - - # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex - if isinstance(expected.index, TimedeltaIndex): - msg = "DataFrame are different" - with pytest.raises(AssertionError, match=msg): - tm.assert_frame_equal(result_agg, expected) - with pytest.raises(AssertionError, match=msg): - tm.assert_frame_equal(result_how, expected) - else: + # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex + if isinstance(expected.index, TimedeltaIndex): + msg = "DataFrame are different" + with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(result_agg, expected) - tm.assert_frame_equal(result_how, expected) + else: + tm.assert_frame_equal(result_agg, expected) @all_ts diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index a29f910261b58..4860329718f54 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -10,11 +10,11 @@ import pandas as pd from pandas import DataFrame, Series, Timedelta, Timestamp, isna, notna +import pandas._testing as tm from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, period_range from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges -import pandas.util.testing as tm import pandas.tseries.offsets as offsets from pandas.tseries.offsets import BDay, Minute @@ -146,9 +146,7 @@ def test_resample_basic_grouper(series): def test_resample_string_kwargs(series, keyword, value): # see gh-19303 # Check that wrong keyword argument strings raise an error - msg = "Unsupported value {value} for `{keyword}`".format( - value=value, keyword=keyword - ) + msg = f"Unsupported value {value} for `{keyword}`" with pytest.raises(ValueError, match=msg): series.resample("5min", **({keyword: value})) @@ -1431,10 +1429,11 @@ def test_downsample_across_dst_weekly(): tm.assert_frame_equal(result, expected) idx = pd.date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="H") - s = Series(index=idx) + s = Series(index=idx, dtype=np.float64) result = s.resample("W").mean() expected = Series( - index=pd.date_range("2013-04-07", freq="W", periods=5, tz="Europe/London") + 
index=pd.date_range("2013-04-07", freq="W", periods=5, tz="Europe/London"), + dtype=np.float64, ) tm.assert_series_equal(result, expected) @@ -1565,3 +1564,20 @@ def test_get_timestamp_range_edges(first, last, offset, exp_first, exp_last): result = _get_timestamp_range_edges(first, last, offset) expected = (exp_first, exp_last) assert result == expected + + +def test_resample_apply_product(): + # GH 5586 + index = date_range(start="2012-01-31", freq="M", periods=12) + + ts = Series(range(12), index=index) + df = DataFrame(dict(A=ts, B=ts + 2)) + result = df.resample("Q").apply(np.product) + expected = DataFrame( + np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64), + index=DatetimeIndex( + ["2012-03-31", "2012-06-30", "2012-09-30", "2012-12-31"], freq="Q-DEC" + ), + columns=["A", "B"], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 93ce7a9480b35..955f8c7482937 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -10,11 +10,11 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range from pandas.core.resample import _get_period_range_edges -import pandas.util.testing as tm import pandas.tseries.offsets as offsets @@ -82,9 +82,9 @@ def test_selection(self, index, freq, kind, kwargs): index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]), ) msg = ( - "Resampling from level= or on= selection with a PeriodIndex is" - r" not currently supported, use \.set_index\(\.\.\.\) to" - " explicitly set index" + "Resampling from level= or on= selection with a PeriodIndex is " + r"not currently supported, use \.set_index\(\.\.\.\) to " + "explicitly set index" ) with pytest.raises(NotImplementedError, match=msg): df.resample(freq, kind=kind, **kwargs) @@ -130,8 +130,8 @@ def test_not_subperiod(self, simple_period_range_series, rule, expected_error_ms # These are incompatible period rules for resampling ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") msg = ( - "Frequency cannot be resampled to {}, as they" - " are not sub or super periods" + "Frequency cannot be resampled to {}, as they " + "are not sub or super periods" ).format(expected_error_msg) with pytest.raises(IncompatibleFrequency, match=msg): ts.resample(rule).mean() @@ -236,8 +236,8 @@ def test_resample_same_freq(self, resample_method): def test_resample_incompat_freq(self): msg = ( - "Frequency cannot be resampled to ," - " as they are not sub or super periods" + "Frequency cannot be resampled to , " + "as they are not sub or super periods" ) with pytest.raises(IncompatibleFrequency, match=msg): Series( @@ -594,7 +594,7 @@ def test_resample_with_dst_time_change(self): def test_resample_bms_2752(self): # GH2753 - foo = Series(index=pd.bdate_range("20000101", "20000201")) + foo = Series(index=pd.bdate_range("20000101", "20000201"), dtype=np.float64) res1 = foo.resample("BMS").mean() res2 = foo.resample("BMS").mean().resample("B").mean() assert res1.index[0] == Timestamp("20000103") @@ -732,12 +732,9 @@ def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): expected = DataFrame({"value": expected_means}, index=expected_index) result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) - with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result_how = df.resample("2D", how=agg_arg, loffset="2H", kind=kind) if isinstance(agg_arg, list): expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) - tm.assert_frame_equal(result_how, expected) @pytest.mark.parametrize("freq, period_mult", [("H", 24), ("12H", 2)]) @pytest.mark.parametrize("kind", [None, "period"]) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index cbdfbb7a3100b..170201b4f8e5c 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min") @@ -179,7 +179,7 @@ def test_downsample_but_actually_upsampling(): def test_combined_up_downsampling_of_irregular(): - # since we are reallydoing an operation like this + # since we are really doing an operation like this # ts2.resample('2s').mean().ffill() # preserve these semantics @@ -187,9 +187,49 @@ def test_combined_up_downsampling_of_irregular(): ts = Series(np.arange(len(rng)), index=rng) ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ts2.resample("2s", how="mean", fill_method="ffill") - expected = ts2.resample("2s").mean().ffill() + result = ts2.resample("2s").mean().ffill() + expected = Series( + [ + 0.5, + 2.5, + 5.0, + 7.0, + 7.0, + 11.0, + 11.0, + 15.0, + 16.0, + 16.0, + 16.0, + 16.0, + 25.0, + 25.0, + 25.0, + 30.0, + ], + index=pd.DatetimeIndex( + [ + "2012-01-01 00:00:00", + "2012-01-01 00:00:02", + "2012-01-01 00:00:04", + "2012-01-01 00:00:06", + "2012-01-01 00:00:08", + "2012-01-01 00:00:10", + "2012-01-01 00:00:12", + "2012-01-01 00:00:14", + "2012-01-01 00:00:16", + "2012-01-01 00:00:18", + "2012-01-01 00:00:20", + "2012-01-01 00:00:22", + "2012-01-01 00:00:24", + "2012-01-01 00:00:26", + "2012-01-01 00:00:28", + "2012-01-01 00:00:30", + ], + dtype="datetime64[ns]", + freq="2S", + ), + ) tm.assert_series_equal(result, expected) @@ -247,10 +287,9 @@ def test_agg_consistency(): r = df.resample("3T") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = r[["A", "B", "C"]].agg({"r1": "mean", "r2": "sum"}) - result = r.agg({"r1": "mean", "r2": "sum"}) - tm.assert_frame_equal(result, expected, check_like=True) + msg = "nested renamer is not supported" + with pytest.raises(pd.core.base.SpecificationError, match=msg): + r.agg({"r1": "mean", "r2": "sum"}) # TODO: once GH 14008 is fixed, move these tests into @@ -307,26 +346,23 @@ def test_agg(): result = t["A"].aggregate(["mean", "sum"]) tm.assert_frame_equal(result, expected) - expected = pd.concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) + msg = "nested renamer is not supported" for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) expected.columns = 
pd.MultiIndex.from_tuples( [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] ) for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t.aggregate( + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.aggregate( { "A": {"mean": "mean", "sum": "sum"}, "B": {"mean2": "mean", "sum2": "sum"}, } ) - tm.assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples( @@ -383,12 +419,10 @@ def test_agg_misc(): [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] ) + msg = "nested renamer is not supported" for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t[["A", "B"]].agg( - OrderedDict([("result1", np.sum), ("result2", np.mean)]) - ) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t[["A", "B"]].agg(OrderedDict([("result1", np.sum), ("result2", np.mean)])) # agg with different hows expected = pd.concat( @@ -408,21 +442,11 @@ def test_agg_misc(): # series like aggs for t in cases: - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t["A"].agg({"A": ["sum", "std"]}) - expected = pd.concat([t["A"].sum(), t["A"].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "std")]) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t["A"].agg({"A": ["sum", "std"]}) - expected = pd.concat( - [t["A"].agg(["sum", "std"]), t["A"].agg(["mean", "std"])], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] - ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) # errors # invalid names in the agg specification @@ -451,28 +475,20 @@ def test_agg_nested_dicts(): df.groupby(pd.Grouper(freq="2D")), ] - msg = r"cannot perform renaming for r(1|2) with a nested dictionary" + msg = "nested renamer is not supported" for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): t.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) for t in cases: - expected = pd.concat( - [t["A"].mean(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 - ) - expected.columns = pd.MultiIndex.from_tuples( - [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] - ) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t[["A", "B"]].agg( + + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t[["A", "B"]].agg( {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} ) - tm.assert_frame_equal(result, expected, check_like=True) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(pd.core.base.SpecificationError, match=msg): + t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) def test_try_aggregate_non_existing_column(): @@ -503,8 +519,8 @@ def 
test_selection_api_validation(): # non DatetimeIndex msg = ( - "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Int64Index'" + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, " + "but got an instance of 'Int64Index'" ) with pytest.raises(TypeError, match=msg): df.resample("2D", level="v") @@ -523,8 +539,8 @@ def test_selection_api_validation(): # upsampling not allowed msg = ( - "Upsampling from level= or on= selection is not supported, use" - r" \.set_index\(\.\.\.\) to explicitly set index to datetime-like" + "Upsampling from level= or on= selection is not supported, use " + r"\.set_index\(\.\.\.\) to explicitly set index to datetime-like" ) with pytest.raises(ValueError, match=msg): df.resample("2D", level="d").asfreq() diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 7efc6b0d466b9..4e3585c0be884 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -2,10 +2,12 @@ import numpy as np +from pandas.util._test_decorators import async_mark + import pandas as pd from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm test_frame = DataFrame( {"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}, @@ -13,17 +15,18 @@ ) -def test_tab_complete_ipython6_warning(ip): +@async_mark() +async def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter code = dedent( """\ - import pandas.util.testing as tm + import pandas._testing as tm s = tm.makeTimeSeries() rs = s.resample("D") """ ) - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 574182ae99c5c..3aa7765954634 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm test_series = Series(np.random.randn(1000), index=date_range("1/1/2000", periods=1000)) @@ -89,7 +89,7 @@ def test_fails_on_no_datetime_index(name, func): msg = ( "Only valid with DatetimeIndex, TimedeltaIndex " - "or PeriodIndex, but got an instance of '{}'".format(name) + f"or PeriodIndex, but got an instance of '{name}'" ) with pytest.raises(TypeError, match=msg): df.groupby(Grouper(freq="D")) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 7a6ebf826ca4d..d1bcdc55cb509 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -4,8 +4,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.indexes.timedeltas import timedelta_range -import pandas.util.testing as tm def test_asfreq_bug(): diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 925eaac45045d..a660acb143433 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, merge +import pandas._testing as tm 
from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data -import pandas.util.testing as tm a_ = np.array @@ -226,9 +226,7 @@ def test_join_on_fails_with_different_right_index(self): {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}, index=tm.makeCustomIndex(10, 2), ) - msg = ( - r"len\(left_on\) must equal the number of levels in the index" ' of "right"' - ) + msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): merge(df, df2, left_on="a", right_index=True) @@ -240,9 +238,7 @@ def test_join_on_fails_with_different_left_index(self): df2 = DataFrame( {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)} ) - msg = ( - r"len\(right_on\) must equal the number of levels in the index" ' of "left"' - ) + msg = r'len\(right_on\) must equal the number of levels in the index of "left"' with pytest.raises(ValueError, match=msg): merge(df, df2, right_on="b", left_index=True) @@ -624,7 +620,7 @@ def test_join_mixed_non_unique_index(self): def test_join_non_unique_period_index(self): # GH #16871 index = pd.period_range("2016-01-01", periods=16, freq="M") - df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) + df = DataFrame(list(range(len(index))), index=index, columns=["pnum"]) df2 = concat([df, df]) result = df.join(df2, how="inner", rsuffix="_df2") expected = DataFrame( @@ -737,9 +733,7 @@ def test_join_multi_to_multi(self, join_type): ) tm.assert_frame_equal(expected, result) - msg = ( - r"len\(left_on\) must equal the number of levels in the index" ' of "right"' - ) + msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): left.join(right, on="xy", how=join_type) @@ -770,6 +764,35 @@ def test_join_on_tz_aware_datetimeindex(self): expected["vals_2"] = pd.Series([np.nan] * 2 + list("tuv"), dtype=object) tm.assert_frame_equal(result, expected) + def test_join_datetime_string(self): + # GH 5647 + dfa = DataFrame( + [ + ["2012-08-02", "L", 10], + ["2012-08-02", "J", 15], + ["2013-04-06", "L", 20], + ["2013-04-06", "J", 25], + ], + columns=["x", "y", "a"], + ) + dfa["x"] = pd.to_datetime(dfa["x"]) + dfb = DataFrame( + [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]], + columns=["x", "y", "z"], + index=[2, 4], + ) + dfb["x"] = pd.to_datetime(dfb["x"]) + result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) + expected = DataFrame( + [ + [pd.Timestamp("2012-08-02 00:00:00"), "J", 1, 15], + [pd.Timestamp("2013-04-06 00:00:00"), "L", 2, 20], + ], + index=[2, 4], + columns=["x", "y", "z", "a"], + ) + tm.assert_frame_equal(result, expected) + def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 37c0b57bc7581..8e0c4766056d3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -25,10 +25,10 @@ TimedeltaIndex, UInt64Index, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import MergeError, merge -import pandas.util.testing as tm N = 50 NGROUPS = 8 @@ -134,6 +134,18 @@ def test_merge_common(self): exp = merge(self.df, self.df2, on=["key1", "key2"]) tm.assert_frame_equal(joined, exp) + def test_merge_non_string_columns(self): + # https://github.com/pandas-dev/pandas/issues/17962 + # Checks that 
method runs for non string column names + left = pd.DataFrame( + {0: [1, 0, 1, 0], 1: [0, 1, 0, 0], 2: [0, 0, 2, 0], 3: [1, 0, 0, 3]} + ) + + right = left.astype(float) + expected = left + result = pd.merge(left, right) + tm.assert_frame_equal(expected, result) + def test_merge_index_as_on_arg(self): # GH14355 @@ -732,7 +744,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo', 'foo'\]," r" dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -848,7 +860,7 @@ def test_merge_datetime64tz_with_dst_transition(self): def test_merge_non_unique_period_index(self): # GH #16871 index = pd.period_range("2016-01-01", periods=16, freq="M") - df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) + df = DataFrame(list(range(len(index))), index=index, columns=["pnum"]) df2 = concat([df, df]) result = df.merge(df2, left_index=True, right_index=True, how="inner") expected = DataFrame( diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index e12aad870f1c1..8037095aff0b9 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import Timedelta, merge_asof, read_csv, to_datetime +import pandas._testing as tm from pandas.core.reshape.merge import MergeError -import pandas.util.testing as tm class TestAsOfMerge: @@ -1185,6 +1185,13 @@ def test_merge_datatype_categorical_error_raises(self): with pytest.raises(MergeError, match=msg): merge_asof(left, right, on="a") + def test_merge_groupby_multiple_column_with_categorical_column(self): + # GH 16454 + df = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])}) + result = merge_asof(df, df, on="x", by=["y", "z"]) + expected = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])}) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "func", [lambda x: x, lambda x: to_datetime(x)], ids=["numeric", "datetime"] ) @@ -1303,3 +1310,34 @@ def test_int_type_tolerance(self, any_int_dtype): result = pd.merge_asof(left, right, on="a", tolerance=10) tm.assert_frame_equal(result, expected) + + def test_merge_index_column_tz(self): + # GH 29864 + index = pd.date_range("2019-10-01", freq="30min", periods=5, tz="UTC") + left = pd.DataFrame([0.9, 0.8, 0.7, 0.6], columns=["xyz"], index=index[1:]) + right = pd.DataFrame({"from_date": index, "abc": [2.46] * 4 + [2.19]}) + result = pd.merge_asof( + left=left, right=right, left_index=True, right_on=["from_date"] + ) + expected = pd.DataFrame( + { + "xyz": [0.9, 0.8, 0.7, 0.6], + "from_date": index[1:], + "abc": [2.46] * 3 + [2.19], + }, + index=pd.Index([1, 2, 3, 4]), + ) + tm.assert_frame_equal(result, expected) + + result = pd.merge_asof( + left=right, right=left, right_index=True, left_on=["from_date"] + ) + expected = pd.DataFrame( + { + "from_date": index, + "abc": [2.46] * 4 + [2.19], + "xyz": [np.nan, 0.9, 0.8, 0.7, 0.6], + }, + index=pd.Index([0, 1, 2, 3, 4]), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index 4e0f570567c07..691f2549c0ece 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -2,7 +2,7 @@ 
import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 6d6429fb4e6b5..e0063925a03e1 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, merge_ordered -import pandas.util.testing as tm +import pandas._testing as tm class TestMergeOrdered: diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 1d8d2add3840c..1f78c1900d237 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -1,14 +1,12 @@ -from collections import OrderedDict - import numpy as np from numpy.random import randn import pytest import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import merge -import pandas.util.testing as tm @pytest.fixture @@ -195,6 +193,27 @@ def test_merge_right_vs_left(self, left, right, sort): tm.assert_frame_equal(merged_left_right, merge_right_left) + def test_merge_multiple_cols_with_mixed_cols_index(self): + # GH29522 + s = pd.Series( + range(6), + pd.MultiIndex.from_product([["A", "B"], [1, 2, 3]], names=["lev1", "lev2"]), + name="Amount", + ) + df = pd.DataFrame( + {"lev1": list("AAABBB"), "lev2": [1, 2, 3, 1, 2, 3], "col": 0} + ) + result = pd.merge(df, s.reset_index(), on=["lev1", "lev2"]) + expected = pd.DataFrame( + { + "lev1": list("AAABBB"), + "lev2": [1, 2, 3, 1, 2, 3], + "col": [0] * 6, + "Amount": range(6), + } + ) + tm.assert_frame_equal(result, expected) + def test_compress_group_combinations(self): # ~ 40000000 possible unique groups @@ -453,17 +472,13 @@ def test_merge_datetime_index(self, klass): if klass is not None: on_vector = klass(on_vector) - expected = DataFrame( - OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) - ) + expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]}) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict( - [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] - ) + {"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]} ) result = df.merge(df, on=[df.index.year], how="inner") @@ -767,17 +782,13 @@ def test_merge_datetime_index(self, box): if box is not None: on_vector = box(on_vector) - expected = DataFrame( - OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) - ) + expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]}) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict( - [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] - ) + {"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]} ) result = df.merge(df, on=[df.index.year], how="inner") @@ -807,3 +818,22 @@ def test_single_common_level(self): ).set_index(["key", "X", "Y"]) tm.assert_frame_equal(result, expected) + + def test_join_multi_wrong_order(self): + # GH 25760 + # GH 28956 + + midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"]) + midx3 = pd.MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"]) + + left = pd.DataFrame(index=midx1, data={"x": [10, 20, 30, 40]}) + 
right = pd.DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]}) + + result = left.join(right) + + expected = pd.DataFrame( + index=midx1, + data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]}, + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_pivot_old.py b/pandas/tests/reshape/merge/test_pivot_old.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 5c930e01c735d..990669f1ae13a 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -27,9 +27,10 @@ isna, read_csv, ) -import pandas.core.common as com +import pandas._testing as tm +from pandas.core.arrays import SparseArray +from pandas.core.construction import create_series_with_explicit_dtype from pandas.tests.extension.decimal import to_decimal -import pandas.util.testing as tm @pytest.fixture(params=[True, False]) @@ -38,16 +39,6 @@ def sort(request): return request.param -@pytest.fixture(params=[True, False, None]) -def sort_with_none(request): - """Boolean sort keyword for concat and DataFrame.append. - - Includes the default of None - """ - # TODO: Replace with sort once keyword changes. - return request.param - - class TestConcatAppendCommon: """ Test common dtype coercion rules between concat and append. @@ -768,25 +759,6 @@ def test_concat_categorical_empty(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) - def test_concat_join_axes_deprecated(self, axis): - # GH21951 - one = pd.DataFrame([[0.0, 1.0], [2.0, 3.0]], columns=list("ab")) - two = pd.DataFrame( - [[10.0, 11.0], [12.0, 13.0]], index=[1, 2], columns=list("bc") - ) - - expected = pd.concat([one, two], axis=1, sort=False).reindex(index=two.index) - with tm.assert_produces_warning(expected_warning=FutureWarning): - result = pd.concat([one, two], axis=1, sort=False, join_axes=[two.index]) - tm.assert_frame_equal(result, expected) - - expected = pd.concat([one, two], axis=0, sort=False).reindex( - columns=two.columns - ) - with tm.assert_produces_warning(expected_warning=FutureWarning): - result = pd.concat([one, two], axis=0, sort=False, join_axes=[two.columns]) - tm.assert_frame_equal(result, expected) - class TestAppend: def test_append(self, sort, float_frame): @@ -876,27 +848,19 @@ def test_append_records(self): tm.assert_frame_equal(result, expected) # rewrite sort fixture, since we also want to test default of None - def test_append_sorts(self, sort_with_none): + def test_append_sorts(self, sort): df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) df2 = pd.DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) - if sort_with_none is None: - # only warn if not explicitly specified - # don't check stacklevel since its set for concat, and append - # has an extra stack. 
- ctx = tm.assert_produces_warning(FutureWarning, check_stacklevel=False) - else: - ctx = tm.assert_produces_warning(None) - - with ctx: - result = df1.append(df2, sort=sort_with_none) + with tm.assert_produces_warning(None): + result = df1.append(df2, sort=sort) # for None / True expected = pd.DataFrame( {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, columns=["a", "b", "c"], ) - if sort_with_none is False: + if sort is False: expected = expected[["b", "a", "c"]] tm.assert_frame_equal(result, expected) @@ -968,7 +932,7 @@ def test_append_preserve_index_name(self): all_indexes = indexes_can_append + indexes_cannot_append_with_other - @pytest.mark.parametrize("index", all_indexes, ids=lambda x: x.__class__.__name__) + @pytest.mark.parametrize("index", all_indexes, ids=lambda x: type(x).__name__) def test_append_same_columns_type(self, index): # GH18359 @@ -998,7 +962,7 @@ def test_append_same_columns_type(self, index): @pytest.mark.parametrize( "df_columns, series_index", combinations(indexes_can_append, r=2), - ids=lambda x: x.__class__.__name__, + ids=lambda x: type(x).__name__, ) def test_append_different_columns_types(self, df_columns, series_index): # GH18359 @@ -1023,12 +987,12 @@ def test_append_different_columns_types(self, df_columns, series_index): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "index_can_append", indexes_can_append, ids=lambda x: x.__class__.__name__ + "index_can_append", indexes_can_append, ids=lambda x: type(x).__name__ ) @pytest.mark.parametrize( "index_cannot_append_with_other", indexes_cannot_append_with_other, - ids=lambda x: x.__class__.__name__, + ids=lambda x: type(x).__name__, ) def test_append_different_columns_types_raises( self, index_can_append, index_cannot_append_with_other @@ -1264,7 +1228,7 @@ def test_concat_dict(self): "qux": DataFrame(np.random.randn(4, 3)), } - sorted_keys = com.dict_keys_to_ordered_list(frames) + sorted_keys = list(frames.keys()) result = concat(frames) expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) @@ -1879,7 +1843,7 @@ def test_concat_iterables(self): tm.assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) class CustomIterator1: - def __len__(self): + def __len__(self) -> int: return 2 def __getitem__(self, index): @@ -2196,7 +2160,7 @@ def test_concat_period_other_series(self): def test_concat_empty_series(self): # GH 11082 s1 = pd.Series([1, 2, 3], name="x") - s2 = pd.Series(name="y") + s2 = pd.Series(name="y", dtype="float64") res = pd.concat([s1, s2], axis=1) exp = pd.DataFrame( {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]}, @@ -2205,7 +2169,7 @@ def test_concat_empty_series(self): tm.assert_frame_equal(res, exp) s1 = pd.Series([1, 2, 3], name="x") - s2 = pd.Series(name="y") + s2 = pd.Series(name="y", dtype="float64") res = pd.concat([s1, s2], axis=0) # name will be reset exp = pd.Series([1, 2, 3]) @@ -2213,7 +2177,7 @@ def test_concat_empty_series(self): # empty Series with no name s1 = pd.Series([1, 2, 3], name="x") - s2 = pd.Series(name=None) + s2 = pd.Series(name=None, dtype="float64") res = pd.concat([s1, s2], axis=1) exp = pd.DataFrame( {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, @@ -2228,7 +2192,9 @@ def test_concat_empty_series_timelike(self, tz, values): # GH 18447 first = Series([], dtype="M8[ns]").dt.tz_localize(tz) - second = Series(values) + dtype = None if values else np.float64 + second = Series(values, dtype=dtype) + expected = DataFrame( { 0: pd.Series([pd.NaT] * len(values), 
dtype="M8[ns]").dt.tz_localize(tz), @@ -2588,7 +2554,8 @@ def test_concat_odered_dict(self): @pytest.mark.parametrize("dt", np.sctypes["float"]) def test_concat_no_unnecessary_upcast(dt, pdt): # GH 13247 - dims = pdt().ndim + dims = pdt(dtype=object).ndim + dfs = [ pdt(np.array([1], dtype=dt, ndmin=dims)), pdt(np.array([np.nan], dtype=dt, ndmin=dims)), @@ -2598,7 +2565,7 @@ def test_concat_no_unnecessary_upcast(dt, pdt): assert x.values.dtype == dt -@pytest.mark.parametrize("pdt", [pd.Series, pd.DataFrame]) +@pytest.mark.parametrize("pdt", [create_series_with_explicit_dtype, pd.DataFrame]) @pytest.mark.parametrize("dt", np.sctypes["int"]) def test_concat_will_upcast(dt, pdt): with catch_warnings(record=True): @@ -2624,13 +2591,14 @@ def test_concat_empty_and_non_empty_frame_regression(): def test_concat_empty_and_non_empty_series_regression(): # GH 18187 regression test s1 = pd.Series([1]) - s2 = pd.Series([]) + s2 = pd.Series([], dtype=object) + expected = s1 result = pd.concat([s1, s2]) tm.assert_series_equal(result, expected) -def test_concat_sorts_columns(sort_with_none): +def test_concat_sorts_columns(sort): # GH-4588 df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]}) @@ -2641,22 +2609,16 @@ def test_concat_sorts_columns(sort_with_none): columns=["a", "b", "c"], ) - if sort_with_none is False: + if sort is False: expected = expected[["b", "a", "c"]] - if sort_with_none is None: - # only warn if not explicitly specified - ctx = tm.assert_produces_warning(FutureWarning) - else: - ctx = tm.assert_produces_warning(None) - # default - with ctx: - result = pd.concat([df1, df2], ignore_index=True, sort=sort_with_none) + with tm.assert_produces_warning(None): + result = pd.concat([df1, df2], ignore_index=True, sort=sort) tm.assert_frame_equal(result, expected) -def test_concat_sorts_index(sort_with_none): +def test_concat_sorts_index(sort): df1 = pd.DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"]) df2 = pd.DataFrame({"b": [1, 2]}, index=["a", "b"]) @@ -2664,22 +2626,16 @@ def test_concat_sorts_index(sort_with_none): expected = pd.DataFrame( {"a": [2, 3, 1], "b": [1, 2, None]}, index=["a", "b", "c"], columns=["a", "b"] ) - if sort_with_none is False: + if sort is False: expected = expected.loc[["c", "a", "b"]] - if sort_with_none is None: - # only warn if not explicitly specified - ctx = tm.assert_produces_warning(FutureWarning) - else: - ctx = tm.assert_produces_warning(None) - # Warn and sort by default - with ctx: - result = pd.concat([df1, df2], axis=1, sort=sort_with_none) + with tm.assert_produces_warning(None): + result = pd.concat([df1, df2], axis=1, sort=sort) tm.assert_frame_equal(result, expected) -def test_concat_inner_sort(sort_with_none): +def test_concat_inner_sort(sort): # https://github.com/pandas-dev/pandas/pull/20613 df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]) df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4]) @@ -2687,12 +2643,10 @@ def test_concat_inner_sort(sort_with_none): with tm.assert_produces_warning(None): # unset sort should *not* warn for inner join # since that never sorted - result = pd.concat( - [df1, df2], sort=sort_with_none, join="inner", ignore_index=True - ) + result = pd.concat([df1, df2], sort=sort, join="inner", ignore_index=True) expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"]) - if sort_with_none is True: + if sort is True: expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) @@ 
-2747,6 +2701,22 @@ def test_concat_categorical_tz(): tm.assert_series_equal(result, expected) +def test_concat_categorical_unchanged(): + # GH-12007 + # test fix for when concat on categorical and float + # coerces dtype categorical -> float + df = pd.DataFrame(pd.Series(["a", "b", "c"], dtype="category", name="A")) + ser = pd.Series([0, 1, 2], index=[0, 1, 3], name="B") + result = pd.concat([df, ser], axis=1) + expected = pd.DataFrame( + { + "A": pd.Series(["a", "b", "c", np.nan], dtype="category"), + "B": pd.Series([0, 1, np.nan, 2], dtype="float"), + } + ) + tm.assert_equal(result, expected) + + def test_concat_datetimeindex_freq(): # GH 3232 # Monotonic index result @@ -2759,5 +2729,24 @@ def test_concat_datetimeindex_freq(): # Non-monotonic index result result = pd.concat([expected[50:], expected[:50]]) expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) - expected.index.freq = None + expected.index._data.freq = None + tm.assert_frame_equal(result, expected) + + +def test_concat_empty_df_object_dtype(): + # GH 9149 + df_1 = pd.DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]}) + df_2 = pd.DataFrame(columns=df_1.columns) + result = pd.concat([df_1, df_2], axis=0) + expected = df_1.astype(object) + tm.assert_frame_equal(result, expected) + + +def test_concat_sparse(): + # GH 23557 + a = pd.Series(SparseArray([0, 1, 2])) + expected = pd.DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype( + pd.SparseDtype(np.int64, 0) + ) + result = pd.concat([a, a], axis=1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 611c3272c123f..13b6f05ed304a 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -19,9 +19,9 @@ timedelta_range, to_datetime, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT import pandas.core.reshape.tile as tmod -import pandas.util.testing as tm def test_simple(): @@ -603,3 +603,12 @@ def test_cut_bool_coercion_to_int(bins, box, compare): expected = cut(data_expected, bins, duplicates="drop") result = cut(data_result, bins, duplicates="drop") compare(result, expected) + + +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_cut_incorrect_labels(labels): + # GH 13318 + values = range(5) + msg = "Bin labels must either be False, None or passed in as a list-like argument" + with pytest.raises(ValueError, match=msg): + cut(values, 4, labels=labels) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index b1d790644bbfb..814325844cb4c 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, lreshape, melt, wide_to_long -import pandas.util.testing as tm +import pandas._testing as tm class TestMelt: @@ -270,6 +270,16 @@ def test_pandas_dtypes(self, col): expected.columns = ["klass", "col", "attribute", "value"] tm.assert_frame_equal(result, expected) + def test_preserve_category(self): + # GH 15853 + data = DataFrame({"A": [1, 2], "B": pd.Categorical(["X", "Y"])}) + result = pd.melt(data, ["B"], ["A"]) + expected = DataFrame( + {"B": pd.Categorical(["X", "Y"]), "variable": ["A", "A"], "value": [1, 2]} + ) + + tm.assert_frame_equal(result, expected) + def test_melt_missing_columns_raises(self): # GH-23575 # This test is to ensure that pandas raises an error if melting is @@ -307,6 +317,22 @@ def test_melt_missing_columns_raises(self): ): multi.melt(["A"], ["F"], 
col_level=0) + def test_melt_mixed_int_str_id_vars(self): + # GH 29718 + df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) + result = melt(df, id_vars=[0, "a"], value_vars=["b", "d"]) + expected = DataFrame( + {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} + ) + tm.assert_frame_equal(result, expected) + + def test_melt_mixed_int_str_value_vars(self): + # GH 29718 + df = DataFrame({0: ["foo"], "a": ["bar"]}) + result = melt(df, value_vars=[0, "a"]) + expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]}) + tm.assert_frame_equal(result, expected) + class TestLreshape: def test_pairs(self): @@ -527,6 +553,9 @@ def test_pairs(self): exp = DataFrame(exp_data, columns=result.columns) tm.assert_frame_equal(result, exp) + with tm.assert_produces_warning(FutureWarning): + result = lreshape(df, spec, dropna=False, label="foo") + spec = { "visitdt": ["visitdt{i:d}".format(i=i) for i in range(1, 3)], "wt": ["wt{i:d}".format(i=i) for i in range(1, 4)], diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index a8386d21ba27f..743fc50c87e96 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import date, datetime, timedelta from itertools import product @@ -16,9 +15,9 @@ concat, date_range, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.pivot import crosstab, pivot_table -import pandas.util.testing as tm @pytest.fixture(params=[True, False]) @@ -581,23 +580,23 @@ def test_pivot_tz_in_values(self): df = pd.DataFrame( [ { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), }, { - "uid": u"aa", + "uid": "aa", "ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"), }, ] @@ -1044,7 +1043,7 @@ def test_pivot_columns_lexsorted(self): assert pivoted.columns.is_monotonic def test_pivot_complex_aggfunc(self): - f = OrderedDict([("D", ["std"]), ("E", ["sum"])]) + f = {"D": ["std"], "E": ["sum"]} expected = self.data.groupby(["A", "B"]).agg(f).unstack("B") result = self.data.pivot_table(index="A", columns="B", aggfunc=f) @@ -1656,6 +1655,24 @@ def test_categorical_margins_category(self, observed): table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) + def test_margins_casted_to_float(self, observed): + # GH 24893 + df = pd.DataFrame( + { + "A": [2, 4, 6, 8], + "B": [1, 4, 5, 8], + "C": [1, 3, 4, 6], + "D": ["X", "X", "Y", "Y"], + } + ) + + result = pd.pivot_table(df, index="D", margins=True) + expected = pd.DataFrame( + {"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]}, + index=pd.Index(["X", "Y", "All"], name="D"), + ) + tm.assert_frame_equal(result, expected) + def test_pivot_with_categorical(self, observed, ordered_fixture): # gh-21370 idx = [np.nan, "low", "high", "low", np.nan] @@ -1948,6 +1965,31 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna): tm.assert_frame_equal(result, expected) + def test_pivot_table_empty_aggfunc(self): + # GH 9186 + df = pd.DataFrame( + { + "A": [2, 2, 3, 3, 2], + "id": [5, 6, 7, 8, 9], + "C": ["p", "q", "q", 
"p", "q"], + "D": [None, None, None, None, None], + } + ) + result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size) + expected = pd.DataFrame() + tm.assert_frame_equal(result, expected) + + def test_pivot_table_no_column_raises(self): + # GH 10326 + def agg(l): + return np.mean(l) + + foo = pd.DataFrame( + {"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]} + ) + with pytest.raises(KeyError, match="notpresent"): + foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) + class TestCrosstab: def setup_method(self, method): diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index eca9b11bd4364..95406a5ebf4f7 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -18,9 +18,9 @@ qcut, timedelta_range, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.algorithms import quantile -import pandas.util.testing as tm from pandas.tseries.offsets import Day, Nano @@ -130,6 +130,38 @@ def test_qcut_return_intervals(): tm.assert_series_equal(res, exp) +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_qcut_incorrect_labels(labels): + # GH 13318 + values = range(5) + msg = "Bin labels must either be False, None or passed in as a list-like argument" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=labels) + + +@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))]) +def test_qcut_wrong_length_labels(labels): + # GH 13318 + values = range(10) + msg = "Bin labels must be one fewer than the number of bin edges" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=labels) + + +@pytest.mark.parametrize( + "labels, expected", + [ + (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)), + (list(range(3)), Categorical([0, 1, 2], ordered=True)), + ], +) +def test_qcut_list_like_labels(labels, expected): + # GH 13318 + values = range(3) + result = qcut(values, 3, labels=labels) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize( "kwargs,msg", [ diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 2e94eeba1d05b..f25291f4aef12 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, get_dummies +import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray, SparseDtype -import pandas.util.testing as tm class TestGetDummies: @@ -45,7 +45,7 @@ def test_basic(self, sparse, dtype): dtype=self.effective_dtype(dtype), ) if sparse: - expected = expected.apply(pd.SparseArray, fill_value=0.0) + expected = expected.apply(SparseArray, fill_value=0.0) result = get_dummies(s_list, sparse=sparse, dtype=dtype) tm.assert_frame_equal(result, expected) @@ -132,7 +132,7 @@ def test_include_na(self, sparse, dtype): {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype) ) if sparse: - exp = exp.apply(pd.SparseArray, fill_value=0.0) + exp = exp.apply(SparseArray, fill_value=0.0) tm.assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 @@ -145,7 +145,7 @@ def test_include_na(self, sparse, dtype): # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns if sparse: - exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0) + exp_na = exp_na.apply(SparseArray, fill_value=0.0) tm.assert_frame_equal(res_na, exp_na) res_just_na = 
get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype) @@ -167,7 +167,7 @@ def test_unicode(self, sparse): dtype=np.uint8, ) if sparse: - exp = exp.apply(pd.SparseArray, fill_value=0) + exp = exp.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self, df, sparse): @@ -180,10 +180,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse): if sparse: expected = pd.DataFrame( { - "A_a": pd.SparseArray([1, 0, 1], dtype="uint8"), - "A_b": pd.SparseArray([0, 1, 0], dtype="uint8"), - "B_b": pd.SparseArray([1, 1, 0], dtype="uint8"), - "B_c": pd.SparseArray([0, 0, 1], dtype="uint8"), + "A_a": SparseArray([1, 0, 1], dtype="uint8"), + "A_b": SparseArray([0, 1, 0], dtype="uint8"), + "B_b": SparseArray([1, 1, 0], dtype="uint8"), + "B_c": SparseArray([0, 0, 1], dtype="uint8"), } ) @@ -226,7 +226,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected = expected[["C"] + cols] - typ = pd.SparseArray if sparse else pd.Series + typ = SparseArray if sparse else pd.Series expected[cols] = expected[cols].apply(lambda x: typ(x)) tm.assert_frame_equal(result, expected) @@ -423,7 +423,7 @@ def test_basic_drop_first(self, sparse): result = get_dummies(s_list, drop_first=True, sparse=sparse) if sparse: - expected = expected.apply(pd.SparseArray, fill_value=0) + expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) result = get_dummies(s_series, drop_first=True, sparse=sparse) @@ -457,7 +457,7 @@ def test_basic_drop_first_NA(self, sparse): res = get_dummies(s_NA, drop_first=True, sparse=sparse) exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8) if sparse: - exp = exp.apply(pd.SparseArray, fill_value=0) + exp = exp.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res, exp) @@ -466,7 +466,7 @@ def test_basic_drop_first_NA(self, sparse): ["b", np.nan], axis=1 ) if sparse: - exp_na = exp_na.apply(pd.SparseArray, fill_value=0) + exp_na = exp_na.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res_na, exp_na) res_just_na = get_dummies( @@ -480,7 +480,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse): result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8) if sparse: - expected = expected.apply(pd.SparseArray, fill_value=0) + expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): @@ -494,7 +494,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): expected = expected[["C", "A_b", "B_c", "cat_y"]] if sparse: for col in cols: - expected[col] = pd.SparseArray(expected[col]) + expected[col] = SparseArray(expected[col]) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_na(self, df, sparse): @@ -516,7 +516,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): expected = expected.sort_index(axis=1) if sparse: for col in cols: - expected[col] = pd.SparseArray(expected[col]) + expected[col] = SparseArray(expected[col]) tm.assert_frame_equal(result, expected) @@ -645,24 +645,3 @@ def test_reshaping_multi_index_categorical(self): index=dti.rename("major"), ) tm.assert_frame_equal(result, expected) - - -class TestMakeAxisDummies: - def test_preserve_categorical_dtype(self): - # GH13854 - for ordered in [False, True]: - cidx = pd.CategoricalIndex(list("xyz"), 
ordered=ordered) - midx = pd.MultiIndex(levels=[["a"], cidx], codes=[[0, 0], [0, 1]]) - df = DataFrame([[10, 11]], index=midx) - - expected = DataFrame( - [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], index=midx, columns=cidx - ) - - from pandas.core.reshape.reshape import make_axis_dummies - - result = make_axis_dummies(df) - tm.assert_frame_equal(result, expected) - - result = make_axis_dummies(df, transform=lambda x: x) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 9b56c4df6d7de..a503173bd74b1 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestUnionCategoricals: diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index 60c6d7ec3017b..cd518dda4edbf 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -2,8 +2,8 @@ import pytest from pandas import Index, date_range +import pandas._testing as tm from pandas.core.reshape.util import cartesian_product -import pandas.util.testing as tm class TestCartesianProduct: diff --git a/pandas/tests/scalar/interval/test_ops.py b/pandas/tests/scalar/interval/test_ops.py index f560c42617260..2d9f0954af5a8 100644 --- a/pandas/tests/scalar/interval/test_ops.py +++ b/pandas/tests/scalar/interval/test_ops.py @@ -59,8 +59,6 @@ def test_overlaps_endpoint(self, start_shift, closed, other_closed): ) def test_overlaps_invalid_type(self, other): interval = Interval(0, 1) - msg = "`other` must be an Interval, got {other}".format( - other=type(other).__name__ - ) + msg = f"`other` must be an Interval, got {type(other).__name__}" with pytest.raises(TypeError, match=msg): interval.overlaps(other) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index bbd97291fab3f..6af9c9884589c 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1,5 +1,7 @@ from datetime import date, datetime, timedelta +from distutils.version import StrictVersion +import dateutil import numpy as np import pytest import pytz @@ -10,12 +12,11 @@ from pandas._libs.tslibs.parsing import DateParseError from pandas._libs.tslibs.period import IncompatibleFrequency from pandas._libs.tslibs.timezones import dateutil_gettz, maybe_get_tz -from pandas.compat import PY35 from pandas.compat.numpy import np_datetime64_compat import pandas as pd from pandas import NaT, Period, Timedelta, Timestamp, offsets -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodConstruction: @@ -1043,6 +1044,7 @@ def test_add_sub_nat(self): assert NaT - p is NaT p = Period("NaT", freq="M") + assert p is NaT assert p + NaT is NaT assert NaT + p is NaT assert p - NaT is NaT @@ -1283,6 +1285,7 @@ def test_add_offset_nat(self): # freq is DateOffset for freq in ["A", "2A", "3A"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [offsets.YearEnd(2)]: assert p + o is NaT assert o + p is NaT @@ -1299,6 +1302,7 @@ def test_add_offset_nat(self): for freq in ["M", "2M", "3M"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p + o is NaT assert o + p is NaT @@ -1316,6 +1320,7 @@ def test_add_offset_nat(self): # freq is Tick for freq in ["D", "2D", "3D"]: p = 
Period("NaT", freq=freq) + assert p is NaT for o in [ offsets.Day(5), offsets.Hour(24), @@ -1339,6 +1344,7 @@ def test_add_offset_nat(self): for freq in ["H", "2H", "3H"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [ offsets.Day(2), offsets.Hour(3), @@ -1438,6 +1444,7 @@ def test_sub_offset_nat(self): # freq is DateOffset for freq in ["A", "2A", "3A"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [offsets.YearEnd(2)]: assert p - o is NaT @@ -1452,6 +1459,7 @@ def test_sub_offset_nat(self): for freq in ["M", "2M", "3M"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p - o is NaT @@ -1467,6 +1475,7 @@ def test_sub_offset_nat(self): # freq is Tick for freq in ["D", "2D", "3D"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [ offsets.Day(5), offsets.Hour(24), @@ -1488,6 +1497,7 @@ def test_sub_offset_nat(self): for freq in ["H", "2H", "3H"]: p = Period("NaT", freq=freq) + assert p is NaT for o in [ offsets.Day(2), offsets.Hour(3), @@ -1510,6 +1520,7 @@ def test_sub_offset_nat(self): @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_nat_ops(self, freq): p = Period("NaT", freq=freq) + assert p is NaT assert p + 1 is NaT assert 1 + p is NaT assert p - 1 is NaT @@ -1546,10 +1557,8 @@ def test_period_immutable(): @pytest.mark.xfail( - # xpassing on MacPython with strict=False - # https://travis-ci.org/MacPython/pandas-wheels/jobs/574706922 - PY35, - reason="Parsing as Period('0007-01-01', 'D') for reasons unknown", + StrictVersion(dateutil.__version__.split(".dev")[0]) < StrictVersion("2.7.0"), + reason="Bug in dateutil < 2.7.0 when parsing old dates: Period('0001-01-07', 'D')", strict=False, ) def test_small_year_parsing(): diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py new file mode 100644 index 0000000000000..7d05511239ebc --- /dev/null +++ b/pandas/tests/scalar/test_na_scalar.py @@ -0,0 +1,266 @@ +import numpy as np +import pytest + +from pandas._libs.missing import NA + +from pandas.core.dtypes.common import is_scalar + +import pandas as pd +import pandas._testing as tm + + +def test_singleton(): + assert NA is NA + new_NA = type(NA)() + assert new_NA is NA + + +def test_repr(): + assert repr(NA) == "" + assert str(NA) == "" + + +def test_truthiness(): + with pytest.raises(TypeError): + bool(NA) + + with pytest.raises(TypeError): + not NA + + +def test_hashable(): + assert hash(NA) == hash(NA) + d = {NA: "test"} + assert d[NA] == "test" + + +def test_arithmetic_ops(all_arithmetic_functions): + op = all_arithmetic_functions + + for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]: + if op.__name__ in ("pow", "rpow", "rmod") and isinstance(other, str): + continue + if op.__name__ in ("divmod", "rdivmod"): + assert op(NA, other) is (NA, NA) + else: + if op.__name__ == "rpow": + # avoid special case + other += 1 + assert op(NA, other) is NA + + +def test_comparison_ops(): + + for other in [NA, 1, 1.0, "a", np.int64(1), np.nan, np.bool_(True)]: + assert (NA == other) is NA + assert (NA != other) is NA + assert (NA > other) is NA + assert (NA >= other) is NA + assert (NA < other) is NA + assert (NA <= other) is NA + assert (other == NA) is NA + assert (other != NA) is NA + assert (other > NA) is NA + assert (other >= NA) is NA + assert (other < NA) is NA + assert (other <= NA) is NA + + +@pytest.mark.parametrize( + "value", + [ + 0, + 0.0, + -0, + -0.0, + False, + np.bool_(False), + np.int_(0), + np.float_(0), + np.int_(-0), + np.float_(-0), + ], +) 
+@pytest.mark.parametrize("asarray", [True, False]) +def test_pow_special(value, asarray): + if asarray: + value = np.array([value]) + result = pd.NA ** value + + if asarray: + result = result[0] + else: + # this assertion isn't possible for ndarray. + assert isinstance(result, type(value)) + assert result == 1 + + +@pytest.mark.parametrize( + "value", + [ + 1, + 1.0, + -1, + -1.0, + True, + np.bool_(True), + np.int_(1), + np.float_(1), + np.int_(-1), + np.float_(-1), + ], +) +@pytest.mark.parametrize("asarray", [True, False]) +def test_rpow_special(value, asarray): + if asarray: + value = np.array([value]) + result = value ** pd.NA + + if asarray: + result = result[0] + elif not isinstance(value, (np.float_, np.bool_, np.int_)): + # this assertion isn't possible with asarray=True + assert isinstance(result, type(value)) + + assert result == value + + +def test_unary_ops(): + assert +NA is NA + assert -NA is NA + assert abs(NA) is NA + assert ~NA is NA + + +def test_logical_and(): + + assert NA & True is NA + assert True & NA is NA + assert NA & False is False + assert False & NA is False + assert NA & NA is NA + + with pytest.raises(TypeError): + NA & 5 + + +def test_logical_or(): + + assert NA | True is True + assert True | NA is True + assert NA | False is NA + assert False | NA is NA + assert NA | NA is NA + + with pytest.raises(TypeError): + NA | 5 + + +def test_logical_xor(): + + assert NA ^ True is NA + assert True ^ NA is NA + assert NA ^ False is NA + assert False ^ NA is NA + assert NA ^ NA is NA + + with pytest.raises(TypeError): + NA ^ 5 + + +def test_logical_not(): + assert ~NA is NA + + +@pytest.mark.parametrize( + "shape", [(3,), (3, 3), (1, 2, 3)], +) +def test_arithmetic_ndarray(shape, all_arithmetic_functions): + op = all_arithmetic_functions + a = np.zeros(shape) + if op.__name__ == "pow": + a += 5 + result = op(pd.NA, a) + expected = np.full(a.shape, pd.NA, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + +def test_is_scalar(): + assert is_scalar(NA) is True + + +def test_isna(): + assert pd.isna(NA) is True + assert pd.notna(NA) is False + + +def test_series_isna(): + s = pd.Series([1, NA], dtype=object) + expected = pd.Series([False, True]) + tm.assert_series_equal(s.isna(), expected) + + +def test_ufunc(): + assert np.log(pd.NA) is pd.NA + assert np.add(pd.NA, 1) is pd.NA + result = np.divmod(pd.NA, 1) + assert result[0] is pd.NA and result[1] is pd.NA + + result = np.frexp(pd.NA) + assert result[0] is pd.NA and result[1] is pd.NA + + +def test_ufunc_raises(): + with pytest.raises(ValueError, match="ufunc method 'at'"): + np.log.at(pd.NA, 0) + + +def test_binary_input_not_dunder(): + a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + result = np.logaddexp(a, pd.NA) + tm.assert_numpy_array_equal(result, expected) + + result = np.logaddexp(pd.NA, a) + tm.assert_numpy_array_equal(result, expected) + + # all NA, multiple inputs + assert np.logaddexp(pd.NA, pd.NA) is pd.NA + + result = np.modf(pd.NA, pd.NA) + assert len(result) == 2 + assert all(x is pd.NA for x in result) + + +def test_divmod_ufunc(): + # binary in, binary out. 
+ a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + + result = np.divmod(a, pd.NA) + assert isinstance(result, tuple) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + result = np.divmod(pd.NA, a) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + +def test_integer_hash_collision_dict(): + # GH 30013 + result = {NA: "foo", hash(NA): "bar"} + + assert result[NA] == "foo" + assert result[hash(NA)] == "bar" + + +def test_integer_hash_collision_set(): + # GH 30013 + result = {NA, hash(NA)} + + assert len(result) == 2 + assert NA in result + assert hash(NA) in result diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 79608f4fb3cde..a537f000959e3 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -21,9 +21,9 @@ Timestamp, isna, ) +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.ops import roperator -import pandas.util.testing as tm @pytest.mark.parametrize( @@ -123,6 +123,13 @@ def test_round_nat(klass, method, freq): "dst", "fromordinal", "fromtimestamp", + pytest.param( + "fromisocalendar", + marks=pytest.mark.skipif( + not compat.PY38, + reason="'fromisocalendar' was added in stdlib datetime in python 3.8", + ), + ), "isocalendar", "strftime", "strptime", @@ -141,7 +148,7 @@ def test_round_nat(klass, method, freq): ) def test_nat_methods_raise(method): # see gh-9513, gh-17329 - msg = "NaTType does not support {method}".format(method=method) + msg = f"NaTType does not support {method}" with pytest.raises(ValueError, match=msg): getattr(NaT, method)() @@ -297,6 +304,8 @@ def test_overlap_public_nat_methods(klass, expected): # "fromisoformat" was introduced in 3.7 if klass is Timestamp and not compat.PY37: expected.remove("fromisoformat") + + # "fromisocalendar" was introduced in 3.8 if klass is Timestamp and not compat.PY38: expected.remove("fromisocalendar") diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 2ba55b22a7c54..3764d9b7548fc 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -9,8 +9,8 @@ import pandas as pd from pandas import NaT, Timedelta, Timestamp, offsets +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm class TestTimedeltaAdditionSubtraction: @@ -241,6 +241,57 @@ def test_td_add_mixed_timedeltalike_object_dtype_array(self, op): res = op(arr, Timedelta("1D")) tm.assert_numpy_array_equal(res, exp) + # TODO: moved from index tests following #24365, may need de-duplication + def test_ops_ndarray(self): + td = Timedelta("1 day") + + # timedelta, timedelta + other = pd.to_timedelta(["1 day"]).values + expected = pd.to_timedelta(["2 days"]).values + tm.assert_numpy_array_equal(td + other, expected) + tm.assert_numpy_array_equal(other + td, expected) + msg = r"unsupported operand type\(s\) for \+: 'Timedelta' and 'int'" + with pytest.raises(TypeError, match=msg): + td + np.array([1]) + msg = r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and 'Timedelta'" + with pytest.raises(TypeError, match=msg): + np.array([1]) + td + + expected = pd.to_timedelta(["0 days"]).values + tm.assert_numpy_array_equal(td - other, expected) + tm.assert_numpy_array_equal(-other + td, expected) + msg = r"unsupported operand 
type\(s\) for -: 'Timedelta' and 'int'" + with pytest.raises(TypeError, match=msg): + td - np.array([1]) + msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and 'Timedelta'" + with pytest.raises(TypeError, match=msg): + np.array([1]) - td + + expected = pd.to_timedelta(["2 days"]).values + tm.assert_numpy_array_equal(td * np.array([2]), expected) + tm.assert_numpy_array_equal(np.array([2]) * td, expected) + msg = ( + "ufunc '?multiply'? cannot use operands with types" + r" dtype\(' bool: + return isinstance(other, Inf) inf = Inf() timestamp = Timestamp("2018-11-30") diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 652dd34ca7ce2..f1fcf46a936fd 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -14,12 +14,13 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import dateutil_gettz as gettz, get_timezone +import pandas.compat as compat from pandas.compat.numpy import np_datetime64_compat from pandas.errors import OutOfBoundsDatetime import pandas.util._test_decorators as td from pandas import NaT, Period, Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries import offsets @@ -108,9 +109,7 @@ def check(value, equal): ) def test_names(self, data, time_locale): # GH 17354 - # Test .weekday_name, .day_name(), .month_name - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert data.weekday_name == "Monday" + # Test .day_name(), .month_name if time_locale is None: expected_day = "Monday" expected_month = "August" @@ -194,24 +193,26 @@ def test_resolution(self): dt = Timestamp("2100-01-01 00:00:00") assert dt.resolution == Timedelta(nanoseconds=1) + # Check that the attribute is available on the class, mirroring + # the stdlib datetime behavior + assert Timestamp.resolution == Timedelta(nanoseconds=1) + class TestTimestampConstructors: def test_constructor(self): base_str = "2014-07-01 09:00" base_dt = datetime(2014, 7, 1, 9) - base_expected = 1404205200000000000 + base_expected = 1_404_205_200_000_000_000 # confirm base representation is correct - import calendar - - assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected + assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected tests = [ (base_str, base_dt, base_expected), ( "2014-07-01 10:00", datetime(2014, 7, 1, 10), - base_expected + 3600 * 1000000000, + base_expected + 3600 * 1_000_000_000, ), ( "2014-07-01 09:00:00.000008000", @@ -250,7 +251,7 @@ def test_constructor(self): # with timezone for tz, offset in timezones: for result in [Timestamp(date_str, tz=tz), Timestamp(date, tz=tz)]: - expected_tz = expected - offset * 3600 * 1000000000 + expected_tz = expected - offset * 3600 * 1_000_000_000 assert result.value == expected_tz assert conversion.pydt_to_i8(result) == expected_tz @@ -264,7 +265,7 @@ def test_constructor(self): result = Timestamp(result).tz_convert("UTC") else: result = Timestamp(result, tz="UTC") - expected_utc = expected - offset * 3600 * 1000000000 + expected_utc = expected - offset * 3600 * 1_000_000_000 assert result.value == expected_utc assert conversion.pydt_to_i8(result) == expected_utc @@ -272,16 +273,14 @@ def test_constructor_with_stringoffset(self): # GH 7833 base_str = "2014-07-01 11:00:00+02:00" base_dt = datetime(2014, 7, 1, 9) - base_expected = 1404205200000000000 + base_expected = 1_404_205_200_000_000_000 # confirm base 
representation is correct - import calendar - - assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected + assert calendar.timegm(base_dt.timetuple()) * 1_000_000_000 == base_expected tests = [ (base_str, base_expected), - ("2014-07-01 12:00:00+02:00", base_expected + 3600 * 1000000000), + ("2014-07-01 12:00:00+02:00", base_expected + 3600 * 1_000_000_000), ("2014-07-01 11:00:00.000008000+02:00", base_expected + 8000), ("2014-07-01 11:00:00.000000005+02:00", base_expected + 5), ] @@ -677,11 +676,13 @@ def test_constructor_invalid_frequency(self): Timestamp("2012-01-01", freq=[]) @pytest.mark.parametrize("box", [datetime, Timestamp]) - def test_depreciate_tz_and_tzinfo_in_datetime_input(self, box): + def test_raise_tz_and_tzinfo_in_datetime_input(self, box): # GH 23579 kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": utc} - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): Timestamp(box(**kwargs), tz="US/Pacific") + with pytest.raises(ValueError, match="Cannot pass a datetime or Timestamp"): + Timestamp(box(**kwargs), tzinfo=pytz.timezone("US/Pacific")) def test_dont_convert_dateutil_utc_to_pytz_utc(self): result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) @@ -700,6 +701,19 @@ class SubDatetime(datetime): expected = Timestamp(2000, 1, 1) assert result == expected + @pytest.mark.skipif( + not compat.PY38, + reason="datetime.fromisocalendar was added in Python version 3.8", + ) + def test_constructor_fromisocalendar(self): + # GH 30395 + expected_timestamp = Timestamp("2000-01-03 00:00:00") + expected_stdlib = datetime.fromisocalendar(2000, 1, 1) + result = Timestamp.fromisocalendar(2000, 1, 1) + assert result == expected_timestamp + assert result == expected_stdlib + assert isinstance(result, Timestamp) + class TestTimestamp: def test_tz(self): @@ -725,7 +739,7 @@ def test_utc_z_designator(self): assert get_timezone(Timestamp("2014-11-02 01:00Z").tzinfo) is utc def test_asm8(self): - np.random.seed(7960929) + np.random.seed(7_960_929) ns = [Timestamp.min.value, Timestamp.max.value, 1000] for n in ns: @@ -786,7 +800,7 @@ def compare(x, y): ) def test_basics_nanos(self): - val = np.int64(946684800000000000).view("M8[ns]") + val = np.int64(946_684_800_000_000_000).view("M8[ns]") stamp = Timestamp(val.view("i8") + 500) assert stamp.year == 2000 assert stamp.month == 1 @@ -794,7 +808,7 @@ def test_basics_nanos(self): assert stamp.nanosecond == 500 # GH 14415 - val = np.iinfo(np.int64).min + 80000000000000 + val = np.iinfo(np.int64).min + 80_000_000_000_000 stamp = Timestamp(val) assert stamp.year == 1677 assert stamp.month == 9 @@ -807,8 +821,8 @@ def test_basics_nanos(self): [ [946688461000000000, {}], [946688461000000000 / 1000, dict(unit="us")], - [946688461000000000 / 1000000, dict(unit="ms")], - [946688461000000000 / 1000000000, dict(unit="s")], + [946688461000000000 / 1_000_000, dict(unit="ms")], + [946688461000000000 / 1_000_000_000, dict(unit="s")], [10957, dict(unit="D", h=0)], [ (946688461000000000 + 500000) / 1000000000, @@ -852,24 +866,24 @@ def test_roundtrip(self): base = Timestamp("20140101 00:00:00") result = Timestamp(base.value + Timedelta("5ms").value) - assert result == Timestamp(str(base) + ".005000") + assert result == Timestamp(f"{base}.005000") assert result.microsecond == 5000 result = Timestamp(base.value + Timedelta("5us").value) - assert result == Timestamp(str(base) + ".000005") + assert result == Timestamp(f"{base}.000005") assert result.microsecond == 5 result = 
Timestamp(base.value + Timedelta("5ns").value) - assert result == Timestamp(str(base) + ".000000005") + assert result == Timestamp(f"{base}.000000005") assert result.nanosecond == 5 assert result.microsecond == 0 result = Timestamp(base.value + Timedelta("6ms 5us").value) - assert result == Timestamp(str(base) + ".006005") + assert result == Timestamp(f"{base}.006005") assert result.microsecond == 5 + 6 * 1000 result = Timestamp(base.value + Timedelta("200ms 5us").value) - assert result == Timestamp(str(base) + ".200005") + assert result == Timestamp(f"{base}.200005") assert result.microsecond == 5 + 200 * 1000 def test_hash_equivalent(self): @@ -890,12 +904,12 @@ def test_nanosecond_string_parsing(self): ts = Timestamp("2013-05-01 07:15:45.123456789") # GH 7878 expected_repr = "2013-05-01 07:15:45.123456789" - expected_value = 1367392545123456789 + expected_value = 1_367_392_545_123_456_789 assert ts.value == expected_value assert expected_repr in repr(ts) ts = Timestamp("2013-05-01 07:15:45.123456789+09:00", tz="Asia/Tokyo") - assert ts.value == expected_value - 9 * 3600 * 1000000000 + assert ts.value == expected_value - 9 * 3600 * 1_000_000_000 assert expected_repr in repr(ts) ts = Timestamp("2013-05-01 07:15:45.123456789", tz="UTC") @@ -903,7 +917,7 @@ def test_nanosecond_string_parsing(self): assert expected_repr in repr(ts) ts = Timestamp("2013-05-01 07:15:45.123456789", tz="US/Eastern") - assert ts.value == expected_value + 4 * 3600 * 1000000000 + assert ts.value == expected_value + 4 * 3600 * 1_000_000_000 assert expected_repr in repr(ts) # GH 10041 @@ -913,7 +927,7 @@ def test_nanosecond_string_parsing(self): def test_nanosecond_timestamp(self): # GH 7610 - expected = 1293840000000000005 + expected = 1_293_840_000_000_000_005 t = Timestamp("2011-01-01") + offsets.Nano(5) assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')" assert t.value == expected @@ -929,7 +943,7 @@ def test_nanosecond_timestamp(self): assert t.value == expected assert t.nanosecond == 5 - expected = 1293840000000000010 + expected = 1_293_840_000_000_000_010 t = t + offsets.Nano(5) assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')" assert t.value == expected @@ -949,23 +963,23 @@ def test_nanosecond_timestamp(self): class TestTimestampToJulianDate: def test_compare_1700(self): r = Timestamp("1700-06-23").to_julian_date() - assert r == 2342145.5 + assert r == 2_342_145.5 def test_compare_2000(self): r = Timestamp("2000-04-12").to_julian_date() - assert r == 2451646.5 + assert r == 2_451_646.5 def test_compare_2100(self): r = Timestamp("2100-08-12").to_julian_date() - assert r == 2488292.5 + assert r == 2_488_292.5 def test_compare_hour01(self): r = Timestamp("2000-08-12T01:00:00").to_julian_date() - assert r == 2451768.5416666666666666 + assert r == 2_451_768.5416666666666666 def test_compare_hour13(self): r = Timestamp("2000-08-12T13:00:00").to_julian_date() - assert r == 2451769.0416666666666666 + assert r == 2_451_769.0416666666666666 class TestTimestampConversion: diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 424b0c9abdef8..6537f6ccd8432 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -14,7 +14,6 @@ import pandas.util._test_decorators as td from pandas import NaT, Timestamp -import pandas.util.testing as tm class TestTimestampTZOperations: @@ -80,7 +79,6 @@ def test_tz_localize_ambiguous(self): ("2015-03-29 02:30", "Europe/Belgrade"), ], ) - 
@pytest.mark.filterwarnings("ignore::FutureWarning") def test_tz_localize_nonexistent(self, stamp, tz): # GH#13057 ts = Timestamp(stamp) @@ -88,36 +86,21 @@ def test_tz_localize_nonexistent(self, stamp, tz): ts.tz_localize(tz) # GH 22644 with pytest.raises(NonExistentTimeError): - with tm.assert_produces_warning(FutureWarning): - ts.tz_localize(tz, errors="raise") - with tm.assert_produces_warning(FutureWarning): - assert ts.tz_localize(tz, errors="coerce") is NaT + ts.tz_localize(tz, nonexistent="raise") + assert ts.tz_localize(tz, nonexistent="NaT") is NaT - def test_tz_localize_errors_ambiguous(self): + def test_tz_localize_ambiguous_raise(self): # GH#13057 ts = Timestamp("2015-11-1 01:00") with pytest.raises(AmbiguousTimeError): - with tm.assert_produces_warning(FutureWarning): - ts.tz_localize("US/Pacific", errors="coerce") + ts.tz_localize("US/Pacific", ambiguous="raise") - @pytest.mark.filterwarnings("ignore::FutureWarning") - def test_tz_localize_errors_invalid_arg(self): + def test_tz_localize_nonexistent_invalid_arg(self): # GH 22644 tz = "Europe/Warsaw" ts = Timestamp("2015-03-29 02:00:00") with pytest.raises(ValueError): - with tm.assert_produces_warning(FutureWarning): - ts.tz_localize(tz, errors="foo") - - def test_tz_localize_errors_coerce(self): - # GH 22644 - # make sure errors='coerce' gets mapped correctly to nonexistent - tz = "Europe/Warsaw" - ts = Timestamp("2015-03-29 02:00:00") - with tm.assert_produces_warning(FutureWarning): - result = ts.tz_localize(tz, errors="coerce") - expected = ts.tz_localize(tz, nonexistent="NaT") - assert result is expected + ts.tz_localize(tz, nonexistent="foo") @pytest.mark.parametrize( "stamp", @@ -306,15 +289,14 @@ def test_astimezone(self, tzstr): @td.skip_if_windows def test_tz_convert_utc_with_system_utc(self): - from pandas._libs.tslibs.timezones import maybe_get_tz # from system utc to real utc - ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC")) + ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) # check that the time hasn't changed. assert ts == ts.tz_convert(dateutil.tz.tzutc()) # from system utc to real utc - ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC")) + ts = Timestamp("2001-01-05 11:56", tz=timezones.maybe_get_tz("dateutil/UTC")) # check that the time hasn't changed. 
assert ts == ts.tz_convert(dateutil.tz.tzutc()) diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index dffb957b8f3b0..65066fd0099ba 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -7,11 +7,10 @@ from pandas._libs.tslibs import conversion from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG -from pandas.compat import PY36 import pandas.util._test_decorators as td from pandas import NaT, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.frequencies import to_offset @@ -375,7 +374,6 @@ def test_replace_dst_border(self): expected = Timestamp("2013-11-3 03:00:00", tz="America/Chicago") assert result == expected - @pytest.mark.skipif(not PY36, reason="Fold not available until PY3.6") @pytest.mark.parametrize("fold", [0, 1]) @pytest.mark.parametrize("tz", ["dateutil/Europe/London", "Europe/London"]) def test_replace_dst_fold(self, fold, tz): diff --git a/pandas/tests/series/conftest.py b/pandas/tests/series/conftest.py index 18d3c87a01f87..ff0b0c71f88b0 100644 --- a/pandas/tests/series/conftest.py +++ b/pandas/tests/series/conftest.py @@ -1,6 +1,6 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 7509d21b8832f..47f40e24e1637 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Categorical, Series, date_range, isna -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -230,7 +230,7 @@ def test_reindex_with_datetimes(): def test_reindex_corner(datetime_series): # (don't forget to fix this) I think it's fixed - empty = Series() + empty = Series(dtype=object) empty.reindex(datetime_series.index, method="pad") # it works # corner case: pad empty series @@ -539,8 +539,9 @@ def test_drop_with_ignore_errors(): def test_drop_empty_list(index, drop_labels): # GH 21494 expected_index = [i for i in index if i not in drop_labels] - series = pd.Series(index=index).drop(drop_labels) - tm.assert_series_equal(series, pd.Series(index=expected_index)) + series = pd.Series(index=index, dtype=object).drop(drop_labels) + expected = pd.Series(index=expected_index, dtype=object) + tm.assert_series_equal(series, expected) @pytest.mark.parametrize( @@ -554,4 +555,5 @@ def test_drop_empty_list(index, drop_labels): def test_drop_non_empty_list(data, index, drop_labels): # GH 21494 and GH 16877 with pytest.raises(KeyError, match="not found in axis"): - pd.Series(data=data, index=index).drop(drop_labels) + dtype = object if data is None else None + pd.Series(data=data, index=index, dtype=dtype).drop(drop_labels) diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index c2912cf3ce53f..d75efcf52c271 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import Index, Series, Timestamp, date_range, isna +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm from pandas.tseries.offsets import BDay @@ -75,7 +75,7 @@ def test_getitem_boolean_object(string_series): # nans raise exception omask[5:10] = np.nan - msg = "cannot 
index with vector containing NA / NaN values" + msg = "cannot mask with array containing NA / NaN values" with pytest.raises(ValueError, match=msg): s[omask] with pytest.raises(ValueError, match=msg): @@ -285,8 +285,8 @@ def test_where_error(): with pytest.raises(ValueError, match=msg): s[[True, False]] = [0, 2, 3] msg = ( - "NumPy boolean array indexing assignment cannot assign 0 input" - " values to the 1 output values where the mask is true" + "NumPy boolean array indexing assignment cannot assign 0 input " + "values to the 1 output values where the mask is true" ) with pytest.raises(ValueError, match=msg): s[[True, False]] = [] diff --git a/pandas/tests/series/indexing/test_callable.py b/pandas/tests/series/indexing/test_callable.py index 2d879eed967e5..fe575cf146641 100644 --- a/pandas/tests/series/indexing/test_callable.py +++ b/pandas/tests/series/indexing/test_callable.py @@ -1,5 +1,5 @@ import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_getitem_callable(): diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index fab3310fa3dfe..15ff5f6b343d1 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm """ @@ -105,7 +105,7 @@ def test_series_set_value(): dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] index = DatetimeIndex(dates) - s = Series()._set_value(dates[0], 1.0) + s = Series(dtype=object)._set_value(dates[0], 1.0) s2 = s._set_value(dates[1], np.nan) expected = Series([1.0, np.nan], index=index) diff --git a/pandas/tests/series/indexing/test_iloc.py b/pandas/tests/series/indexing/test_iloc.py index eef4d89af3832..f276eb5b0b23d 100644 --- a/pandas/tests/series/indexing/test_iloc.py +++ b/pandas/tests/series/indexing/test_iloc.py @@ -1,7 +1,7 @@ import numpy as np from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm def test_iloc(): diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 4673dabca811b..4601cabf69b52 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import Categorical, DataFrame, MultiIndex, Series, Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay @@ -52,15 +52,11 @@ def test_basic_getitem_with_labels(datetime_series): s = Series(np.random.randn(10), index=list(range(0, 20, 2))) inds = [0, 2, 5, 7, 8] arr_inds = np.array([0, 2, 5, 7, 8]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s[inds] - expected = s.reindex(inds) - tm.assert_series_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + s[inds] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s[arr_inds] - expected = s.reindex(arr_inds) - tm.assert_series_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + s[arr_inds] # GH12089 # with tz for values @@ -109,7 +105,9 @@ def test_getitem_get(datetime_series, string_series, object_series): # None # GH 5652 - for s in [Series(), Series(index=list("abc"))]: + s1 = Series(dtype=object) + s2 = Series(dtype=object, 
index=list("abc")) + for s in [s1, s2]: result = s.get(None) assert result is None @@ -134,7 +132,7 @@ def test_getitem_generator(string_series): def test_type_promotion(): # GH12599 - s = pd.Series() + s = pd.Series(dtype=object) s["a"] = pd.Timestamp("2016-01-01") s["b"] = 3.0 s["c"] = "foo" @@ -172,7 +170,7 @@ def test_getitem_out_of_bounds(datetime_series): datetime_series[len(datetime_series)] # GH #917 - s = Series([]) + s = Series([], dtype=object) with pytest.raises(IndexError, match=msg): s[-1] @@ -262,12 +260,11 @@ def test_getitem_dups_with_missing(): # breaks reindex, so need to use .loc internally # GH 4246 s = Series([1, 2, 3, 4], ["foo", "bar", "foo", "bah"]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = s.loc[["foo", "bar", "bah", "bam"]] + with pytest.raises(KeyError, match="with any missing labels"): + s.loc[["foo", "bar", "bah", "bam"]] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = s[["foo", "bar", "bah", "bam"]] - tm.assert_series_equal(result, expected) + with pytest.raises(KeyError, match="with any missing labels"): + s[["foo", "bar", "bah", "bam"]] def test_getitem_dups(): @@ -297,8 +294,8 @@ def test_getitem_dataframe(): s = pd.Series(10, index=rng) df = pd.DataFrame(rng, index=rng) msg = ( - "Indexing a Series with DataFrame is not supported," - " use the appropriate DataFrame column" + "Indexing a Series with DataFrame is not supported, " + "use the appropriate DataFrame column" ) with pytest.raises(TypeError, match=msg): s[df > 5] @@ -329,12 +326,12 @@ def test_setitem(datetime_series, string_series): # Test for issue #10193 key = pd.Timestamp("2012-01-01") - series = pd.Series() + series = pd.Series(dtype=object) series[key] = 47 expected = pd.Series(47, [key]) tm.assert_series_equal(series, expected) - series = pd.Series([], pd.DatetimeIndex([], freq="D")) + series = pd.Series([], pd.DatetimeIndex([], freq="D"), dtype=object) series[key] = 47 expected = pd.Series(47, pd.DatetimeIndex([key], freq="D")) tm.assert_series_equal(series, expected) @@ -391,6 +388,22 @@ def test_setslice(datetime_series): assert sl.index.is_unique is True +def test_2d_to_1d_assignment_raises(): + x = np.random.randn(2, 2) + y = pd.Series(range(2)) + + msg = ( + r"shape mismatch: value array of shape \(2,2\) could not be" + r" broadcast to indexing result of shape \(2,\)" + ) + with pytest.raises(ValueError, match=msg): + y.loc[range(2)] = x + + msg = r"could not broadcast input array from shape \(2,2\) into shape \(2\)" + with pytest.raises(ValueError, match=msg): + y.loc[:] = x + + # FutureWarning from NumPy about [slice(None, 5). 
@pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") def test_basic_getitem_setitem_corner(datetime_series): @@ -626,7 +639,7 @@ def test_setitem_na(): def test_timedelta_assignment(): # GH 8209 - s = Series([]) + s = Series([], dtype=object) s.loc["B"] = timedelta(1) tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) diff --git a/pandas/tests/series/indexing/test_loc.py b/pandas/tests/series/indexing/test_loc.py index e6b5b5df2b000..7d6b6c78cc492 100644 --- a/pandas/tests/series/indexing/test_loc.py +++ b/pandas/tests/series/indexing/test_loc.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("val,expected", [(2 ** 63 - 1, 3), (2 ** 63, 4)]) diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index bcddcf843df06..ce0d04ff99077 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, Series -import pandas.util.testing as tm +import pandas._testing as tm def test_get(): @@ -86,8 +86,7 @@ def test_get(): 1764.0, 1849.0, 1936.0, - ], - dtype="object", + ] ), ) @@ -124,12 +123,10 @@ def test_get_nan_multiple(): s = pd.Float64Index(range(10)).to_series() idx = [2, 30] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_series_equal(s.get(idx), Series([2, np.nan], index=idx)) + assert s.get(idx) is None idx = [2, np.nan] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_series_equal(s.get(idx), Series([2, np.nan], index=idx)) + assert s.get(idx) is None # GH 17295 - all missing keys idx = [20, 30] @@ -153,7 +150,7 @@ def test_delitem(): tm.assert_series_equal(s, expected) # empty - s = Series() + s = Series(dtype=object) with pytest.raises(KeyError, match=r"^0$"): del s[0] diff --git a/pandas/tests/series/methods/__init__.py b/pandas/tests/series/methods/__init__.py new file mode 100644 index 0000000000000..bcb0d30f405e2 --- /dev/null +++ b/pandas/tests/series/methods/__init__.py @@ -0,0 +1,7 @@ +""" +Test files dedicated to individual (stand-alone) Series methods + +Ideally these files/tests should correspond 1-to-1 with tests.frame.methods + +These may also present opportunities for sharing/de-duplicating test code. 
+""" diff --git a/pandas/tests/series/methods/test_append.py b/pandas/tests/series/methods/test_append.py new file mode 100644 index 0000000000000..dc0fca4bba067 --- /dev/null +++ b/pandas/tests/series/methods/test_append.py @@ -0,0 +1,158 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, Series, date_range +import pandas._testing as tm + + +class TestSeriesAppend: + def test_append(self, datetime_series, string_series, object_series): + appended_series = string_series.append(object_series) + for idx, value in appended_series.items(): + if idx in string_series.index: + assert value == string_series[idx] + elif idx in object_series.index: + assert value == object_series[idx] + else: + raise AssertionError("orphaned index!") + + msg = "Indexes have overlapping values:" + with pytest.raises(ValueError, match=msg): + datetime_series.append(datetime_series, verify_integrity=True) + + def test_append_many(self, datetime_series): + pieces = [datetime_series[:5], datetime_series[5:10], datetime_series[10:]] + + result = pieces[0].append(pieces[1:]) + tm.assert_series_equal(result, datetime_series) + + def test_append_duplicates(self): + # GH 13677 + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(s1.append(s2), exp) + tm.assert_series_equal(pd.concat([s1, s2]), exp) + + # the result must have RangeIndex + exp = pd.Series([1, 2, 3, 4, 5, 6]) + tm.assert_series_equal( + s1.append(s2, ignore_index=True), exp, check_index_type=True + ) + tm.assert_series_equal( + pd.concat([s1, s2], ignore_index=True), exp, check_index_type=True + ) + + msg = "Indexes have overlapping values:" + with pytest.raises(ValueError, match=msg): + s1.append(s2, verify_integrity=True) + with pytest.raises(ValueError, match=msg): + pd.concat([s1, s2], verify_integrity=True) + + def test_append_tuples(self): + # GH 28410 + s = pd.Series([1, 2, 3]) + list_input = [s, s] + tuple_input = (s, s) + + expected = s.append(list_input) + result = s.append(tuple_input) + + tm.assert_series_equal(expected, result) + + +class TestSeriesAppendWithDatetimeIndex: + def test_append(self): + rng = date_range("5/8/2012 1:45", periods=10, freq="5T") + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + + result = ts.append(ts) + result_df = df.append(df) + ex_index = DatetimeIndex(np.tile(rng.values, 2)) + tm.assert_index_equal(result.index, ex_index) + tm.assert_index_equal(result_df.index, ex_index) + + appended = rng.append(rng) + tm.assert_index_equal(appended, ex_index) + + appended = rng.append([rng, rng]) + ex_index = DatetimeIndex(np.tile(rng.values, 3)) + tm.assert_index_equal(appended, ex_index) + + # different index names + rng1 = rng.copy() + rng2 = rng.copy() + rng1.name = "foo" + rng2.name = "bar" + assert rng1.append(rng1).name == "foo" + assert rng1.append(rng2).name is None + + def test_append_tz(self): + # see gh-2938 + rng = date_range("5/8/2012 1:45", periods=10, freq="5T", tz="US/Eastern") + rng2 = date_range("5/8/2012 2:35", periods=10, freq="5T", tz="US/Eastern") + rng3 = date_range("5/8/2012 1:45", periods=20, freq="5T", tz="US/Eastern") + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + 
tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) + + def test_append_tz_explicit_pytz(self): + # see gh-2938 + from pytz import timezone as timezone + + rng = date_range( + "5/8/2012 1:45", periods=10, freq="5T", tz=timezone("US/Eastern") + ) + rng2 = date_range( + "5/8/2012 2:35", periods=10, freq="5T", tz=timezone("US/Eastern") + ) + rng3 = date_range( + "5/8/2012 1:45", periods=20, freq="5T", tz=timezone("US/Eastern") + ) + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) + + def test_append_tz_dateutil(self): + # see gh-2938 + rng = date_range( + "5/8/2012 1:45", periods=10, freq="5T", tz="dateutil/US/Eastern" + ) + rng2 = date_range( + "5/8/2012 2:35", periods=10, freq="5T", tz="dateutil/US/Eastern" + ) + rng3 = date_range( + "5/8/2012 1:45", periods=20, freq="5T", tz="dateutil/US/Eastern" + ) + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py new file mode 100644 index 0000000000000..1fc98ded0d3d2 --- /dev/null +++ b/pandas/tests/series/methods/test_argsort.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from pandas import Series, Timestamp, isna +import pandas._testing as tm + + +class TestSeriesArgsort: + def _check_accum_op(self, name, ser, check_dtype=True): + func = getattr(np, name) + tm.assert_numpy_array_equal( + func(ser).values, func(np.array(ser)), check_dtype=check_dtype, + ) + + # with missing values + ts = ser.copy() + ts[::2] = np.NaN + + result = func(ts)[1::2] + expected = func(np.array(ts.dropna())) + + tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) + + def test_argsort(self, datetime_series): + self._check_accum_op("argsort", datetime_series, check_dtype=False) + argsorted = datetime_series.argsort() + assert issubclass(argsorted.dtype.type, np.integer) + + # GH#2967 (introduced bug in 0.11-dev I think) + s = Series([Timestamp("201301{i:02d}".format(i=i)) for i in range(1, 6)]) + assert s.dtype == "datetime64[ns]" + shifted = s.shift(-1) + assert shifted.dtype == "datetime64[ns]" + assert isna(shifted[4]) + + result = s.argsort() + expected = Series(range(5), dtype="int64") + tm.assert_series_equal(result, expected) + + result = shifted.argsort() + expected = Series(list(range(4)) + [-1], dtype="int64") + tm.assert_series_equal(result, expected) + + def test_argsort_stable(self): + s = Series(np.random.randint(0, 100, size=10000)) + mindexer = s.argsort(kind="mergesort") + qindexer = s.argsort() + + mexpected = np.argsort(s.values, kind="mergesort") + qexpected = np.argsort(s.values, kind="quicksort") + + tm.assert_series_equal(mindexer, Series(mexpected), 
check_dtype=False) + tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) + msg = ( + r"ndarray Expected type <class 'numpy\.ndarray'>," + r" found <class 'pandas\.core\.series\.Series'> instead" + ) + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(qindexer, mindexer) + + def test_argsort_preserve_name(self, datetime_series): + result = datetime_series.argsort() + assert result.name == datetime_series.name diff --git a/pandas/tests/series/test_asof.py b/pandas/tests/series/methods/test_asof.py similarity index 99% rename from pandas/tests/series/test_asof.py rename to pandas/tests/series/methods/test_asof.py index 8bc9e9c38d83a..b121efd202744 100644 --- a/pandas/tests/series/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -2,7 +2,7 @@ import pytest from pandas import Series, Timestamp, date_range, isna, notna, offsets -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesAsof: diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py new file mode 100644 index 0000000000000..37764d3b82c2d --- /dev/null +++ b/pandas/tests/series/methods/test_clip.py @@ -0,0 +1,99 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Series, Timestamp, isna, notna +import pandas._testing as tm + + +class TestSeriesClip: + def test_clip(self, datetime_series): + val = datetime_series.median() + + assert datetime_series.clip(lower=val).min() == val + assert datetime_series.clip(upper=val).max() == val + + result = datetime_series.clip(-0.5, 0.5) + expected = np.clip(datetime_series, -0.5, 0.5) + tm.assert_series_equal(result, expected) + assert isinstance(expected, Series) + + def test_clip_types_and_nulls(self): + + sers = [ + Series([np.nan, 1.0, 2.0, 3.0]), + Series([None, "a", "b", "c"]), + Series(pd.to_datetime([np.nan, 1, 2, 3], unit="D")), + ] + + for s in sers: + thresh = s[2] + lower = s.clip(lower=thresh) + upper = s.clip(upper=thresh) + assert lower[notna(lower)].min() == thresh + assert upper[notna(upper)].max() == thresh + assert list(isna(s)) == list(isna(lower)) + assert list(isna(s)) == list(isna(upper)) + + def test_clip_with_na_args(self): + """Should process np.nan argument as None """ + # GH#17276 + s = Series([1, 2, 3]) + + tm.assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) + tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) + + # GH#19992 + tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan])) + tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1])) + + def test_clip_against_series(self): + # GH#6966 + + s = Series([1.0, 1.0, 4.0]) + + lower = Series([1.0, 2.0, 3.0]) + upper = Series([1.5, 2.5, 3.5]) + + tm.assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5])) + tm.assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5])) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])]) + def test_clip_against_list_like(self, inplace, upper): + # GH#15390 + original = pd.Series([5, 6, 7]) + result = original.clip(upper=upper, inplace=inplace) + expected = pd.Series([1, 2, 3]) + + if inplace: + result = original + tm.assert_series_equal(result, expected, check_exact=True) + + def test_clip_with_datetimes(self): + # GH#11838 + # naive and tz-aware datetimes + + t = Timestamp("2015-12-01 09:30:30") + s = Series([Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:31:00")]) + result = s.clip(upper=t) + expected = Series(
[Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:30:30")] + ) + tm.assert_series_equal(result, expected) + + t = Timestamp("2015-12-01 09:30:30", tz="US/Eastern") + s = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:31:00", tz="US/Eastern"), + ] + ) + result = s.clip(upper=t) + expected = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:30:30", tz="US/Eastern"), + ] + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_count.py b/pandas/tests/series/methods/test_count.py new file mode 100644 index 0000000000000..1ca48eeb7c441 --- /dev/null +++ b/pandas/tests/series/methods/test_count.py @@ -0,0 +1,38 @@ +import numpy as np + +import pandas as pd +from pandas import Categorical, MultiIndex, Series +import pandas._testing as tm + + +class TestSeriesCount: + def test_count(self, datetime_series): + assert datetime_series.count() == len(datetime_series) + + datetime_series[::2] = np.NaN + + assert datetime_series.count() == np.isfinite(datetime_series).sum() + + mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, np.nan, 1, 2]]) + ts = Series(np.arange(len(mi)), index=mi) + + left = ts.count(level=1) + right = Series([2, 3, 1], index=[1, 2, np.nan]) + tm.assert_series_equal(left, right) + + ts.iloc[[0, 3, 5]] = np.nan + tm.assert_series_equal(ts.count(level=1), right - 1) + + # GH#29478 + with pd.option_context("use_inf_as_na", True): + assert pd.Series([pd.Timestamp("1990/1/1")]).count() == 1 + + def test_count_categorical(self): + + ser = Series( + Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + ) + result = ser.count() + assert result == 2 diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py new file mode 100644 index 0000000000000..1f6033d435323 --- /dev/null +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -0,0 +1,158 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import Series, isna +import pandas._testing as tm + + +class TestSeriesCov: + def test_cov(self, datetime_series): + # full overlap + tm.assert_almost_equal( + datetime_series.cov(datetime_series), datetime_series.std() ** 2 + ) + + # partial overlap + tm.assert_almost_equal( + datetime_series[:15].cov(datetime_series[5:]), + datetime_series[5:15].std() ** 2, + ) + + # No overlap + assert np.isnan(datetime_series[::2].cov(datetime_series[1::2])) + + # all NA + cp = datetime_series[:10].copy() + cp[:] = np.nan + assert isna(cp.cov(cp)) + + # min_periods + assert isna(datetime_series[:15].cov(datetime_series[5:], min_periods=12)) + + ts1 = datetime_series[:15].reindex(datetime_series.index) + ts2 = datetime_series[5:].reindex(datetime_series.index) + assert isna(ts1.cov(ts2, min_periods=12)) + + +class TestSeriesCorr: + @td.skip_if_no_scipy + def test_corr(self, datetime_series): + import scipy.stats as stats + + # full overlap + tm.assert_almost_equal(datetime_series.corr(datetime_series), 1) + + # partial overlap + tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]), 1) + + assert isna(datetime_series[:15].corr(datetime_series[5:], min_periods=12)) + + ts1 = datetime_series[:15].reindex(datetime_series.index) + ts2 = datetime_series[5:].reindex(datetime_series.index) + assert isna(ts1.corr(ts2, min_periods=12)) + + # No overlap + assert np.isnan(datetime_series[::2].corr(datetime_series[1::2])) + + # all NA + 
cp = datetime_series[:10].copy() + cp[:] = np.nan + assert isna(cp.corr(cp)) + + A = tm.makeTimeSeries() + B = tm.makeTimeSeries() + result = A.corr(B) + expected, _ = stats.pearsonr(A, B) + tm.assert_almost_equal(result, expected) + + @td.skip_if_no_scipy + def test_corr_rank(self): + import scipy.stats as stats + + # kendall and spearman + A = tm.makeTimeSeries() + B = tm.makeTimeSeries() + A[-5:] = A[:5] + result = A.corr(B, method="kendall") + expected = stats.kendalltau(A, B)[0] + tm.assert_almost_equal(result, expected) + + result = A.corr(B, method="spearman") + expected = stats.spearmanr(A, B)[0] + tm.assert_almost_equal(result, expected) + + # results from R + A = Series( + [ + -0.89926396, + 0.94209606, + -1.03289164, + -0.95445587, + 0.76910310, + -0.06430576, + -2.09704447, + 0.40660407, + -0.89926396, + 0.94209606, + ] + ) + B = Series( + [ + -1.01270225, + -0.62210117, + -1.56895827, + 0.59592943, + -0.01680292, + 1.17258718, + -1.06009347, + -0.10222060, + -0.89076239, + 0.89372375, + ] + ) + kexp = 0.4319297 + sexp = 0.5853767 + tm.assert_almost_equal(A.corr(B, method="kendall"), kexp) + tm.assert_almost_equal(A.corr(B, method="spearman"), sexp) + + def test_corr_invalid_method(self): + # GH PR #22298 + s1 = pd.Series(np.random.randn(10)) + s2 = pd.Series(np.random.randn(10)) + msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " + with pytest.raises(ValueError, match=msg): + s1.corr(s2, method="____") + + def test_corr_callable_method(self, datetime_series): + # simple correlation example + # returns 1 if exact equality, 0 otherwise + my_corr = lambda a, b: 1.0 if (a == b).all() else 0.0 + + # simple example + s1 = Series([1, 2, 3, 4, 5]) + s2 = Series([5, 4, 3, 2, 1]) + expected = 0 + tm.assert_almost_equal(s1.corr(s2, method=my_corr), expected) + + # full overlap + tm.assert_almost_equal( + datetime_series.corr(datetime_series, method=my_corr), 1.0 + ) + + # partial overlap + tm.assert_almost_equal( + datetime_series[:15].corr(datetime_series[5:], method=my_corr), 1.0 + ) + + # No overlap + assert np.isnan( + datetime_series[::2].corr(datetime_series[1::2], method=my_corr) + ) + + # dataframe example + df = pd.DataFrame([s1, s2]) + expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) + tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py new file mode 100644 index 0000000000000..b147a04b11090 --- /dev/null +++ b/pandas/tests/series/methods/test_describe.py @@ -0,0 +1,69 @@ +import numpy as np + +from pandas import Series, Timestamp, date_range +import pandas._testing as tm + + +class TestSeriesDescribe: + def test_describe(self): + s = Series([0, 1, 2, 3, 4], name="int_data") + result = s.describe() + expected = Series( + [5, 2, s.std(), 0, 1, 2, 3, 4], + name="int_data", + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + s = Series([True, True, False, False, False], name="bool_data") + result = s.describe() + expected = Series( + [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"] + ) + tm.assert_series_equal(result, expected) + + s = Series(["a", "a", "b", "c", "d"], name="str_data") + result = s.describe() + expected = Series( + [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"] + ) + tm.assert_series_equal(result, expected) + + def test_describe_empty_object(self): + # 
https://github.com/pandas-dev/pandas/issues/27183 + s = Series([None, None], dtype=object) + result = s.describe() + expected = Series( + [0, 0, np.nan, np.nan], + dtype=object, + index=["count", "unique", "top", "freq"], + ) + tm.assert_series_equal(result, expected) + + result = s[:0].describe() + tm.assert_series_equal(result, expected) + # ensure NaN, not None + assert np.isnan(result.iloc[2]) + assert np.isnan(result.iloc[3]) + + def test_describe_with_tz(self, tz_naive_fixture): + # GH 21332 + tz = tz_naive_fixture + name = str(tz_naive_fixture) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s = Series(date_range(start, end, tz=tz), name=name) + result = s.describe() + expected = Series( + [ + 5, + 5, + s.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + ], + name=name, + index=["count", "unique", "top", "freq", "first", "last"], + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_diff.py b/pandas/tests/series/methods/test_diff.py new file mode 100644 index 0000000000000..033f75e95f11b --- /dev/null +++ b/pandas/tests/series/methods/test_diff.py @@ -0,0 +1,77 @@ +import numpy as np +import pytest + +from pandas import Series, TimedeltaIndex, date_range +import pandas._testing as tm + + +class TestSeriesDiff: + def test_diff_np(self): + pytest.skip("skipping due to Series no longer being an ndarray") + + # no longer works as the return type of np.diff is now nd.array + s = Series(np.arange(5)) + + r = np.diff(s) + tm.assert_series_equal(Series([np.nan, 0, 0, 0, np.nan]), r) + + def test_diff_int(self): + # int dtype + a = 10000000000000000 + b = a + 1 + s = Series([a, b]) + + result = s.diff() + assert result[1] == 1 + + def test_diff_tz(self): + # Combined datetime diff, normal diff and boolean diff test + ts = tm.makeTimeSeries(name="ts") + ts.diff() + + # neg n + result = ts.diff(-1) + expected = ts - ts.shift(-1) + tm.assert_series_equal(result, expected) + + # 0 + result = ts.diff(0) + expected = ts - ts + tm.assert_series_equal(result, expected) + + # datetime diff (GH#3100) + s = Series(date_range("20130102", periods=5)) + result = s.diff() + expected = s - s.shift(1) + tm.assert_series_equal(result, expected) + + # timedelta diff + result = result - result.shift(1) # previous result + expected = expected.diff() # previously expected + tm.assert_series_equal(result, expected) + + # with tz + s = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) + result = s.diff() + expected = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "input,output,diff", + [([False, True, True, False, False], [np.nan, True, False, True, False], 1)], + ) + def test_diff_bool(self, input, output, diff): + # boolean series (test for fixing #17294) + s = Series(input) + result = s.diff() + expected = Series(output) + tm.assert_series_equal(result, expected) + + def test_diff_object_dtype(self): + # object series + s = Series([False, True, 5.0, np.nan, True, False]) + result = s.diff() + expected = s - s.shift(1) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py new file mode 100644 index 0000000000000..2d052505d5ecc --- /dev/null +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -0,0 +1,141 @@ +import numpy as np +import pytest + +from pandas import Categorical, Series +import 
pandas._testing as tm + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, False, False, True, True, False])), + ("last", Series([False, True, True, False, False, False, False])), + (False, Series([False, True, True, False, True, True, False])), + ], +) +def test_drop_duplicates(any_numpy_dtype, keep, expected): + tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) + + if tc.dtype == "bool": + pytest.skip("tested separately in test_drop_duplicates_bool") + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, True])), + ("last", Series([True, True, False, False])), + (False, Series([True, True, True, True])), + ], +) +def test_drop_duplicates_bool(keep, expected): + tc = Series([True, False, True, False]) + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + +class TestSeriesDropDuplicates: + @pytest.mark.parametrize( + "dtype", + ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], + ) + def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): + cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) + + # Test case 1 + input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) + tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) + if dtype == "datetime64[D]": + # pre-empty flaky xfail, tc1 values are seemingly-random + if not (np.array(tc1) == input1).all(): + pytest.xfail(reason="GH#7996") + + expected = Series([False, False, False, True]) + tm.assert_series_equal(tc1.duplicated(), expected) + tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + expected = Series([False, False, True, False]) + tm.assert_series_equal(tc1.duplicated(keep="last"), expected) + tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(keep="last", inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + expected = Series([False, False, True, True]) + tm.assert_series_equal(tc1.duplicated(keep=False), expected) + tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + # Test case 2 + input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) + tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) + if dtype == "datetime64[D]": + # pre-empty flaky xfail, tc2 values are seemingly-random + if not (np.array(tc2) == input2).all(): + pytest.xfail(reason="GH#7996") + + expected = Series([False, False, False, False, True, True, False]) + tm.assert_series_equal(tc2.duplicated(), expected) + tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + expected = Series([False, True, True, False, False, False, False]) + tm.assert_series_equal(tc2.duplicated(keep="last"), expected) + 
tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(keep="last", inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + expected = Series([False, True, True, False, True, True, False]) + tm.assert_series_equal(tc2.duplicated(keep=False), expected) + tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + def test_drop_duplicates_categorical_bool(self, ordered_fixture): + tc = Series( + Categorical( + [True, False, True, False], + categories=[True, False], + ordered=ordered_fixture, + ) + ) + + expected = Series([False, False, True, True]) + tm.assert_series_equal(tc.duplicated(), expected) + tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, False, False]) + tm.assert_series_equal(tc.duplicated(keep="last"), expected) + tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep="last", inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, True, True]) + tm.assert_series_equal(tc.duplicated(keep=False), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py new file mode 100644 index 0000000000000..5cc297913e851 --- /dev/null +++ b/pandas/tests/series/methods/test_duplicated.py @@ -0,0 +1,35 @@ +import numpy as np +import pytest + +from pandas import Series +import pandas._testing as tm + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True], name="name")), + ("last", Series([True, True, False, False, False], name="name")), + (False, Series([True, True, True, False, True], name="name")), + ], +) +def test_duplicated_keep(keep, expected): + ser = Series(["a", "b", "b", "c", "a"], name="name") + + result = ser.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + ser = Series([np.nan, 3, 3, None, np.nan], dtype=object) + + result = ser.duplicated(keep=keep) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/methods/test_explode.py similarity index 98% rename from pandas/tests/series/test_explode.py rename to pandas/tests/series/methods/test_explode.py index 6262da6bdfabf..979199e1efc62 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_basic(): @@ -29,7 +29,7 @@ def test_mixed_type(): def test_empty(): - s = pd.Series() + s = pd.Series(dtype=object) result = s.explode() expected = s.copy() tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py new file mode 100644 index 
0000000000000..ca93e989ba6b5 --- /dev/null +++ b/pandas/tests/series/methods/test_isin.py @@ -0,0 +1,82 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import Series, date_range +import pandas._testing as tm + + +class TestSeriesIsIn: + def test_isin(self): + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) + + result = s.isin(["A", "C"]) + expected = Series([True, False, True, False, False, False, True, True]) + tm.assert_series_equal(result, expected) + + # GH#16012 + # This specific issue has to have a series over 1e6 in len, but the + # comparison array (in_list) must be large enough so that numpy doesn't + # do a manual masking trick that will avoid this issue altogether + s = Series(list("abcdefghijk" * 10 ** 5)) + # If numpy doesn't do the manual comparison/mask, these + # unorderable mixed types are what cause the exception in numpy + in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6 + + assert s.isin(in_list).sum() == 200000 + + def test_isin_with_string_scalar(self): + # GH#4763 + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) + msg = ( + r"only list-like objects are allowed to be passed to isin\(\)," + r" you passed a \[str\]" + ) + with pytest.raises(TypeError, match=msg): + s.isin("a") + + s = Series(["aaa", "b", "c"]) + with pytest.raises(TypeError, match=msg): + s.isin("aaa") + + def test_isin_with_i8(self): + # GH#5021 + + expected = Series([True, True, False, False, False]) + expected2 = Series([False, True, False, False, False]) + + # datetime64[ns] + s = Series(date_range("jan-01-2013", "jan-05-2013")) + + result = s.isin(s[0:2]) + tm.assert_series_equal(result, expected) + + result = s.isin(s[0:2].values) + tm.assert_series_equal(result, expected) + + # fails on dtype conversion in the first place + result = s.isin(s[0:2].values.astype("datetime64[D]")) + tm.assert_series_equal(result, expected) + + result = s.isin([s[1]]) + tm.assert_series_equal(result, expected2) + + result = s.isin([np.datetime64(s[1])]) + tm.assert_series_equal(result, expected2) + + result = s.isin(set(s[0:2])) + tm.assert_series_equal(result, expected) + + # timedelta64[ns] + s = Series(pd.to_timedelta(range(5), unit="d")) + result = s.isin(s[0:2]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) + def test_isin_empty(self, empty): + # see GH#16991 + s = Series(["a", "b"]) + expected = Series([False, False]) + + result = s.isin(empty) + tm.assert_series_equal(expected, result) diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py new file mode 100644 index 0000000000000..a029965c7394f --- /dev/null +++ b/pandas/tests/series/methods/test_nlargest.py @@ -0,0 +1,213 @@ +""" +Note: for naming purposes, most tests are title with as e.g. "test_nlargest_foo" +but are implicitly also testing nsmallest_foo. +""" +from itertools import product + +import numpy as np +import pytest + +import pandas as pd +from pandas import Series +import pandas._testing as tm + +main_dtypes = [ + "datetime", + "datetimetz", + "timedelta", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "uint8", + "uint16", + "uint32", + "uint64", +] + + +@pytest.fixture +def s_main_dtypes(): + """ + A DataFrame with many dtypes + + * datetime + * datetimetz + * timedelta + * [u]int{8,16,32,64} + * float{32,64} + + The columns are the name of the dtype. 
+ """ + df = pd.DataFrame( + { + "datetime": pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), + "datetimetz": pd.to_datetime( + ["2003", "2002", "2001", "2002", "2005"] + ).tz_localize("US/Eastern"), + "timedelta": pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), + } + ) + + for dtype in [ + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "uint8", + "uint16", + "uint32", + "uint64", + ]: + df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype) + + return df + + +@pytest.fixture(params=main_dtypes) +def s_main_dtypes_split(request, s_main_dtypes): + """Each series in s_main_dtypes.""" + return s_main_dtypes[request.param] + + +def assert_check_nselect_boundary(vals, dtype, method): + # helper function for 'test_boundary_{dtype}' tests + ser = Series(vals, dtype=dtype) + result = getattr(ser, method)(3) + expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1] + expected = ser.loc[expected_idxr] + tm.assert_series_equal(result, expected) + + +class TestSeriesNLargestNSmallest: + @pytest.mark.parametrize( + "r", + [ + Series([3.0, 2, 1, 2, "5"], dtype="object"), + Series([3.0, 2, 1, 2, 5], dtype="object"), + # not supported on some archs + # Series([3., 2, 1, 2, 5], dtype='complex256'), + Series([3.0, 2, 1, 2, 5], dtype="complex128"), + Series(list("abcde")), + Series(list("abcde"), dtype="category"), + ], + ) + def test_nlargest_error(self, r): + dt = r.dtype + msg = "Cannot use method 'n(larg|small)est' with dtype {dt}".format(dt=dt) + args = 2, len(r), 0, -1 + methods = r.nlargest, r.nsmallest + for method, arg in product(methods, args): + with pytest.raises(TypeError, match=msg): + method(arg) + + def test_nsmallest_nlargest(self, s_main_dtypes_split): + # float, int, datetime64 (use i8), timedelts64 (same), + # object that are numbers, object that are strings + ser = s_main_dtypes_split + + tm.assert_series_equal(ser.nsmallest(2), ser.iloc[[2, 1]]) + tm.assert_series_equal(ser.nsmallest(2, keep="last"), ser.iloc[[2, 3]]) + + empty = ser.iloc[0:0] + tm.assert_series_equal(ser.nsmallest(0), empty) + tm.assert_series_equal(ser.nsmallest(-1), empty) + tm.assert_series_equal(ser.nlargest(0), empty) + tm.assert_series_equal(ser.nlargest(-1), empty) + + tm.assert_series_equal(ser.nsmallest(len(ser)), ser.sort_values()) + tm.assert_series_equal(ser.nsmallest(len(ser) + 1), ser.sort_values()) + tm.assert_series_equal(ser.nlargest(len(ser)), ser.iloc[[4, 0, 1, 3, 2]]) + tm.assert_series_equal(ser.nlargest(len(ser) + 1), ser.iloc[[4, 0, 1, 3, 2]]) + + def test_nlargest_misc(self): + + ser = Series([3.0, np.nan, 1, 2, 5]) + tm.assert_series_equal(ser.nlargest(), ser.iloc[[4, 0, 3, 2]]) + tm.assert_series_equal(ser.nsmallest(), ser.iloc[[2, 3, 0, 4]]) + + msg = 'keep must be either "first", "last"' + with pytest.raises(ValueError, match=msg): + ser.nsmallest(keep="invalid") + with pytest.raises(ValueError, match=msg): + ser.nlargest(keep="invalid") + + # GH#15297 + ser = Series([1] * 5, index=[1, 2, 3, 4, 5]) + expected_first = Series([1] * 3, index=[1, 2, 3]) + expected_last = Series([1] * 3, index=[5, 4, 3]) + + result = ser.nsmallest(3) + tm.assert_series_equal(result, expected_first) + + result = ser.nsmallest(3, keep="last") + tm.assert_series_equal(result, expected_last) + + result = ser.nlargest(3) + tm.assert_series_equal(result, expected_first) + + result = ser.nlargest(3, keep="last") + tm.assert_series_equal(result, expected_last) + + @pytest.mark.parametrize("n", range(1, 5)) + def test_nlargest_n(self, n): + + # GH 13412 + ser = Series([1, 4, 3, 2], 
index=[0, 0, 1, 1]) + result = ser.nlargest(n) + expected = ser.sort_values(ascending=False).head(n) + tm.assert_series_equal(result, expected) + + result = ser.nsmallest(n) + expected = ser.sort_values().head(n) + tm.assert_series_equal(result, expected) + + def test_nlargest_boundary_integer(self, nselect_method, any_int_dtype): + # GH#21426 + dtype_info = np.iinfo(any_int_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + vals = [min_val, min_val + 1, max_val - 1, max_val] + assert_check_nselect_boundary(vals, any_int_dtype, nselect_method) + + def test_nlargest_boundary_float(self, nselect_method, float_dtype): + # GH#21426 + dtype_info = np.finfo(float_dtype) + min_val, max_val = dtype_info.min, dtype_info.max + min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=float_dtype) + vals = [min_val, min_2nd, max_2nd, max_val] + assert_check_nselect_boundary(vals, float_dtype, nselect_method) + + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) + def test_nlargest_boundary_datetimelike(self, nselect_method, dtype): + # GH#21426 + # use int64 bounds and +1 to min_val since true minimum is NaT + # (include min_val/NaT at end to maintain same expected_idxr) + dtype_info = np.iinfo("int64") + min_val, max_val = dtype_info.min, dtype_info.max + vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] + assert_check_nselect_boundary(vals, dtype, nselect_method) + + def test_nlargest_duplicate_keep_all_ties(self): + # see GH#16818 + ser = Series([10, 9, 8, 7, 7, 7, 7, 6]) + result = ser.nlargest(4, keep="all") + expected = Series([10, 9, 8, 7, 7, 7, 7]) + tm.assert_series_equal(result, expected) + + result = ser.nsmallest(2, keep="all") + expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data,expected", [([True, False], [True]), ([True, False, True, True], [True])] + ) + def test_nlargest_boolean(self, data, expected): + # GH#26154 : ensure True > False + ser = Series(data) + result = ser.nlargest(1) + expected = Series(expected) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py new file mode 100644 index 0000000000000..1efb57894f986 --- /dev/null +++ b/pandas/tests/series/methods/test_pct_change.py @@ -0,0 +1,79 @@ +import numpy as np +import pytest + +from pandas import Series, date_range +import pandas._testing as tm + + +class TestSeriesPctChange: + def test_pct_change(self, datetime_series): + rs = datetime_series.pct_change(fill_method=None) + tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) + + rs = datetime_series.pct_change(2) + filled = datetime_series.fillna(method="pad") + tm.assert_series_equal(rs, filled / filled.shift(2) - 1) + + rs = datetime_series.pct_change(fill_method="bfill", limit=1) + filled = datetime_series.fillna(method="bfill", limit=1) + tm.assert_series_equal(rs, filled / filled.shift(1) - 1) + + rs = datetime_series.pct_change(freq="5D") + filled = datetime_series.fillna(method="pad") + tm.assert_series_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) + + def test_pct_change_with_duplicate_axis(self): + # GH#28664 + common_idx = date_range("2019-11-14", periods=5, freq="D") + result = Series(range(5), common_idx).pct_change(freq="B") + + # the reason that the expected should be like this is documented at PR 28681 + expected = Series([np.NaN, np.inf, np.NaN, np.NaN, 3.0], common_idx) 
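+ # 2019-11-16/17 fall on a weekend, so shifting by one business day leaves no label for them to compare against (NaN); 2019-11-15 divides by the 0 from 2019-11-14 (inf); 2019-11-18 keeps the first of the duplicated shifted labels, giving 4 / 1 - 1 = 3.0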
+ + tm.assert_series_equal(result, expected) + + def test_pct_change_shift_over_nas(self): + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) + + chg = s.pct_change() + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + tm.assert_series_equal(chg, expected) + + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) + def test_pct_change_periods_freq( + self, freq, periods, fill_method, limit, datetime_series + ): + # GH#7292 + rs_freq = datetime_series.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = datetime_series.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_series_equal(rs_freq, rs_periods) + + empty_ts = Series(index=datetime_series.index, dtype=object) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) + tm.assert_series_equal(rs_freq, rs_periods) + + +@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) +def test_pct_change_with_duplicated_indices(fill_method): + # GH30463 + s = Series([np.nan, 1, 2, 3, 9, 18], index=["a", "b"] * 3) + result = s.pct_change(fill_method=fill_method) + expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/methods/test_quantile.py similarity index 97% rename from pandas/tests/series/test_quantile.py rename to pandas/tests/series/methods/test_quantile.py index 1a4a3f523cbbe..79f50afca658f 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import Index, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import Timestamp -import pandas.util.testing as tm class TestSeriesQuantile: @@ -67,7 +67,7 @@ def test_quantile_multi(self, datetime_series): result = datetime_series.quantile([]) expected = pd.Series( - [], name=datetime_series.name, index=Index([], dtype=float) + [], name=datetime_series.name, index=Index([], dtype=float), dtype="float64" ) tm.assert_series_equal(result, expected) @@ -104,7 +104,8 @@ def test_quantile_nan(self): assert result == expected # all nan/empty - cases = [Series([]), Series([np.nan, np.nan])] + s1 = Series([], dtype=object) + cases = [s1, Series([np.nan, np.nan])] for s in cases: res = s.quantile(0.5) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/methods/test_rank.py similarity index 99% rename from pandas/tests/series/test_rank.py rename to pandas/tests/series/methods/test_rank.py index 793e8b7da4965..3d4688c8274f9 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -8,8 +8,8 @@ import pandas.util._test_decorators as td from pandas import NaT, Series, Timestamp, date_range +import pandas._testing as tm from pandas.api.types import CategoricalDtype -import pandas.util.testing as tm class TestSeriesRank: @@ -203,8 +203,7 @@ def test_rank_signature(self): s = Series([0, 1]) s.rank(method="average") msg = ( - "No axis named average for object type" - " <class 'pandas.core.series.Series'>" + "No axis named average for object type <class 'pandas.core.series.Series'>" ) with pytest.raises(ValueError, match=msg): s.rank("average") diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/methods/test_replace.py
similarity index 84% rename from pandas/tests/series/test_replace.py rename to pandas/tests/series/methods/test_replace.py index e9d5a4b105a35..b20baa2836363 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesReplace: @@ -245,7 +245,10 @@ def test_replace_with_empty_dictlike(self): # GH 15289 s = pd.Series(list("abcd")) tm.assert_series_equal(s, s.replace(dict())) - tm.assert_series_equal(s, s.replace(pd.Series([]))) + + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + empty_series = pd.Series([]) + tm.assert_series_equal(s, s.replace(empty_series)) def test_replace_string_with_number(self): # GH 15743 @@ -293,6 +296,29 @@ def test_replace_categorical(self, categorical, numeric): expected = pd.Series(numeric) tm.assert_series_equal(expected, result, check_dtype=False) + def test_replace_categorical_single(self): + # GH 26988 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + s = pd.Series(dti) + c = s.astype("category") + + expected = c.copy() + expected = expected.cat.add_categories("foo") + expected[2] = "foo" + expected = expected.cat.remove_unused_categories() + assert c[2] != "foo" + + result = c.replace(c[2], "foo") + tm.assert_series_equal(expected, result) + assert c[2] != "foo" # ensure non-inplace call does not alter original + + c.replace(c[2], "foo", inplace=True) + tm.assert_series_equal(expected, c) + + first_value = c[0] + c.replace(c[1], c[0], inplace=True) + assert c[0] == c[1] == first_value # test replacing with existing value + def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError @@ -305,3 +331,34 @@ def test_replace_with_no_overflowerror(self): result = s.replace(["100000000000000000000"], [1]) expected = pd.Series([0, 1, "100000000000000000001"]) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ser, to_replace, exp", + [ + ([1, 2, 3], {1: 2, 2: 3, 3: 4}, [2, 3, 4]), + (["1", "2", "3"], {"1": "2", "2": "3", "3": "4"}, ["2", "3", "4"]), + ], + ) + def test_replace_commutative(self, ser, to_replace, exp): + # GH 16051 + # DataFrame.replace() overwrites when values are non-numeric + + series = pd.Series(ser) + + expected = pd.Series(exp) + result = series.replace(to_replace) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "ser, exp", [([1, 2, 3], [1, True, 3]), (["x", 2, 3], ["x", True, 3])] + ) + def test_replace_no_cast(self, ser, exp): + # GH 9113 + # BUG: replace int64 dtype with bool coerces to int64 + + series = pd.Series(ser) + result = series.replace(2, True) + expected = pd.Series(exp) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py new file mode 100644 index 0000000000000..7f0711a0f30d7 --- /dev/null +++ b/pandas/tests/series/methods/test_round.py @@ -0,0 +1,46 @@ +import numpy as np +import pytest + +from pandas import Series +import pandas._testing as tm + + +class TestSeriesRound: + def test_round(self, datetime_series): + datetime_series.index.name = "index_name" + result = datetime_series.round(2) + expected = Series( + np.round(datetime_series.values, 2), index=datetime_series.index, name="ts" + ) + tm.assert_series_equal(result, expected) + assert result.name == datetime_series.name + + def test_round_numpy(self): + # See GH#12600 
+ ser = Series([1.53, 1.36, 0.06]) + out = np.round(ser, decimals=0) + expected = Series([2.0, 1.0, 0.0]) + tm.assert_series_equal(out, expected) + + msg = "the 'out' parameter is not supported" + with pytest.raises(ValueError, match=msg): + np.round(ser, decimals=0, out=ser) + + def test_round_numpy_with_nan(self): + # See GH#14197 + ser = Series([1.53, np.nan, 0.06]) + with tm.assert_produces_warning(None): + result = ser.round() + expected = Series([2.0, np.nan, 0.0]) + tm.assert_series_equal(result, expected) + + def test_round_builtin(self): + ser = Series([1.123, 2.123, 3.123], index=range(3)) + result = round(ser) + expected_rounded0 = Series([1.0, 2.0, 3.0], index=range(3)) + tm.assert_series_equal(result, expected_rounded0) + + decimals = 2 + expected_rounded = Series([1.12, 2.12, 3.12], index=range(3)) + result = round(ser, decimals) + tm.assert_series_equal(result, expected_rounded) diff --git a/pandas/tests/series/methods/test_searchsorted.py b/pandas/tests/series/methods/test_searchsorted.py new file mode 100644 index 0000000000000..fd6c6f74a9136 --- /dev/null +++ b/pandas/tests/series/methods/test_searchsorted.py @@ -0,0 +1,55 @@ +import numpy as np + +from pandas import Series, Timestamp, date_range +import pandas._testing as tm +from pandas.api.types import is_scalar + + +class TestSeriesSearchSorted: + def test_searchsorted(self): + ser = Series([1, 2, 3]) + + result = ser.searchsorted(1, side="left") + assert is_scalar(result) + assert result == 0 + + result = ser.searchsorted(1, side="right") + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_numeric_dtypes_scalar(self): + ser = Series([1, 2, 90, 1000, 3e9]) + res = ser.searchsorted(30) + assert is_scalar(res) + assert res == 2 + + res = ser.searchsorted([30]) + exp = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_numeric_dtypes_vector(self): + ser = Series([1, 2, 90, 1000, 3e9]) + res = ser.searchsorted([91, 2e6]) + exp = np.array([3, 4], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_datetime64_scalar(self): + ser = Series(date_range("20120101", periods=10, freq="2D")) + val = Timestamp("20120102") + res = ser.searchsorted(val) + assert is_scalar(res) + assert res == 1 + + def test_searchsorted_datetime64_list(self): + ser = Series(date_range("20120101", periods=10, freq="2D")) + vals = [Timestamp("20120102"), Timestamp("20120104")] + res = ser.searchsorted(vals) + exp = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) + + def test_searchsorted_sorter(self): + # GH8490 + ser = Series([3, 1, 2]) + res = ser.searchsorted([0, 3], sorter=np.argsort(ser)) + exp = np.array([0, 2], dtype=np.intp) + tm.assert_numpy_array_equal(res, exp) diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py new file mode 100644 index 0000000000000..8256e2f33b936 --- /dev/null +++ b/pandas/tests/series/methods/test_shift.py @@ -0,0 +1,265 @@ +import numpy as np +import pytest + +from pandas.errors import NullFrequencyError + +import pandas as pd +from pandas import ( + DatetimeIndex, + Index, + NaT, + Series, + TimedeltaIndex, + date_range, + offsets, +) +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +class TestShift: + def test_shift(self, datetime_series): + shifted = datetime_series.shift(1) + unshifted = shifted.shift(-1) + + tm.assert_index_equal(shifted.index, datetime_series.index) + tm.assert_index_equal(unshifted.index, 
datetime_series.index) + tm.assert_numpy_array_equal( + unshifted.dropna().values, datetime_series.values[:-1] + ) + + offset = BDay() + shifted = datetime_series.shift(1, freq=offset) + unshifted = shifted.shift(-1, freq=offset) + + tm.assert_series_equal(unshifted, datetime_series) + + unshifted = datetime_series.shift(0, freq=offset) + tm.assert_series_equal(unshifted, datetime_series) + + shifted = datetime_series.shift(1, freq="B") + unshifted = shifted.shift(-1, freq="B") + + tm.assert_series_equal(unshifted, datetime_series) + + # corner case + unshifted = datetime_series.shift(0) + tm.assert_series_equal(unshifted, datetime_series) + + # Shifting with PeriodIndex + ps = tm.makePeriodSeries() + shifted = ps.shift(1) + unshifted = shifted.shift(-1) + tm.assert_index_equal(shifted.index, ps.index) + tm.assert_index_equal(unshifted.index, ps.index) + tm.assert_numpy_array_equal(unshifted.dropna().values, ps.values[:-1]) + + shifted2 = ps.shift(1, "B") + shifted3 = ps.shift(1, BDay()) + tm.assert_series_equal(shifted2, shifted3) + tm.assert_series_equal(ps, shifted2.shift(-1, "B")) + + msg = "Given freq D does not match PeriodIndex freq B" + with pytest.raises(ValueError, match=msg): + ps.shift(freq="D") + + # legacy support + shifted4 = ps.shift(1, freq="B") + tm.assert_series_equal(shifted2, shifted4) + + shifted5 = ps.shift(1, freq=BDay()) + tm.assert_series_equal(shifted5, shifted4) + + # 32-bit taking + # GH#8129 + index = date_range("2000-01-01", periods=5) + for dtype in ["int32", "int64"]: + s1 = Series(np.arange(5, dtype=dtype), index=index) + p = s1.iloc[1] + result = s1.shift(periods=p) + expected = Series([np.nan, 0, 1, 2, 3], index=index) + tm.assert_series_equal(result, expected) + + # GH#8260 + # with tz + s = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) + result = s - s.shift() + + exp = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + tm.assert_series_equal(result, exp) + + # incompat tz + s2 = Series(date_range("2000-01-01 09:00:00", periods=5, tz="CET"), name="foo") + msg = "DatetimeArray subtraction must have the same timezones or no timezones" + with pytest.raises(TypeError, match=msg): + s - s2 + + def test_shift2(self): + ts = Series( + np.random.randn(5), index=date_range("1/1/2000", periods=5, freq="H") + ) + + result = ts.shift(1, freq="5T") + exp_index = ts.index.shift(1, freq="5T") + tm.assert_index_equal(result.index, exp_index) + + # GH#1063, multiple of same base + result = ts.shift(1, freq="4H") + exp_index = ts.index + offsets.Hour(4) + tm.assert_index_equal(result.index, exp_index) + + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) + msg = "Cannot shift with no freq" + with pytest.raises(NullFrequencyError, match=msg): + idx.shift(1) + + def test_shift_fill_value(self): + # GH#24128 + ts = Series( + [1.0, 2.0, 3.0, 4.0, 5.0], index=date_range("1/1/2000", periods=5, freq="H") + ) + + exp = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("1/1/2000", periods=5, freq="H") + ) + # check that fill value works + result = ts.shift(1, fill_value=0.0) + tm.assert_series_equal(result, exp) + + exp = Series( + [0.0, 0.0, 1.0, 2.0, 3.0], index=date_range("1/1/2000", periods=5, freq="H") + ) + result = ts.shift(2, fill_value=0.0) + tm.assert_series_equal(result, exp) + + ts = pd.Series([1, 2, 3]) + res = ts.shift(2, fill_value=0) + assert res.dtype == ts.dtype + + def test_shift_categorical_fill_value(self): + ts = pd.Series(["a", "b", "c", "d"], dtype="category") + res = ts.shift(1, 
fill_value="a") + expected = pd.Series( + pd.Categorical( + ["a", "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False + ) + ) + tm.assert_equal(res, expected) + + # check for incorrect fill_value + msg = "'fill_value=f' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): + ts.shift(1, fill_value="f") + + def test_shift_dst(self): + # GH#13926 + dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") + s = Series(dates) + + res = s.shift(0) + tm.assert_series_equal(res, s) + assert res.dtype == "datetime64[ns, US/Eastern]" + + res = s.shift(1) + exp_vals = [NaT] + dates.astype(object).values.tolist()[:9] + exp = Series(exp_vals) + tm.assert_series_equal(res, exp) + assert res.dtype == "datetime64[ns, US/Eastern]" + + res = s.shift(-2) + exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT] + exp = Series(exp_vals) + tm.assert_series_equal(res, exp) + assert res.dtype == "datetime64[ns, US/Eastern]" + + for ex in [10, -10, 20, -20]: + res = s.shift(ex) + exp = Series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") + tm.assert_series_equal(res, exp) + assert res.dtype == "datetime64[ns, US/Eastern]" + + def test_tshift(self, datetime_series): + # PeriodIndex + ps = tm.makePeriodSeries() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) + + tm.assert_series_equal(unshifted, ps) + + shifted2 = ps.tshift(freq="B") + tm.assert_series_equal(shifted, shifted2) + + shifted3 = ps.tshift(freq=BDay()) + tm.assert_series_equal(shifted, shifted3) + + msg = "Given freq M does not match PeriodIndex freq B" + with pytest.raises(ValueError, match=msg): + ps.tshift(freq="M") + + # DatetimeIndex + shifted = datetime_series.tshift(1) + unshifted = shifted.tshift(-1) + + tm.assert_series_equal(datetime_series, unshifted) + + shifted2 = datetime_series.tshift(freq=datetime_series.index.freq) + tm.assert_series_equal(shifted, shifted2) + + inferred_ts = Series( + datetime_series.values, Index(np.asarray(datetime_series.index)), name="ts" + ) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + tm.assert_series_equal(shifted, datetime_series.tshift(1)) + tm.assert_series_equal(unshifted, inferred_ts) + + no_freq = datetime_series[[0, 5, 7]] + msg = "Freq was not given and was not set in the index" + with pytest.raises(ValueError, match=msg): + no_freq.tshift() + + def test_shift_int(self, datetime_series): + ts = datetime_series.astype(int) + shifted = ts.shift(1) + expected = ts.astype(float).shift(1) + tm.assert_series_equal(shifted, expected) + + def test_shift_object_non_scalar_fill(self): + # shift requires scalar fill_value except for object dtype + ser = Series(range(3)) + with pytest.raises(ValueError, match="fill_value must be a scalar"): + ser.shift(1, fill_value=[]) + + df = ser.to_frame() + with pytest.raises(ValueError, match="fill_value must be a scalar"): + df.shift(1, fill_value=np.arange(3)) + + obj_ser = ser.astype(object) + result = obj_ser.shift(1, fill_value={}) + assert result[0] == {} + + obj_df = obj_ser.to_frame() + result = obj_df.shift(1, fill_value={}) + assert result.iloc[0, 0] == {} + + def test_shift_categorical(self): + # GH#9416 + s = pd.Series(["a", "b", "c", "d"], dtype="category") + + tm.assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna()) + + sp1 = s.shift(1) + tm.assert_index_equal(s.index, sp1.index) + assert np.all(sp1.values.codes[:1] == -1) + assert np.all(s.values.codes[:-1] == sp1.values.codes[1:]) + + sn2 = s.shift(-2) + tm.assert_index_equal(s.index, 
sn2.index) + assert np.all(sn2.values.codes[-2:] == -1) + assert np.all(s.values.codes[2:] == sn2.values.codes[:-2]) + + tm.assert_index_equal(s.values.categories, sp1.values.categories) + tm.assert_index_equal(s.values.categories, sn2.values.categories) diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py new file mode 100644 index 0000000000000..6fa4eeaee34c0 --- /dev/null +++ b/pandas/tests/series/methods/test_sort_index.py @@ -0,0 +1,168 @@ +import random + +import numpy as np +import pytest + +from pandas import IntervalIndex, MultiIndex, Series +import pandas._testing as tm + + +class TestSeriesSortIndex: + def test_sort_index(self, datetime_series): + rindex = list(datetime_series.index) + random.shuffle(rindex) + + random_order = datetime_series.reindex(rindex) + sorted_series = random_order.sort_index() + tm.assert_series_equal(sorted_series, datetime_series) + + # descending + sorted_series = random_order.sort_index(ascending=False) + tm.assert_series_equal( + sorted_series, datetime_series.reindex(datetime_series.index[::-1]) + ) + + # compat on level + sorted_series = random_order.sort_index(level=0) + tm.assert_series_equal(sorted_series, datetime_series) + + # compat on axis + sorted_series = random_order.sort_index(axis=0) + tm.assert_series_equal(sorted_series, datetime_series) + + msg = "No axis named 1 for object type " + with pytest.raises(ValueError, match=msg): + random_order.sort_values(axis=1) + + sorted_series = random_order.sort_index(level=0, axis=0) + tm.assert_series_equal(sorted_series, datetime_series) + + with pytest.raises(ValueError, match=msg): + random_order.sort_index(level=0, axis=1) + + def test_sort_index_inplace(self, datetime_series): + + # For GH#11402 + rindex = list(datetime_series.index) + random.shuffle(rindex) + + # descending + random_order = datetime_series.reindex(rindex) + result = random_order.sort_index(ascending=False, inplace=True) + + assert result is None + tm.assert_series_equal( + random_order, datetime_series.reindex(datetime_series.index[::-1]) + ) + + # ascending + random_order = datetime_series.reindex(rindex) + result = random_order.sort_index(ascending=True, inplace=True) + + assert result is None + tm.assert_series_equal(random_order, datetime_series) + + def test_sort_index_level(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + res = s.sort_index(level="A") + tm.assert_series_equal(backwards, res) + + res = s.sort_index(level=["A", "B"]) + tm.assert_series_equal(backwards, res) + + res = s.sort_index(level="A", sort_remaining=False) + tm.assert_series_equal(s, res) + + res = s.sort_index(level=["A", "B"], sort_remaining=False) + tm.assert_series_equal(s, res) + + @pytest.mark.parametrize("level", ["A", 0]) # GH#21052 + def test_sort_index_multiindex(self, level): + + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + # implicit sort_remaining=True + res = s.sort_index(level=level) + tm.assert_series_equal(backwards, res) + + # GH#13496 + # sort has no effect without remaining lvls + res = s.sort_index(level=level, sort_remaining=False) + tm.assert_series_equal(s, res) + + def test_sort_index_kind(self): + # GH#14444 & GH#13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3], dtype=object) + expected_series = Series(index=[1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = 
series.sort_index(kind="mergesort") + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="quicksort") + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="heapsort") + tm.assert_series_equal(expected_series, index_sorted_series) + + def test_sort_index_na_position(self): + series = Series(index=[3, 2, 1, 4, 3, np.nan], dtype=object) + expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(na_position="first") + tm.assert_series_equal(expected_series_first, index_sorted_series) + + expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan], dtype=object) + + index_sorted_series = series.sort_index(na_position="last") + tm.assert_series_equal(expected_series_last, index_sorted_series) + + def test_sort_index_intervals(self): + s = Series( + [np.nan, 1, 2, 3], IntervalIndex.from_arrays([0, 1, 2, 3], [1, 2, 3, 4]) + ) + + result = s.sort_index() + expected = s + tm.assert_series_equal(result, expected) + + result = s.sort_index(ascending=False) + expected = Series( + [3, 2, 1, np.nan], IntervalIndex.from_arrays([3, 2, 1, 0], [4, 3, 2, 1]) + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_list, sorted_list, ascending, ignore_index, output_index", + [ + ([2, 3, 6, 1], [2, 3, 6, 1], True, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [2, 3, 6, 1], True, False, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, False, [3, 2, 1, 0]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_list, sorted_list, ascending, ignore_index, output_index + ): + # GH 30114 + ser = Series(original_list) + expected = Series(sorted_list, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_ser = ser.copy() + result_ser.sort_index(**kwargs) + else: + result_ser = ser.sort_index(**kwargs) + + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, Series(original_list)) diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/methods/test_sort_values.py similarity index 57% rename from pandas/tests/series/test_sorting.py rename to pandas/tests/series/methods/test_sort_values.py index 8039b133cae10..caa2abd61af6a 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -1,13 +1,11 @@ -import random - import numpy as np import pytest -from pandas import Categorical, DataFrame, IntervalIndex, MultiIndex, Series -import pandas.util.testing as tm +from pandas import Categorical, DataFrame, Series +import pandas._testing as tm -class TestSeriesSorting: +class TestSeriesSortValues: def test_sort_values(self, datetime_series): # check indexes are reordered corresponding with the values @@ -73,128 +71,18 @@ def test_sort_values(self, datetime_series): ts.index, datetime_series.sort_values(ascending=False).index ) - # GH 5856/5853 + # GH#5856/5853 # Series.sort_values operating on a view df = DataFrame(np.random.randn(10, 4)) s = df.iloc[:, 0] msg = ( - "This Series is a view of some other array, to sort in-place" - " you must create a copy" + "This Series is a view of some other array, to sort in-place " + "you must create a copy" ) with pytest.raises(ValueError, match=msg): s.sort_values(inplace=True) - def 
test_sort_index(self, datetime_series): - rindex = list(datetime_series.index) - random.shuffle(rindex) - - random_order = datetime_series.reindex(rindex) - sorted_series = random_order.sort_index() - tm.assert_series_equal(sorted_series, datetime_series) - - # descending - sorted_series = random_order.sort_index(ascending=False) - tm.assert_series_equal( - sorted_series, datetime_series.reindex(datetime_series.index[::-1]) - ) - - # compat on level - sorted_series = random_order.sort_index(level=0) - tm.assert_series_equal(sorted_series, datetime_series) - - # compat on axis - sorted_series = random_order.sort_index(axis=0) - tm.assert_series_equal(sorted_series, datetime_series) - - msg = "No axis named 1 for object type " - with pytest.raises(ValueError, match=msg): - random_order.sort_values(axis=1) - - sorted_series = random_order.sort_index(level=0, axis=0) - tm.assert_series_equal(sorted_series, datetime_series) - - with pytest.raises(ValueError, match=msg): - random_order.sort_index(level=0, axis=1) - - def test_sort_index_inplace(self, datetime_series): - - # For #11402 - rindex = list(datetime_series.index) - random.shuffle(rindex) - - # descending - random_order = datetime_series.reindex(rindex) - result = random_order.sort_index(ascending=False, inplace=True) - - assert result is None - tm.assert_series_equal( - random_order, datetime_series.reindex(datetime_series.index[::-1]) - ) - - # ascending - random_order = datetime_series.reindex(rindex) - result = random_order.sort_index(ascending=True, inplace=True) - - assert result is None - tm.assert_series_equal(random_order, datetime_series) - - @pytest.mark.parametrize("level", ["A", 0]) # GH 21052 - def test_sort_index_multiindex(self, level): - - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) - s = Series([1, 2], mi) - backwards = s.iloc[[1, 0]] - - # implicit sort_remaining=True - res = s.sort_index(level=level) - tm.assert_series_equal(backwards, res) - - # GH13496 - # sort has no effect without remaining lvls - res = s.sort_index(level=level, sort_remaining=False) - tm.assert_series_equal(s, res) - - def test_sort_index_kind(self): - # GH #14444 & #13589: Add support for sort algo choosing - series = Series(index=[3, 2, 1, 4, 3]) - expected_series = Series(index=[1, 2, 3, 3, 4]) - - index_sorted_series = series.sort_index(kind="mergesort") - tm.assert_series_equal(expected_series, index_sorted_series) - - index_sorted_series = series.sort_index(kind="quicksort") - tm.assert_series_equal(expected_series, index_sorted_series) - - index_sorted_series = series.sort_index(kind="heapsort") - tm.assert_series_equal(expected_series, index_sorted_series) - - def test_sort_index_na_position(self): - series = Series(index=[3, 2, 1, 4, 3, np.nan]) - - expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4]) - index_sorted_series = series.sort_index(na_position="first") - tm.assert_series_equal(expected_series_first, index_sorted_series) - - expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan]) - index_sorted_series = series.sort_index(na_position="last") - tm.assert_series_equal(expected_series_last, index_sorted_series) - - def test_sort_index_intervals(self): - s = Series( - [np.nan, 1, 2, 3], IntervalIndex.from_arrays([0, 1, 2, 3], [1, 2, 3, 4]) - ) - - result = s.sort_index() - expected = s - tm.assert_series_equal(result, expected) - - result = s.sort_index(ascending=False) - expected = Series( - [3, 2, 1, np.nan], IntervalIndex.from_arrays([3, 2, 1, 0], [4, 3, 2, 1]) - ) - 
tm.assert_series_equal(result, expected) - def test_sort_values_categorical(self): c = Categorical(["a", "b", "b", "a"], ordered=False) @@ -252,7 +140,7 @@ def test_sort_values_categorical(self): df.sort_values(by=["unsort"], ascending=False) # multi-columns sort - # GH 7848 + # GH#7848 df = DataFrame( {"id": [6, 5, 4, 3, 2, 1], "raw_grade": ["a", "b", "b", "a", "a", "e"]} ) @@ -268,3 +156,28 @@ def test_sort_values_categorical(self): result = df.sort_values(by=["grade", "id"]) expected = df.iloc[[2, 1, 5, 4, 3, 0]] tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_list, sorted_list, ignore_index, output_index", + [ + ([2, 3, 6, 1], [6, 3, 2, 1], True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [6, 3, 2, 1], False, [2, 1, 0, 3]), + ], + ) + def test_sort_values_ignore_index( + self, inplace, original_list, sorted_list, ignore_index, output_index + ): + # GH 30114 + ser = Series(original_list) + expected = Series(sorted_list, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} + + if inplace: + result_ser = ser.copy() + result_ser.sort_values(ascending=False, **kwargs) + else: + result_ser = ser.sort_values(ascending=False, **kwargs) + + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, Series(original_list)) diff --git a/pandas/tests/series/methods/test_to_dict.py b/pandas/tests/series/methods/test_to_dict.py new file mode 100644 index 0000000000000..2fbf3e8d39cf3 --- /dev/null +++ b/pandas/tests/series/methods/test_to_dict.py @@ -0,0 +1,20 @@ +import collections + +import pytest + +from pandas import Series +import pandas._testing as tm + + +class TestSeriesToDict: + @pytest.mark.parametrize( + "mapping", (dict, collections.defaultdict(list), collections.OrderedDict) + ) + def test_to_dict(self, mapping, datetime_series): + # GH#16122 + tm.assert_series_equal( + Series(datetime_series.to_dict(mapping), name="ts"), datetime_series + ) + from_method = Series(datetime_series.to_dict(collections.Counter)) + from_constructor = Series(collections.Counter(datetime_series.items())) + tm.assert_series_equal(from_method, from_constructor) diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py new file mode 100644 index 0000000000000..d4e2890ed8bf0 --- /dev/null +++ b/pandas/tests/series/methods/test_truncate.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +from pandas.tseries.offsets import BDay + + +class TestTruncate: + def test_truncate(self, datetime_series): + offset = BDay() + + ts = datetime_series[::3] + + start, end = datetime_series.index[3], datetime_series.index[6] + start_missing, end_missing = datetime_series.index[2], datetime_series.index[7] + + # neither specified + truncated = ts.truncate() + tm.assert_series_equal(truncated, ts) + + # both specified + expected = ts[1:3] + + truncated = ts.truncate(start, end) + tm.assert_series_equal(truncated, expected) + + truncated = ts.truncate(start_missing, end_missing) + tm.assert_series_equal(truncated, expected) + + # start specified + expected = ts[1:] + + truncated = ts.truncate(before=start) + tm.assert_series_equal(truncated, expected) + + truncated = ts.truncate(before=start_missing) + tm.assert_series_equal(truncated, expected) + + # end specified + expected = ts[:3] + + truncated = ts.truncate(after=end) + tm.assert_series_equal(truncated, expected) + + truncated = 
ts.truncate(after=end_missing) + tm.assert_series_equal(truncated, expected) + + # corner case, empty series returned + truncated = ts.truncate(after=datetime_series.index[0] - offset) + assert len(truncated) == 0 + + truncated = ts.truncate(before=datetime_series.index[-1] + offset) + assert len(truncated) == 0 + + msg = "Truncate: 1999-12-31 00:00:00 must be after 2000-02-14 00:00:00" + with pytest.raises(ValueError, match=msg): + ts.truncate( + before=datetime_series.index[-1] + offset, + after=datetime_series.index[0] - offset, + ) + + def test_truncate_nonsortedindex(self): + # GH#17935 + + s = pd.Series(["a", "b", "c", "d", "e"], index=[5, 3, 2, 9, 0]) + msg = "truncate requires a sorted index" + + with pytest.raises(ValueError, match=msg): + s.truncate(before=3, after=9) + + rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") + ts = pd.Series(np.random.randn(len(rng)), index=rng) + msg = "truncate requires a sorted index" + + with pytest.raises(ValueError, match=msg): + ts.sort_values(ascending=False).truncate(before="2011-11", after="2011-12") diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py new file mode 100644 index 0000000000000..fdb35befeb0c2 --- /dev/null +++ b/pandas/tests/series/methods/test_value_counts.py @@ -0,0 +1,179 @@ +import numpy as np + +import pandas as pd +from pandas import Categorical, CategoricalIndex, Series +import pandas._testing as tm + + +class TestSeriesValueCounts: + def test_value_counts_datetime(self): + # most dtypes are tested in tests/base + values = [ + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 10:00"), + pd.Timestamp("2011-01-01 11:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 11:00"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] + ) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check DatetimeIndex outputs the same result + idx = pd.DatetimeIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_datetime_tz(self): + values = [ + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], + tz="US/Eastern", + ) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + idx = pd.DatetimeIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_period(self): + values = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-01", freq="M"), + 
pd.Period("2011-01", freq="M"), + pd.Period("2011-03", freq="M"), + ] + + exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check DatetimeIndex outputs the same result + idx = pd.PeriodIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical_ordered(self): + # most dtypes are tested in tests/base + values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) + + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check CategoricalIndex outputs the same result + idx = pd.CategoricalIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical_not_ordered(self): + values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) + + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + ser = pd.Series(values, name="xxx") + tm.assert_series_equal(ser.value_counts(), exp) + # check CategoricalIndex outputs the same result + idx = pd.CategoricalIndex(values, name="xxx") + tm.assert_series_equal(idx.value_counts(), exp) + + # normalize + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") + tm.assert_series_equal(ser.value_counts(normalize=True), exp) + tm.assert_series_equal(idx.value_counts(normalize=True), exp) + + def test_value_counts_categorical(self): + # GH#12835 + cats = Categorical(list("abcccb"), categories=list("cabd")) + ser = Series(cats, name="xxx") + res = ser.value_counts(sort=False) + + exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) + exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) + tm.assert_series_equal(res, exp) + + res = ser.value_counts(sort=True) + + exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) + exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) + tm.assert_series_equal(res, exp) + + # check object dtype handles the Series.name as the same + # (tested in tests/base) + ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx") + res = ser.value_counts() + exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) + tm.assert_series_equal(res, exp) + + def test_value_counts_categorical_with_nan(self): + # see GH#9443 + + # sanity check + ser = Series(["a", "b", "a"], dtype="category") + exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + # same Series via two different constructions --> same behaviour + series = [ + Series(["a", "b", None, "a", None, None], dtype="category"), + Series( + Categorical(["a", "b", None, "a", None, None], categories=["a", "b"]) + ), + ] + + for ser in 
series: + # None is a NaN value, so we exclude its count here + exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) + res = ser.value_counts(dropna=True) + tm.assert_series_equal(res, exp) + + # we don't exclude the count of None and sort by counts + exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"])) + res = ser.value_counts(dropna=False) + tm.assert_series_equal(res, exp) + + # When we aren't sorting by counts, and np.nan isn't a + # category, it should be last. + exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) + res = ser.value_counts(dropna=False, sort=False) + tm.assert_series_equal(res, exp) diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 5d74ad95be90d..628c66583535d 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame, Index, MultiIndex, RangeIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesAlterAxes: @@ -19,8 +19,8 @@ def test_setindex(self, string_series): # wrong length msg = ( - "Length mismatch: Expected axis has 30 elements, new" - " values have 29 elements" + "Length mismatch: Expected axis has 30 elements, " + "new values have 29 elements" ) with pytest.raises(ValueError, match=msg): string_series.index = np.arange(len(string_series) - 1) @@ -83,8 +83,9 @@ def test_rename_axis_supported(self): s = Series(range(5)) s.rename({}, axis=0) s.rename({}, axis="index") - with pytest.raises(ValueError, match="No axis named 5"): - s.rename({}, axis=5) + # TODO: clean up shared index validation + # with pytest.raises(ValueError, match="No axis named 5"): + # s.rename({}, axis=5) def test_set_name_attribute(self): s = Series([1, 2, 3]) @@ -233,7 +234,7 @@ def test_reorder_levels(self): def test_rename_axis_mapper(self): # GH 19978 mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) - s = Series([i for i in range(len(mi))], index=mi) + s = Series(list(range(len(mi))), index=mi) result = s.rename_axis(index={"ll": "foo"}) assert result.index.names == ["foo", "nn"] @@ -322,17 +323,6 @@ def test_set_axis_inplace(self): with pytest.raises(ValueError, match="No axis named"): s.set_axis(list("abcd"), axis=axis, inplace=False) - def test_set_axis_prior_to_deprecation_signature(self): - s = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") - - expected = s.copy() - expected.index = list("abcd") - - for axis in [0, "index"]: - with tm.assert_produces_warning(FutureWarning): - result = s.set_axis(0, list("abcd"), inplace=False) - tm.assert_series_equal(result, expected) - def test_reset_index_drop_errors(self): # GH 20925 diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 457c976137c11..c29bd3ea0cb7d 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1,559 +1,22 @@ -from itertools import product import operator import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p18 import pandas.util._test_decorators as td import pandas as pd -from pandas import ( - Categorical, - CategoricalIndex, - DataFrame, - Series, - date_range, - isna, - notna, -) -from pandas.api.types import is_scalar -from pandas.core.index import MultiIndex -from pandas.core.indexes.datetimes import Timestamp -from pandas.core.indexes.timedeltas import TimedeltaIndex -import pandas.util.testing as tm +from pandas import DataFrame, MultiIndex, 
Series +import pandas._testing as tm class TestSeriesAnalytics: - def test_describe(self): - s = Series([0, 1, 2, 3, 4], name="int_data") - result = s.describe() - expected = Series( - [5, 2, s.std(), 0, 1, 2, 3, 4], - name="int_data", - index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], - ) - tm.assert_series_equal(result, expected) - - s = Series([True, True, False, False, False], name="bool_data") - result = s.describe() - expected = Series( - [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"] - ) - tm.assert_series_equal(result, expected) - - s = Series(["a", "a", "b", "c", "d"], name="str_data") - result = s.describe() - expected = Series( - [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"] - ) - tm.assert_series_equal(result, expected) - - def test_describe_empty_object(self): - # https://github.com/pandas-dev/pandas/issues/27183 - s = pd.Series([None, None], dtype=object) - result = s.describe() - expected = pd.Series( - [0, 0, np.nan, np.nan], - dtype=object, - index=["count", "unique", "top", "freq"], - ) - tm.assert_series_equal(result, expected) - - result = s[:0].describe() - tm.assert_series_equal(result, expected) - # ensure NaN, not None - assert np.isnan(result.iloc[2]) - assert np.isnan(result.iloc[3]) - - def test_describe_with_tz(self, tz_naive_fixture): - # GH 21332 - tz = tz_naive_fixture - name = str(tz_naive_fixture) - start = Timestamp(2018, 1, 1) - end = Timestamp(2018, 1, 5) - s = Series(date_range(start, end, tz=tz), name=name) - result = s.describe() - expected = Series( - [ - 5, - 5, - s.value_counts().index[0], - 1, - start.tz_localize(tz), - end.tz_localize(tz), - ], - name=name, - index=["count", "unique", "top", "freq", "first", "last"], - ) - tm.assert_series_equal(result, expected) - - def test_argsort(self, datetime_series): - self._check_accum_op("argsort", datetime_series, check_dtype=False) - argsorted = datetime_series.argsort() - assert issubclass(argsorted.dtype.type, np.integer) - - # GH 2967 (introduced bug in 0.11-dev I think) - s = Series([Timestamp("201301{i:02d}".format(i=i)) for i in range(1, 6)]) - assert s.dtype == "datetime64[ns]" - shifted = s.shift(-1) - assert shifted.dtype == "datetime64[ns]" - assert isna(shifted[4]) - - result = s.argsort() - expected = Series(range(5), dtype="int64") - tm.assert_series_equal(result, expected) - - result = shifted.argsort() - expected = Series(list(range(4)) + [-1], dtype="int64") - tm.assert_series_equal(result, expected) - - def test_argsort_stable(self): - s = Series(np.random.randint(0, 100, size=10000)) - mindexer = s.argsort(kind="mergesort") - qindexer = s.argsort() - - mexpected = np.argsort(s.values, kind="mergesort") - qexpected = np.argsort(s.values, kind="quicksort") - - tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False) - tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) - msg = ( - r"ndarray Expected type ," - r" found instead" - ) - with pytest.raises(AssertionError, match=msg): - tm.assert_numpy_array_equal(qindexer, mindexer) - - def test_cumsum(self, datetime_series): - self._check_accum_op("cumsum", datetime_series) - - def test_cumprod(self, datetime_series): - self._check_accum_op("cumprod", datetime_series) - - def test_cummin(self, datetime_series): - tm.assert_numpy_array_equal( - datetime_series.cummin().values, - np.minimum.accumulate(np.array(datetime_series)), - ) - ts = datetime_series.copy() - ts[::2] = np.NaN - result = ts.cummin()[1::2] - expected = 
np.minimum.accumulate(ts.dropna()) - - tm.assert_series_equal(result, expected) - - def test_cummax(self, datetime_series): - tm.assert_numpy_array_equal( - datetime_series.cummax().values, - np.maximum.accumulate(np.array(datetime_series)), - ) - ts = datetime_series.copy() - ts[::2] = np.NaN - result = ts.cummax()[1::2] - expected = np.maximum.accumulate(ts.dropna()) - - tm.assert_series_equal(result, expected) - - @pytest.mark.xfail( - not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" - ) - def test_cummin_datetime64(self): - s = pd.Series( - pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) - ) - - expected = pd.Series( - pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-1"]) - ) - result = s.cummin(skipna=True) - tm.assert_series_equal(expected, result) - - expected = pd.Series( - pd.to_datetime( - ["NaT", "2000-1-2", "2000-1-2", "2000-1-1", "2000-1-1", "2000-1-1"] - ) - ) - result = s.cummin(skipna=False) - tm.assert_series_equal(expected, result) - - @pytest.mark.xfail( - not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" - ) - def test_cummax_datetime64(self): - s = pd.Series( - pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) - ) - - expected = pd.Series( - pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-2", "NaT", "2000-1-3"]) - ) - result = s.cummax(skipna=True) - tm.assert_series_equal(expected, result) - - expected = pd.Series( - pd.to_datetime( - ["NaT", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-3"] - ) - ) - result = s.cummax(skipna=False) - tm.assert_series_equal(expected, result) - - @pytest.mark.xfail( - not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" - ) - def test_cummin_timedelta64(self): - s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) - - expected = pd.Series( - pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "1 min"]) - ) - result = s.cummin(skipna=True) - tm.assert_series_equal(expected, result) - - expected = pd.Series( - pd.to_timedelta(["NaT", "2 min", "2 min", "1 min", "1 min", "1 min"]) - ) - result = s.cummin(skipna=False) - tm.assert_series_equal(expected, result) - - @pytest.mark.xfail( - not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" - ) - def test_cummax_timedelta64(self): - s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) - - expected = pd.Series( - pd.to_timedelta(["NaT", "2 min", "NaT", "2 min", "NaT", "3 min"]) - ) - result = s.cummax(skipna=True) - tm.assert_series_equal(expected, result) - - expected = pd.Series( - pd.to_timedelta(["NaT", "2 min", "2 min", "2 min", "2 min", "3 min"]) - ) - result = s.cummax(skipna=False) - tm.assert_series_equal(expected, result) - - def test_np_diff(self): - pytest.skip("skipping due to Series no longer being an ndarray") - - # no longer works as the return type of np.diff is now nd.array - s = Series(np.arange(5)) - - r = np.diff(s) - tm.assert_series_equal(Series([np.nan, 0, 0, 0, np.nan]), r) - - def test_int_diff(self): - # int dtype - a = 10000000000000000 - b = a + 1 - s = Series([a, b]) - - result = s.diff() - assert result[1] == 1 - - def test_tz_diff(self): - # Combined datetime diff, normal diff and boolean diff test - ts = tm.makeTimeSeries(name="ts") - ts.diff() - - # neg n - result = ts.diff(-1) - expected = ts - ts.shift(-1) - tm.assert_series_equal(result, expected) - - # 0 - result = ts.diff(0) - expected = ts - ts - 
tm.assert_series_equal(result, expected) - - # datetime diff (GH3100) - s = Series(date_range("20130102", periods=5)) - result = s.diff() - expected = s - s.shift(1) - tm.assert_series_equal(result, expected) - - # timedelta diff - result = result - result.shift(1) # previous result - expected = expected.diff() # previously expected - tm.assert_series_equal(result, expected) - - # with tz - s = Series( - date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" - ) - result = s.diff() - expected = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "input,output,diff", - [([False, True, True, False, False], [np.nan, True, False, True, False], 1)], - ) - def test_bool_diff(self, input, output, diff): - # boolean series (test for fixing #17294) - s = Series(input) - result = s.diff() - expected = Series(output) - tm.assert_series_equal(result, expected) - - def test_obj_diff(self): - # object series - s = Series([False, True, 5.0, np.nan, True, False]) - result = s.diff() - expected = s - s.shift(1) - tm.assert_series_equal(result, expected) - - def _check_accum_op(self, name, datetime_series_, check_dtype=True): - func = getattr(np, name) - tm.assert_numpy_array_equal( - func(datetime_series_).values, - func(np.array(datetime_series_)), - check_dtype=check_dtype, - ) - - # with missing values - ts = datetime_series_.copy() - ts[::2] = np.NaN - - result = func(ts)[1::2] - expected = func(np.array(ts.dropna())) - - tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) - - def test_compress(self): - cond = [True, False, True, False, False] - s = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo") - expected = Series(s.values.compress(cond), index=list("ac"), name="foo") - with tm.assert_produces_warning(FutureWarning): - result = s.compress(cond) - tm.assert_series_equal(result, expected) - - def test_numpy_compress(self): - cond = [True, False, True, False, False] - s = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo") - expected = Series(s.values.compress(cond), index=list("ac"), name="foo") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_series_equal(np.compress(cond, s), expected) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - msg = "the 'axis' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.compress(cond, s, axis=1) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.compress(cond, s, out=s) - - def test_round(self, datetime_series): - datetime_series.index.name = "index_name" - result = datetime_series.round(2) - expected = Series( - np.round(datetime_series.values, 2), index=datetime_series.index, name="ts" - ) - tm.assert_series_equal(result, expected) - assert result.name == datetime_series.name - - def test_numpy_round(self): - # See gh-12600 - s = Series([1.53, 1.36, 0.06]) - out = np.round(s, decimals=0) - expected = Series([2.0, 1.0, 0.0]) - tm.assert_series_equal(out, expected) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.round(s, decimals=0, out=s) - - def test_numpy_round_nan(self): - # See gh-14197 - s = Series([1.53, np.nan, 0.06]) - with tm.assert_produces_warning(None): - result = s.round() - expected = Series([2.0, np.nan, 0.0]) - tm.assert_series_equal(result, expected) - - def test_built_in_round(self): - s = Series([1.123, 2.123, 3.123], 
index=range(3)) - result = round(s) - expected_rounded0 = Series([1.0, 2.0, 3.0], index=range(3)) - tm.assert_series_equal(result, expected_rounded0) - - decimals = 2 - expected_rounded = Series([1.12, 2.12, 3.12], index=range(3)) - result = round(s, decimals) - tm.assert_series_equal(result, expected_rounded) - def test_prod_numpy16_bug(self): s = Series([1.0, 1.0, 1.0], index=range(3)) result = s.prod() assert not isinstance(result, Series) - @td.skip_if_no_scipy - def test_corr(self, datetime_series): - import scipy.stats as stats - - # full overlap - tm.assert_almost_equal(datetime_series.corr(datetime_series), 1) - - # partial overlap - tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]), 1) - - assert isna(datetime_series[:15].corr(datetime_series[5:], min_periods=12)) - - ts1 = datetime_series[:15].reindex(datetime_series.index) - ts2 = datetime_series[5:].reindex(datetime_series.index) - assert isna(ts1.corr(ts2, min_periods=12)) - - # No overlap - assert np.isnan(datetime_series[::2].corr(datetime_series[1::2])) - - # all NA - cp = datetime_series[:10].copy() - cp[:] = np.nan - assert isna(cp.corr(cp)) - - A = tm.makeTimeSeries() - B = tm.makeTimeSeries() - result = A.corr(B) - expected, _ = stats.pearsonr(A, B) - tm.assert_almost_equal(result, expected) - - @td.skip_if_no_scipy - def test_corr_rank(self): - import scipy.stats as stats - - # kendall and spearman - A = tm.makeTimeSeries() - B = tm.makeTimeSeries() - A[-5:] = A[:5] - result = A.corr(B, method="kendall") - expected = stats.kendalltau(A, B)[0] - tm.assert_almost_equal(result, expected) - - result = A.corr(B, method="spearman") - expected = stats.spearmanr(A, B)[0] - tm.assert_almost_equal(result, expected) - - # results from R - A = Series( - [ - -0.89926396, - 0.94209606, - -1.03289164, - -0.95445587, - 0.76910310, - -0.06430576, - -2.09704447, - 0.40660407, - -0.89926396, - 0.94209606, - ] - ) - B = Series( - [ - -1.01270225, - -0.62210117, - -1.56895827, - 0.59592943, - -0.01680292, - 1.17258718, - -1.06009347, - -0.10222060, - -0.89076239, - 0.89372375, - ] - ) - kexp = 0.4319297 - sexp = 0.5853767 - tm.assert_almost_equal(A.corr(B, method="kendall"), kexp) - tm.assert_almost_equal(A.corr(B, method="spearman"), sexp) - - def test_corr_invalid_method(self): - # GH PR #22298 - s1 = pd.Series(np.random.randn(10)) - s2 = pd.Series(np.random.randn(10)) - msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, " - with pytest.raises(ValueError, match=msg): - s1.corr(s2, method="____") - - def test_corr_callable_method(self, datetime_series): - # simple correlation example - # returns 1 if exact equality, 0 otherwise - my_corr = lambda a, b: 1.0 if (a == b).all() else 0.0 - - # simple example - s1 = Series([1, 2, 3, 4, 5]) - s2 = Series([5, 4, 3, 2, 1]) - expected = 0 - tm.assert_almost_equal(s1.corr(s2, method=my_corr), expected) - - # full overlap - tm.assert_almost_equal( - datetime_series.corr(datetime_series, method=my_corr), 1.0 - ) - - # partial overlap - tm.assert_almost_equal( - datetime_series[:15].corr(datetime_series[5:], method=my_corr), 1.0 - ) - - # No overlap - assert np.isnan( - datetime_series[::2].corr(datetime_series[1::2], method=my_corr) - ) - - # dataframe example - df = pd.DataFrame([s1, s2]) - expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) - tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) - - def test_cov(self, datetime_series): - # full overlap - tm.assert_almost_equal( - datetime_series.cov(datetime_series), 
datetime_series.std() ** 2 - ) - - # partial overlap - tm.assert_almost_equal( - datetime_series[:15].cov(datetime_series[5:]), - datetime_series[5:15].std() ** 2, - ) - - # No overlap - assert np.isnan(datetime_series[::2].cov(datetime_series[1::2])) - - # all NA - cp = datetime_series[:10].copy() - cp[:] = np.nan - assert isna(cp.cov(cp)) - - # min_periods - assert isna(datetime_series[:15].cov(datetime_series[5:], min_periods=12)) - - ts1 = datetime_series[:15].reindex(datetime_series.index) - ts2 = datetime_series[5:].reindex(datetime_series.index) - assert isna(ts1.cov(ts2, min_periods=12)) - - def test_count(self, datetime_series): - assert datetime_series.count() == len(datetime_series) - - datetime_series[::2] = np.NaN - - assert datetime_series.count() == np.isfinite(datetime_series).sum() - - mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, np.nan, 1, 2]]) - ts = Series(np.arange(len(mi)), index=mi) - - left = ts.count(level=1) - right = Series([2, 3, 1], index=[1, 2, np.nan]) - tm.assert_series_equal(left, right) - - ts.iloc[[0, 3, 5]] = np.nan - tm.assert_series_equal(ts.count(level=1), right - 1) - def test_dot(self): a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) b = DataFrame( @@ -652,255 +115,12 @@ def test_matmul(self): with pytest.raises(ValueError, match=msg): a.dot(b.T) - def test_clip(self, datetime_series): - val = datetime_series.median() - - with tm.assert_produces_warning(FutureWarning): - assert datetime_series.clip_lower(val).min() == val - with tm.assert_produces_warning(FutureWarning): - assert datetime_series.clip_upper(val).max() == val - - assert datetime_series.clip(lower=val).min() == val - assert datetime_series.clip(upper=val).max() == val - - result = datetime_series.clip(-0.5, 0.5) - expected = np.clip(datetime_series, -0.5, 0.5) - tm.assert_series_equal(result, expected) - assert isinstance(expected, Series) - - def test_clip_types_and_nulls(self): - - sers = [ - Series([np.nan, 1.0, 2.0, 3.0]), - Series([None, "a", "b", "c"]), - Series(pd.to_datetime([np.nan, 1, 2, 3], unit="D")), - ] - - for s in sers: - thresh = s[2] - with tm.assert_produces_warning(FutureWarning): - lower = s.clip_lower(thresh) - with tm.assert_produces_warning(FutureWarning): - upper = s.clip_upper(thresh) - assert lower[notna(lower)].min() == thresh - assert upper[notna(upper)].max() == thresh - assert list(isna(s)) == list(isna(lower)) - assert list(isna(s)) == list(isna(upper)) - - def test_clip_with_na_args(self): - """Should process np.nan argument as None """ - # GH # 17276 - s = Series([1, 2, 3]) - - tm.assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) - tm.assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) - - # GH #19992 - tm.assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan])) - tm.assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1])) - - def test_clip_against_series(self): - # GH #6966 - - s = Series([1.0, 1.0, 4.0]) - threshold = Series([1.0, 2.0, 3.0]) - - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(s.clip_lower(threshold), Series([1.0, 2.0, 4.0])) - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(s.clip_upper(threshold), Series([1.0, 1.0, 3.0])) - - lower = Series([1.0, 2.0, 3.0]) - upper = Series([1.5, 2.5, 3.5]) - - tm.assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5])) - tm.assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5])) - - @pytest.mark.parametrize("inplace", [True, False]) - 
@pytest.mark.parametrize("upper", [[1, 2, 3], np.asarray([1, 2, 3])]) - def test_clip_against_list_like(self, inplace, upper): - # GH #15390 - original = pd.Series([5, 6, 7]) - result = original.clip(upper=upper, inplace=inplace) - expected = pd.Series([1, 2, 3]) - - if inplace: - result = original - tm.assert_series_equal(result, expected, check_exact=True) - - def test_clip_with_datetimes(self): - - # GH 11838 - # naive and tz-aware datetimes - - t = Timestamp("2015-12-01 09:30:30") - s = Series([Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:31:00")]) - result = s.clip(upper=t) - expected = Series( - [Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:30:30")] - ) - tm.assert_series_equal(result, expected) - - t = Timestamp("2015-12-01 09:30:30", tz="US/Eastern") - s = Series( - [ - Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), - Timestamp("2015-12-01 09:31:00", tz="US/Eastern"), - ] - ) - result = s.clip(upper=t) - expected = Series( - [ - Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), - Timestamp("2015-12-01 09:30:30", tz="US/Eastern"), - ] - ) - tm.assert_series_equal(result, expected) - - def test_cummethods_bool(self): - # GH 6270 - - a = pd.Series([False, False, False, True, True, False, False]) - b = ~a - c = pd.Series([False] * len(b)) - d = ~c - methods = { - "cumsum": np.cumsum, - "cumprod": np.cumprod, - "cummin": np.minimum.accumulate, - "cummax": np.maximum.accumulate, - } - args = product((a, b, c, d), methods) - for s, method in args: - expected = Series(methods[method](s.values)) - result = getattr(s, method)() - tm.assert_series_equal(result, expected) - - e = pd.Series([False, True, np.nan, False]) - cse = pd.Series([0, 1, np.nan, 1], dtype=object) - cpe = pd.Series([False, 0, np.nan, 0]) - cmin = pd.Series([False, False, np.nan, False]) - cmax = pd.Series([False, True, np.nan, True]) - expecteds = {"cumsum": cse, "cumprod": cpe, "cummin": cmin, "cummax": cmax} - - for method in methods: - res = getattr(e, method)() - tm.assert_series_equal(res, expecteds[method]) - - def test_isin(self): - s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) - - result = s.isin(["A", "C"]) - expected = Series([True, False, True, False, False, False, True, True]) - tm.assert_series_equal(result, expected) - - # GH: 16012 - # This specific issue has to have a series over 1e6 in len, but the - # comparison array (in_list) must be large enough so that numpy doesn't - # do a manual masking trick that will avoid this issue altogether - s = Series(list("abcdefghijk" * 10 ** 5)) - # If numpy doesn't do the manual comparison/mask, these - # unorderable mixed types are what cause the exception in numpy - in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6 - - assert s.isin(in_list).sum() == 200000 - - def test_isin_with_string_scalar(self): - # GH4763 - s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) - msg = ( - r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[str\]" - ) - with pytest.raises(TypeError, match=msg): - s.isin("a") - - s = Series(["aaa", "b", "c"]) - with pytest.raises(TypeError, match=msg): - s.isin("aaa") - - def test_isin_with_i8(self): - # GH 5021 - - expected = Series([True, True, False, False, False]) - expected2 = Series([False, True, False, False, False]) - - # datetime64[ns] - s = Series(date_range("jan-01-2013", "jan-05-2013")) - - result = s.isin(s[0:2]) - tm.assert_series_equal(result, expected) - - result = s.isin(s[0:2].values) - tm.assert_series_equal(result, expected) - - 
# fails on dtype conversion in the first place - result = s.isin(s[0:2].values.astype("datetime64[D]")) - tm.assert_series_equal(result, expected) - - result = s.isin([s[1]]) - tm.assert_series_equal(result, expected2) - - result = s.isin([np.datetime64(s[1])]) - tm.assert_series_equal(result, expected2) - - result = s.isin(set(s[0:2])) - tm.assert_series_equal(result, expected) - - # timedelta64[ns] - s = Series(pd.to_timedelta(range(5), unit="d")) - result = s.isin(s[0:2]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) - def test_isin_empty(self, empty): - # see gh-16991 - s = Series(["a", "b"]) - expected = Series([False, False]) - - result = s.isin(empty) - tm.assert_series_equal(expected, result) - def test_ptp(self): # GH21614 N = 1000 arr = np.random.randn(N) ser = Series(arr) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert np.ptp(ser) == np.ptp(arr) - - # GH11163 - s = Series([3, 5, np.nan, -3, 10]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert s.ptp() == 13 - assert pd.isna(s.ptp(skipna=False)) - - mi = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]]) - s = pd.Series([1, np.nan, 7, 3, 5, np.nan], index=mi) - - expected = pd.Series([6, 2], index=["a", "b"], dtype=np.float64) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_series_equal(s.ptp(level=0), expected) - - expected = pd.Series([np.nan, np.nan], index=["a", "b"]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_series_equal(s.ptp(level=0, skipna=False), expected) - - msg = "No axis named 1 for object type " - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s.ptp(axis=1) - - s = pd.Series(["a", "b", "c", "d", "e"]) - msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s.ptp() - - msg = r"Series\.ptp does not implement numeric_only\." 
- with pytest.raises(NotImplementedError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s.ptp(numeric_only=True) + assert np.ptp(ser) == np.ptp(arr) def test_repeat(self): s = Series(np.random.randn(3), index=["a", "b", "c"]) @@ -923,54 +143,6 @@ def test_numpy_repeat(self): with pytest.raises(ValueError, match=msg): np.repeat(s, 2, axis=0) - def test_searchsorted(self): - s = Series([1, 2, 3]) - - result = s.searchsorted(1, side="left") - assert is_scalar(result) - assert result == 0 - - result = s.searchsorted(1, side="right") - assert is_scalar(result) - assert result == 1 - - def test_searchsorted_numeric_dtypes_scalar(self): - s = Series([1, 2, 90, 1000, 3e9]) - r = s.searchsorted(30) - assert is_scalar(r) - assert r == 2 - - r = s.searchsorted([30]) - e = np.array([2], dtype=np.intp) - tm.assert_numpy_array_equal(r, e) - - def test_searchsorted_numeric_dtypes_vector(self): - s = Series([1, 2, 90, 1000, 3e9]) - r = s.searchsorted([91, 2e6]) - e = np.array([3, 4], dtype=np.intp) - tm.assert_numpy_array_equal(r, e) - - def test_search_sorted_datetime64_scalar(self): - s = Series(pd.date_range("20120101", periods=10, freq="2D")) - v = pd.Timestamp("20120102") - r = s.searchsorted(v) - assert is_scalar(r) - assert r == 1 - - def test_search_sorted_datetime64_list(self): - s = Series(pd.date_range("20120101", periods=10, freq="2D")) - v = [pd.Timestamp("20120102"), pd.Timestamp("20120104")] - r = s.searchsorted(v) - e = np.array([1, 2], dtype=np.intp) - tm.assert_numpy_array_equal(r, e) - - def test_searchsorted_sorter(self): - # GH8490 - s = Series([3, 1, 2]) - r = s.searchsorted([0, 3], sorter=np.argsort(s)) - e = np.array([0, 2], dtype=np.intp) - tm.assert_numpy_array_equal(r, e) - def test_is_monotonic(self): s = Series(np.random.randint(0, 10, size=1000)) @@ -988,65 +160,6 @@ def test_is_monotonic(self): assert s.is_monotonic is False assert s.is_monotonic_decreasing is True - def test_sort_index_level(self): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) - s = Series([1, 2], mi) - backwards = s.iloc[[1, 0]] - - res = s.sort_index(level="A") - tm.assert_series_equal(backwards, res) - - res = s.sort_index(level=["A", "B"]) - tm.assert_series_equal(backwards, res) - - res = s.sort_index(level="A", sort_remaining=False) - tm.assert_series_equal(s, res) - - res = s.sort_index(level=["A", "B"], sort_remaining=False) - tm.assert_series_equal(s, res) - - def test_apply_categorical(self): - values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) - s = pd.Series(values, name="XX", index=list("abcdefg")) - result = s.apply(lambda x: x.lower()) - - # should be categorical dtype when the number of categories are - # the same - values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) - exp = pd.Series(values, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - tm.assert_categorical_equal(result.values, exp.values) - - result = s.apply(lambda x: "A") - exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - assert result.dtype == np.object - - def test_shift_int(self, datetime_series): - ts = datetime_series.astype(int) - shifted = ts.shift(1) - expected = ts.astype(float).shift(1) - tm.assert_series_equal(shifted, expected) - - def test_shift_categorical(self): - # GH 9416 - s = pd.Series(["a", "b", "c", "d"], dtype="category") - - tm.assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna()) - - sp1 = s.shift(1) 
- tm.assert_index_equal(s.index, sp1.index) - assert np.all(sp1.values.codes[:1] == -1) - assert np.all(s.values.codes[:-1] == sp1.values.codes[1:]) - - sn2 = s.shift(-2) - tm.assert_index_equal(s.index, sn2.index) - assert np.all(sn2.values.codes[-2:] == -1) - assert np.all(s.values.codes[2:] == sn2.values.codes[:-2]) - - tm.assert_index_equal(s.values.categories, sp1.values.categories) - tm.assert_index_equal(s.values.categories, sn2.values.categories) - def test_unstack(self): index = MultiIndex( @@ -1106,117 +219,6 @@ def test_unstack(self): right.index = pd.MultiIndex.from_tuples(tpls) tm.assert_frame_equal(ts.unstack(level=0), right) - def test_value_counts_datetime(self): - # most dtypes are tested in test_base.py - values = [ - pd.Timestamp("2011-01-01 09:00"), - pd.Timestamp("2011-01-01 10:00"), - pd.Timestamp("2011-01-01 11:00"), - pd.Timestamp("2011-01-01 09:00"), - pd.Timestamp("2011-01-01 09:00"), - pd.Timestamp("2011-01-01 11:00"), - ] - - exp_idx = pd.DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] - ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - # check DatetimeIndex outputs the same result - idx = pd.DatetimeIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - - def test_value_counts_datetime_tz(self): - values = [ - pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), - pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), - ] - - exp_idx = pd.DatetimeIndex( - ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], - tz="US/Eastern", - ) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - idx = pd.DatetimeIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - - def test_value_counts_period(self): - values = [ - pd.Period("2011-01", freq="M"), - pd.Period("2011-02", freq="M"), - pd.Period("2011-03", freq="M"), - pd.Period("2011-01", freq="M"), - pd.Period("2011-01", freq="M"), - pd.Period("2011-03", freq="M"), - ] - - exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - # check DatetimeIndex outputs the same result - idx = pd.PeriodIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - - def test_value_counts_categorical_ordered(self): - # most dtypes are tested in test_base.py - values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) - - exp_idx = pd.CategoricalIndex([1, 3, 2], 
categories=[1, 2, 3], ordered=True) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - # check CategoricalIndex outputs the same result - idx = pd.CategoricalIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - - def test_value_counts_categorical_not_ordered(self): - values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) - - exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) - exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - - s = pd.Series(values, name="xxx") - tm.assert_series_equal(s.value_counts(), exp) - # check CategoricalIndex outputs the same result - idx = pd.CategoricalIndex(values, name="xxx") - tm.assert_series_equal(idx.value_counts(), exp) - - # normalize - exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") - tm.assert_series_equal(s.value_counts(normalize=True), exp) - tm.assert_series_equal(idx.value_counts(normalize=True), exp) - @pytest.mark.parametrize("func", [np.any, np.all]) @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) @td.skip_if_np_lt("1.15") @@ -1266,381 +268,3 @@ def test_validate_stat_keepdims(self): ) with pytest.raises(ValueError, match=msg): np.sum(s, keepdims=True) - - def test_compound_deprecated(self): - s = Series([0.1, 0.2, 0.3, 0.4]) - with tm.assert_produces_warning(FutureWarning): - s.compound() - - df = pd.DataFrame({"s": s}) - with tm.assert_produces_warning(FutureWarning): - df.compound() - - -main_dtypes = [ - "datetime", - "datetimetz", - "timedelta", - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "uint8", - "uint16", - "uint32", - "uint64", -] - - -@pytest.fixture -def s_main_dtypes(): - """A DataFrame with many dtypes - - * datetime - * datetimetz - * timedelta - * [u]int{8,16,32,64} - * float{32,64} - - The columns are the name of the dtype. 
- """ - df = pd.DataFrame( - { - "datetime": pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), - "datetimetz": pd.to_datetime( - ["2003", "2002", "2001", "2002", "2005"] - ).tz_localize("US/Eastern"), - "timedelta": pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), - } - ) - - for dtype in [ - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "uint8", - "uint16", - "uint32", - "uint64", - ]: - df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype) - - return df - - -@pytest.fixture(params=main_dtypes) -def s_main_dtypes_split(request, s_main_dtypes): - """Each series in s_main_dtypes.""" - return s_main_dtypes[request.param] - - -def assert_check_nselect_boundary(vals, dtype, method): - # helper function for 'test_boundary_{dtype}' tests - s = Series(vals, dtype=dtype) - result = getattr(s, method)(3) - expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1] - expected = s.loc[expected_idxr] - tm.assert_series_equal(result, expected) - - -class TestNLargestNSmallest: - @pytest.mark.parametrize( - "r", - [ - Series([3.0, 2, 1, 2, "5"], dtype="object"), - Series([3.0, 2, 1, 2, 5], dtype="object"), - # not supported on some archs - # Series([3., 2, 1, 2, 5], dtype='complex256'), - Series([3.0, 2, 1, 2, 5], dtype="complex128"), - Series(list("abcde")), - Series(list("abcde"), dtype="category"), - ], - ) - def test_error(self, r): - dt = r.dtype - msg = "Cannot use method 'n(larg|small)est' with dtype {dt}".format(dt=dt) - args = 2, len(r), 0, -1 - methods = r.nlargest, r.nsmallest - for method, arg in product(methods, args): - with pytest.raises(TypeError, match=msg): - method(arg) - - def test_nsmallest_nlargest(self, s_main_dtypes_split): - # float, int, datetime64 (use i8), timedelts64 (same), - # object that are numbers, object that are strings - s = s_main_dtypes_split - - tm.assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) - tm.assert_series_equal(s.nsmallest(2, keep="last"), s.iloc[[2, 3]]) - - empty = s.iloc[0:0] - tm.assert_series_equal(s.nsmallest(0), empty) - tm.assert_series_equal(s.nsmallest(-1), empty) - tm.assert_series_equal(s.nlargest(0), empty) - tm.assert_series_equal(s.nlargest(-1), empty) - - tm.assert_series_equal(s.nsmallest(len(s)), s.sort_values()) - tm.assert_series_equal(s.nsmallest(len(s) + 1), s.sort_values()) - tm.assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) - tm.assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]]) - - def test_misc(self): - - s = Series([3.0, np.nan, 1, 2, 5]) - tm.assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) - tm.assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) - - msg = 'keep must be either "first", "last"' - with pytest.raises(ValueError, match=msg): - s.nsmallest(keep="invalid") - with pytest.raises(ValueError, match=msg): - s.nlargest(keep="invalid") - - # GH 15297 - s = Series([1] * 5, index=[1, 2, 3, 4, 5]) - expected_first = Series([1] * 3, index=[1, 2, 3]) - expected_last = Series([1] * 3, index=[5, 4, 3]) - - result = s.nsmallest(3) - tm.assert_series_equal(result, expected_first) - - result = s.nsmallest(3, keep="last") - tm.assert_series_equal(result, expected_last) - - result = s.nlargest(3) - tm.assert_series_equal(result, expected_first) - - result = s.nlargest(3, keep="last") - tm.assert_series_equal(result, expected_last) - - @pytest.mark.parametrize("n", range(1, 5)) - def test_n(self, n): - - # GH 13412 - s = Series([1, 4, 3, 2], index=[0, 0, 1, 1]) - result = s.nlargest(n) - expected = s.sort_values(ascending=False).head(n) - 
tm.assert_series_equal(result, expected) - - result = s.nsmallest(n) - expected = s.sort_values().head(n) - tm.assert_series_equal(result, expected) - - def test_boundary_integer(self, nselect_method, any_int_dtype): - # GH 21426 - dtype_info = np.iinfo(any_int_dtype) - min_val, max_val = dtype_info.min, dtype_info.max - vals = [min_val, min_val + 1, max_val - 1, max_val] - assert_check_nselect_boundary(vals, any_int_dtype, nselect_method) - - def test_boundary_float(self, nselect_method, float_dtype): - # GH 21426 - dtype_info = np.finfo(float_dtype) - min_val, max_val = dtype_info.min, dtype_info.max - min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=float_dtype) - vals = [min_val, min_2nd, max_2nd, max_val] - assert_check_nselect_boundary(vals, float_dtype, nselect_method) - - @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) - def test_boundary_datetimelike(self, nselect_method, dtype): - # GH 21426 - # use int64 bounds and +1 to min_val since true minimum is NaT - # (include min_val/NaT at end to maintain same expected_idxr) - dtype_info = np.iinfo("int64") - min_val, max_val = dtype_info.min, dtype_info.max - vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] - assert_check_nselect_boundary(vals, dtype, nselect_method) - - def test_duplicate_keep_all_ties(self): - # see gh-16818 - s = Series([10, 9, 8, 7, 7, 7, 7, 6]) - result = s.nlargest(4, keep="all") - expected = Series([10, 9, 8, 7, 7, 7, 7]) - tm.assert_series_equal(result, expected) - - result = s.nsmallest(2, keep="all") - expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( - "data,expected", [([True, False], [True]), ([True, False, True, True], [True])] - ) - def test_boolean(self, data, expected): - # GH 26154 : ensure True > False - s = Series(data) - result = s.nlargest(1) - expected = Series(expected) - tm.assert_series_equal(result, expected) - - -class TestCategoricalSeriesAnalytics: - def test_count(self): - - s = Series( - Categorical( - [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True - ) - ) - result = s.count() - assert result == 2 - - def test_value_counts(self): - # GH 12835 - cats = Categorical(list("abcccb"), categories=list("cabd")) - s = Series(cats, name="xxx") - res = s.value_counts(sort=False) - - exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) - exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) - tm.assert_series_equal(res, exp) - - res = s.value_counts(sort=True) - - exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) - exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) - tm.assert_series_equal(res, exp) - - # check object dtype handles the Series.name as the same - # (tested in test_base.py) - s = Series(["a", "b", "c", "c", "c", "b"], name="xxx") - res = s.value_counts() - exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) - tm.assert_series_equal(res, exp) - - def test_value_counts_with_nan(self): - # see gh-9443 - - # sanity check - s = Series(["a", "b", "a"], dtype="category") - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) - - res = s.value_counts(dropna=True) - tm.assert_series_equal(res, exp) - - res = s.value_counts(dropna=True) - tm.assert_series_equal(res, exp) - - # same Series via two different constructions --> same behaviour - series = [ - Series(["a", "b", None, "a", None, None], dtype="category"), - Series( - Categorical(["a", "b", None, "a", None, None], categories=["a", 
"b"]) - ), - ] - - for s in series: - # None is a NaN value, so we exclude its count here - exp = Series([2, 1], index=CategoricalIndex(["a", "b"])) - res = s.value_counts(dropna=True) - tm.assert_series_equal(res, exp) - - # we don't exclude the count of None and sort by counts - exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"])) - res = s.value_counts(dropna=False) - tm.assert_series_equal(res, exp) - - # When we aren't sorting by counts, and np.nan isn't a - # category, it should be last. - exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan])) - res = s.value_counts(dropna=False, sort=False) - tm.assert_series_equal(res, exp) - - @pytest.mark.parametrize( - "dtype", - ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], - ) - def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): - cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) - - # Test case 1 - input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) - tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) - if dtype == "datetime64[D]": - # pre-empty flaky xfail, tc1 values are seemingly-random - if not (np.array(tc1) == input1).all(): - pytest.xfail(reason="GH#7996") - - expected = Series([False, False, False, True]) - tm.assert_series_equal(tc1.duplicated(), expected) - tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(inplace=True) - tm.assert_series_equal(sc, tc1[~expected]) - - expected = Series([False, False, True, False]) - tm.assert_series_equal(tc1.duplicated(keep="last"), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(keep="last", inplace=True) - tm.assert_series_equal(sc, tc1[~expected]) - - expected = Series([False, False, True, True]) - tm.assert_series_equal(tc1.duplicated(keep=False), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(keep=False, inplace=True) - tm.assert_series_equal(sc, tc1[~expected]) - - # Test case 2 - input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) - tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) - if dtype == "datetime64[D]": - # pre-empty flaky xfail, tc2 values are seemingly-random - if not (np.array(tc2) == input2).all(): - pytest.xfail(reason="GH#7996") - - expected = Series([False, False, False, False, True, True, False]) - tm.assert_series_equal(tc2.duplicated(), expected) - tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(inplace=True) - tm.assert_series_equal(sc, tc2[~expected]) - - expected = Series([False, True, True, False, False, False, False]) - tm.assert_series_equal(tc2.duplicated(keep="last"), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(keep="last", inplace=True) - tm.assert_series_equal(sc, tc2[~expected]) - - expected = Series([False, True, True, False, True, True, False]) - tm.assert_series_equal(tc2.duplicated(keep=False), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(keep=False, inplace=True) - tm.assert_series_equal(sc, tc2[~expected]) - - def test_drop_duplicates_categorical_bool(self, ordered_fixture): - tc = Series( - Categorical( - [True, False, True, False], - categories=[True, False], - 
ordered=ordered_fixture, - ) - ) - - expected = Series([False, False, True, True]) - tm.assert_series_equal(tc.duplicated(), expected) - tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - expected = Series([True, True, False, False]) - tm.assert_series_equal(tc.duplicated(keep="last"), expected) - tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep="last", inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - expected = Series([True, True, True, True]) - tm.assert_series_equal(tc.duplicated(keep=False), expected) - tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=False, inplace=True) - tm.assert_series_equal(sc, tc[~expected]) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 656bf5a0e8a44..f96d6ddfc357e 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.util._test_decorators import async_mark + import pandas as pd from pandas import ( Categorical, @@ -12,30 +14,20 @@ DatetimeIndex, Index, Series, + Timedelta, TimedeltaIndex, + Timestamp, date_range, period_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import PeriodArray -from pandas.core.indexes.datetimes import Timestamp -import pandas.util.testing as tm import pandas.io.formats.printing as printing -class SharedWithSparse: - """ - A collection of tests Series and SparseSeries can share. - - In generic tests on this class, use ``self._assert_series_equal()`` - which is implemented in sub-classes. - """ - - def _assert_series_equal(self, left, right): - """Dispatch to series class dependent assertion""" - raise NotImplementedError - +class TestSeriesMisc: def test_scalarop_preserve_name(self, datetime_series): result = datetime_series * 2 assert result.name == datetime_series.name @@ -122,29 +114,25 @@ def _pickle_roundtrip(self, obj): unpickled = pd.read_pickle(path) return unpickled - def test_argsort_preserve_name(self, datetime_series): - result = datetime_series.argsort() - assert result.name == datetime_series.name - def test_sort_index_name(self, datetime_series): result = datetime_series.sort_index(ascending=False) assert result.name == datetime_series.name def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} - result = self.series_klass(d) - expected = self.series_klass(d, index=sorted(d.keys())) - self._assert_series_equal(result, expected) + result = Series(d) + expected = Series(d, index=sorted(d.keys())) + tm.assert_series_equal(result, expected) - result = self.series_klass(d, index=["b", "c", "d", "a"]) - expected = self.series_klass([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) - self._assert_series_equal(result, expected) + result = Series(d, index=["b", "c", "d", "a"]) + expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) + tm.assert_series_equal(result, expected) - def test_constructor_subclass_dict(self): - data = tm.TestSubDict((x, 10.0 * x) for x in range(10)) - series = self.series_klass(data) - expected = self.series_klass(dict(data.items())) - self._assert_series_equal(series, expected) + def test_constructor_subclass_dict(self, dict_subclass): + data = dict_subclass((x, 10.0 * x) for x in range(10)) + series = Series(data) + expected = Series(dict(data.items())) + 
tm.assert_series_equal(series, expected) def test_constructor_ordereddict(self): # GH3283 @@ -152,44 +140,44 @@ def test_constructor_ordereddict(self): ("col{i}".format(i=i), np.random.random()) for i in range(12) ) - series = self.series_klass(data) - expected = self.series_klass(list(data.values()), list(data.keys())) - self._assert_series_equal(series, expected) + series = Series(data) + expected = Series(list(data.values()), list(data.keys())) + tm.assert_series_equal(series, expected) # Test with subclass class A(OrderedDict): pass - series = self.series_klass(A(data)) - self._assert_series_equal(series, expected) + series = Series(A(data)) + tm.assert_series_equal(series, expected) def test_constructor_dict_multiindex(self): d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0} _d = sorted(d.items()) - result = self.series_klass(d) - expected = self.series_klass( + result = Series(d) + expected = Series( [x[1] for x in _d], index=pd.MultiIndex.from_tuples([x[0] for x in _d]) ) - self._assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) d["z"] = 111.0 _d.insert(0, ("z", d["z"])) - result = self.series_klass(d) - expected = self.series_klass( + result = Series(d) + expected = Series( [x[1] for x in _d], index=pd.Index([x[0] for x in _d], tupleize_cols=False) ) result = result.reindex(index=expected.index) - self._assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_constructor_dict_timedelta_index(self): # GH #12169 : Resample category data with timedelta index # construct Series from dict as data and TimedeltaIndex as index # will result NaN in result Series data - expected = self.series_klass( + expected = Series( data=["A", "B", "C"], index=pd.to_timedelta([0, 10, 20], unit="s") ) - result = self.series_klass( + result = Series( data={ pd.to_timedelta(0, unit="s"): "A", pd.to_timedelta(10, unit="s"): "B", @@ -197,25 +185,13 @@ def test_constructor_dict_timedelta_index(self): }, index=pd.to_timedelta([0, 10, 20], unit="s"), ) - self._assert_series_equal(result, expected) - - def test_from_array_deprecated(self): - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): - self.series_klass.from_array([1, 2, 3]) + tm.assert_series_equal(result, expected) def test_sparse_accessor_updates_on_inplace(self): s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]") s.drop([0, 1], inplace=True) assert s.sparse.density == 1.0 - -class TestSeriesMisc(SharedWithSparse): - - series_klass = Series - # SharedWithSparse tests use generic, series_klass-agnostic assertion - _assert_series_equal = staticmethod(tm.assert_series_equal) - def test_tab_completion(self): # GH 9910 s = Series(list("abcd")) @@ -261,11 +237,11 @@ def test_tab_completion_with_categorical(self): def get_dir(s): results = [r for r in s.cat.__dir__() if not r.startswith("_")] - return list(sorted(set(results))) + return sorted(set(results)) s = Series(list("aabbcde")).astype("category") results = get_dir(s) - tm.assert_almost_equal(results, list(sorted(set(ok_for_cat)))) + tm.assert_almost_equal(results, sorted(set(ok_for_cat))) @pytest.mark.parametrize( "index", @@ -289,7 +265,7 @@ def get_dir(s): ) def test_index_tab_completion(self, index): # dir contains string-like values of the Index. 
- s = pd.Series(index=index) + s = pd.Series(index=index, dtype=object) dir_s = dir(s) for i, x in enumerate(s.index.unique(level=0)): if i < 100: @@ -298,7 +274,7 @@ def test_index_tab_completion(self, index): assert x not in dir_s def test_not_hashable(self): - s_empty = Series() + s_empty = Series(dtype=object) s = Series([1]) msg = "'Series' objects are mutable, thus they cannot be hashed" with pytest.raises(TypeError, match=msg): @@ -336,7 +312,7 @@ def test_iteritems_strings(self, string_series): for idx, val in string_series.iteritems(): assert val == string_series[idx] - # assert is lazy (genrators don't define reverse, lists do) + # assert is lazy (generators don't define reverse, lists do) assert not hasattr(string_series.iteritems(), "reverse") def test_items_datetimes(self, datetime_series): @@ -347,7 +323,7 @@ def test_items_strings(self, string_series): for idx, val in string_series.items(): assert val == string_series[idx] - # assert is lazy (genrators don't define reverse, lists do) + # assert is lazy (generators don't define reverse, lists do) assert not hasattr(string_series.items(), "reverse") def test_raise_on_info(self): @@ -421,6 +397,50 @@ def test_numpy_unique(self, datetime_series): # it works! np.unique(datetime_series) + def test_item(self): + s = Series([1]) + result = s.item() + assert result == 1 + assert result == s.iloc[0] + assert isinstance(result, int) # i.e. not np.int64 + + ser = Series([0.5], index=[3]) + result = ser.item() + assert isinstance(result, float) + assert result == 0.5 + + ser = Series([1, 2]) + msg = "can only convert an array of size 1" + with pytest.raises(ValueError, match=msg): + ser.item() + + dti = pd.date_range("2016-01-01", periods=2) + with pytest.raises(ValueError, match=msg): + dti.item() + with pytest.raises(ValueError, match=msg): + Series(dti).item() + + val = dti[:1].item() + assert isinstance(val, Timestamp) + val = Series(dti)[:1].item() + assert isinstance(val, Timestamp) + + tdi = dti - dti + with pytest.raises(ValueError, match=msg): + tdi.item() + with pytest.raises(ValueError, match=msg): + Series(tdi).item() + + val = tdi[:1].item() + assert isinstance(val, Timedelta) + val = Series(tdi)[:1].item() + assert isinstance(val, Timedelta) + + # Case where ser[0] would not work + ser = Series(dti, index=[5, 6]) + val = ser[:1].item() + assert val == dti[0] + def test_ndarray_compat(self): # test numpy compat with Series as sub-class of NDFrame @@ -437,13 +457,6 @@ def f(x): expected = tsdf.max() tm.assert_series_equal(result, expected) - # .item() - with tm.assert_produces_warning(FutureWarning): - s = Series([1]) - result = s.item() - assert result == 1 - assert s.item() == s.iloc[0] - # using an ndarray like function s = Series(np.random.randn(10)) result = Series(np.ones_like(s)) @@ -454,30 +467,6 @@ def f(x): s = Series(np.random.randn(10)) tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F")) - # compress - # GH 6658 - s = Series([0, 1.0, -1], index=list("abc")) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = np.compress(s > 0, s) - tm.assert_series_equal(result, Series([1.0], index=["b"])) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = np.compress(s < -1, s) - # result empty Index(dtype=object) as the same as original - exp = Series([], dtype="float64", index=Index([], dtype="object")) - tm.assert_series_equal(result, exp) - - s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3]) - with tm.assert_produces_warning(FutureWarning, 
check_stacklevel=False): - result = np.compress(s > 0, s) - tm.assert_series_equal(result, Series([1.0], index=[0.2])) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = np.compress(s < -1, s) - # result empty Float64Index as the same as original - exp = Series([], dtype="float64", index=Index([], dtype="float64")) - tm.assert_series_equal(result, exp) - def test_str_accessor_updates_on_inplace(self): s = pd.Series(list("abc")) s.drop([0], inplace=True) @@ -497,19 +486,21 @@ def test_str_attribute(self): s.str.repeat(2) def test_empty_method(self): - s_empty = pd.Series() + s_empty = pd.Series(dtype=object) assert s_empty.empty - for full_series in [pd.Series([1]), pd.Series(index=[1])]: + s2 = pd.Series(index=[1], dtype=object) + for full_series in [pd.Series([1]), s2]: assert not full_series.empty - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; s = pd.Series()" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("s.", 1)) @@ -521,11 +512,12 @@ def test_integer_series_size(self): s = Series(range(9), dtype="Int64") assert s.size == 9 - def test_get_values_deprecation(self): - s = Series(range(9)) - with tm.assert_produces_warning(FutureWarning): - res = s.get_values() - tm.assert_numpy_array_equal(res, s.values) + def test_attrs(self): + s = pd.Series([0, 1], name="abc") + assert s.attrs == {} + s.attrs["version"] = 1 + result = s + 1 + assert result.attrs == {"version": 1} class TestCategoricalSeries: diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index e56294669a546..a4c55a80a9f0f 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -1,4 +1,4 @@ -from collections import Counter, OrderedDict, defaultdict +from collections import Counter, defaultdict from itertools import chain import numpy as np @@ -6,8 +6,9 @@ import pandas as pd from pandas import DataFrame, Index, Series, isna +import pandas._testing as tm from pandas.conftest import _get_cython_table_params -import pandas.util.testing as tm +from pandas.core.base import SpecificationError class TestSeriesApply: @@ -36,7 +37,7 @@ def test_apply(self, datetime_series): assert s.name == rs.name # index but no data - s = Series(index=[1, 2, 3]) + s = Series(index=[1, 2, 3], dtype=np.float64) rs = s.apply(lambda x: x) tm.assert_series_equal(s, rs) @@ -91,7 +92,7 @@ def test_apply_box(self): s = pd.Series(vals) assert s.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.apply(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) @@ -101,7 +102,7 @@ def test_apply_box(self): ] s = pd.Series(vals) assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.apply(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + res = s.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) @@ -109,7 +110,7 @@ def test_apply_box(self): vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] s = 
pd.Series(vals) assert s.dtype == "timedelta64[ns]" - res = s.apply(lambda x: "{0}_{1}".format(x.__class__.__name__, x.days)) + res = s.apply(lambda x: f"{type(x).__name__}_{x.days}") exp = pd.Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) @@ -117,7 +118,7 @@ def test_apply_box(self): vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = pd.Series(vals) assert s.dtype == "Period[M]" - res = s.apply(lambda x: "{0}_{1}".format(x.__class__.__name__, x.freqstr)) + res = s.apply(lambda x: f"{type(x).__name__}_{x.freqstr}") exp = pd.Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) @@ -157,9 +158,27 @@ def test_apply_dict_depr(self): columns=["A", "B", "C"], index=pd.date_range("1/1/2000", periods=10), ) - with tm.assert_produces_warning(FutureWarning): + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): tsdf.A.agg({"foo": ["sum", "mean"]}) + def test_apply_categorical(self): + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + ser = pd.Series(values, name="XX", index=list("abcdefg")) + result = ser.apply(lambda x: x.lower()) + + # should be categorical dtype when the number of categories are + # the same + values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) + exp = pd.Series(values, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + tm.assert_categorical_equal(result.values, exp.values) + + result = ser.apply(lambda x: "A") + exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + assert result.dtype == np.object + @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) def test_apply_categorical_with_nan_values(self, series): # GH 20714 bug fixed in: GH 24275 @@ -170,6 +189,12 @@ def test_apply_categorical_with_nan_values(self, series): expected = expected.astype(object) tm.assert_series_equal(result, expected) + def test_apply_empty_integer_series_with_datetime_index(self): + # GH 21245 + s = pd.Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int) + result = s.apply(lambda x: x) + tm.assert_series_equal(result, s) + class TestSeriesAggregate: def test_transform(self, string_series): @@ -250,31 +275,17 @@ def test_demo(self): tm.assert_series_equal(result, expected) # nested renaming - with tm.assert_produces_warning(FutureWarning): - result = s.agg({"foo": ["min", "max"]}) - - expected = ( - DataFrame({"foo": [0, 5]}, index=["min", "max"]).unstack().rename("series") - ) - tm.assert_series_equal(result, expected) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"]}) def test_multiple_aggregators_with_dict_api(self): s = Series(range(6), dtype="int64", name="series") # nested renaming - with tm.assert_produces_warning(FutureWarning): - result = s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) - - expected = ( - DataFrame( - {"foo": [5.0, np.nan, 0.0, np.nan], "bar": [np.nan, 2.5, np.nan, 15.0]}, - columns=["foo", "bar"], - index=["max", "mean", "min", "sum"], - ) - .unstack() - .rename("series") - ) - tm.assert_series_equal(result.reindex_like(expected), expected) + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) def test_agg_apply_evaluate_lambdas_the_same(self, string_series): # test that we are evaluating row-by-row first @@ 
-303,18 +314,16 @@ def test_replicate_describe(self, string_series): # this also tests a result set that is all scalars expected = string_series.describe() result = string_series.apply( - OrderedDict( - [ - ("count", "count"), - ("mean", "mean"), - ("std", "std"), - ("min", "min"), - ("25%", lambda x: x.quantile(0.25)), - ("50%", "median"), - ("75%", lambda x: x.quantile(0.75)), - ("max", "max"), - ] - ) + { + "count": "count", + "mean": "mean", + "std": "std", + "min": "min", + "25%": lambda x: x.quantile(0.25), + "50%": "median", + "75%": lambda x: x.quantile(0.75), + "max": "max", + } ) tm.assert_series_equal(result, expected) @@ -339,14 +348,14 @@ def test_non_callable_aggregates(self): # test when mixed w/ callable reducers result = s.agg(["size", "count", "mean"]) - expected = Series(OrderedDict([("size", 3.0), ("count", 2.0), ("mean", 1.5)])) + expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) tm.assert_series_equal(result[expected.index], expected) @pytest.mark.parametrize( "series, func, expected", chain( _get_cython_table_params( - Series(), + Series(dtype=np.float64), [ ("sum", 0), ("max", np.nan), @@ -401,8 +410,11 @@ def test_agg_cython_table(self, series, func, expected): "series, func, expected", chain( _get_cython_table_params( - Series(), - [("cumprod", Series([], Index([]))), ("cumsum", Series([], Index([])))], + Series(dtype=np.float64), + [ + ("cumprod", Series([], Index([]), dtype=np.float64)), + ("cumsum", Series([], Index([]), dtype=np.float64)), + ], ), _get_cython_table_params( Series([np.nan, 1, 2, 3]), @@ -581,6 +593,14 @@ def test_map_defaultdict(self): expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + def test_map_dict_na_key(self): + # https://github.com/pandas-dev/pandas/issues/17648 + # Checks that np.nan key is appropriately mapped + s = Series([1, 2, np.nan]) + expected = Series(["a", "b", "c"]) + result = s.map({1: "a", 2: "b", np.nan: "c"}) + tm.assert_series_equal(result, expected) + def test_map_dict_subclass_with_missing(self): """ Test Series.map with a dictionary subclass that defines __missing__, @@ -607,12 +627,36 @@ class DictWithoutMissing(dict): expected = Series([np.nan, np.nan, "three"]) tm.assert_series_equal(result, expected) + def test_map_abc_mapping(self, non_mapping_dict_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check collections.abc.Mapping support as mapper for Series.map + s = Series([1, 2, 3]) + not_a_dictionary = non_mapping_dict_subclass({3: "three"}) + result = s.map(not_a_dictionary) + expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + + def test_map_abc_mapping_with_missing(self, non_mapping_dict_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check collections.abc.Mapping support as mapper for Series.map + class NonDictMappingWithMissing(non_mapping_dict_subclass): + def __missing__(self, key): + return "missing" + + s = Series([1, 2, 3]) + not_a_dictionary = NonDictMappingWithMissing({3: "three"}) + result = s.map(not_a_dictionary) + # __missing__ is a dict concept, not a Mapping concept, + # so it should not change the result! 
+ expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + def test_map_box(self): vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] s = pd.Series(vals) assert s.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.map(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + res = s.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) @@ -622,7 +666,7 @@ def test_map_box(self): ] s = pd.Series(vals) assert s.dtype == "datetime64[ns, US/Eastern]" - res = s.map(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + res = s.map(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}") exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) @@ -630,7 +674,7 @@ def test_map_box(self): vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] s = pd.Series(vals) assert s.dtype == "timedelta64[ns]" - res = s.map(lambda x: "{0}_{1}".format(x.__class__.__name__, x.days)) + res = s.map(lambda x: f"{type(x).__name__}_{x.days}") exp = pd.Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) @@ -638,7 +682,7 @@ def test_map_box(self): vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = pd.Series(vals) assert s.dtype == "Period[M]" - res = s.map(lambda x: "{0}_{1}".format(x.__class__.__name__, x.freqstr)) + res = s.map(lambda x: f"{type(x).__name__}_{x.freqstr}") exp = pd.Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) @@ -736,3 +780,10 @@ def test_apply_scaler_on_date_time_index_aware_series(self): series = tm.makeTimeSeries(nper=30).tz_localize("UTC") result = pd.Series(series.index).apply(lambda x: 1) tm.assert_series_equal(result, pd.Series(np.ones(30), dtype="int64")) + + def test_map_float_to_string_precision(self): + # GH 13228 + ser = pd.Series(1 / 3) + result = ser.map(lambda val: str(val)).to_dict() + expected = {0: "0.3333333333333333"} + assert result == expected diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 68d6169fa4f34..f3ffdc373e178 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -3,10 +3,11 @@ import numpy as np import pytest +from pandas._libs.tslibs import IncompatibleFrequency + import pandas as pd from pandas import Series -from pandas.core.indexes.period import IncompatibleFrequency -import pandas.util.testing as tm +import pandas._testing as tm def _permute(obj): @@ -46,6 +47,22 @@ def test_flex_method_equivalence(self, opname, ts): expected = alt(other, series) tm.assert_almost_equal(result, expected) + def test_flex_method_subclass_metadata_preservation(self, all_arithmetic_operators): + # GH 13208 + class MySeries(Series): + _metadata = ["x"] + + @property + def _constructor(self): + return MySeries + + opname = all_arithmetic_operators + op = getattr(Series, opname) + m = MySeries([1, 2, 3], name="test") + m.x = 42 + result = op(m, 1) + assert result.x == 42 + class TestSeriesArithmetic: # Some of these may end up in tests/arithmetic, but are not yet sorted @@ -171,6 +188,14 @@ def test_ser_cmp_result_names(self, names, op): result = op(ser, tdi) assert result.name == names[2] + # interval dtype + if op in [operator.eq, operator.ne]: + # interval dtype comparisons not yet implemented + ii = pd.interval_range(start=0, periods=5, name=names[0]) + ser = Series(ii).rename(names[1]) + 
result = op(ser, ii) + assert result.name == names[2] + # categorical if op in [operator.eq, operator.ne]: # categorical dtype comparisons raise for inequalities diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index b2ecd7c4997f1..239353d3955b4 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -4,65 +4,11 @@ import pytest import pandas as pd -from pandas import DataFrame, DatetimeIndex, Series, date_range -import pandas.util.testing as tm +from pandas import DataFrame, Series +import pandas._testing as tm class TestSeriesCombine: - def test_append(self, datetime_series, string_series, object_series): - appendedSeries = string_series.append(object_series) - for idx, value in appendedSeries.items(): - if idx in string_series.index: - assert value == string_series[idx] - elif idx in object_series.index: - assert value == object_series[idx] - else: - raise AssertionError("orphaned index!") - - msg = "Indexes have overlapping values:" - with pytest.raises(ValueError, match=msg): - datetime_series.append(datetime_series, verify_integrity=True) - - def test_append_many(self, datetime_series): - pieces = [datetime_series[:5], datetime_series[5:10], datetime_series[10:]] - - result = pieces[0].append(pieces[1:]) - tm.assert_series_equal(result, datetime_series) - - def test_append_duplicates(self): - # GH 13677 - s1 = pd.Series([1, 2, 3]) - s2 = pd.Series([4, 5, 6]) - exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2]) - tm.assert_series_equal(s1.append(s2), exp) - tm.assert_series_equal(pd.concat([s1, s2]), exp) - - # the result must have RangeIndex - exp = pd.Series([1, 2, 3, 4, 5, 6]) - tm.assert_series_equal( - s1.append(s2, ignore_index=True), exp, check_index_type=True - ) - tm.assert_series_equal( - pd.concat([s1, s2], ignore_index=True), exp, check_index_type=True - ) - - msg = "Indexes have overlapping values:" - with pytest.raises(ValueError, match=msg): - s1.append(s2, verify_integrity=True) - with pytest.raises(ValueError, match=msg): - pd.concat([s1, s2], verify_integrity=True) - - def test_append_tuples(self): - # GH 28410 - s = pd.Series([1, 2, 3]) - list_input = [s, s] - tuple_input = (s, s) - - expected = s.append(list_input) - result = s.append(tuple_input) - - tm.assert_series_equal(expected, result) - def test_combine_scalar(self): # GH 21248 # Note - combine() with another Series is tested elsewhere because @@ -107,7 +53,8 @@ def test_combine_first(self): # corner case s = Series([1.0, 2, 3], index=[0, 1, 2]) - result = s.combine_first(Series([], index=[])) + empty = Series([], index=[], dtype=object) + result = s.combine_first(empty) s.index = s.index.astype("O") tm.assert_series_equal(s, result) @@ -290,10 +237,6 @@ def test_concat_empty_series_dtypes(self): ) assert result.dtype == "Sparse[float64]" - # GH 26705 - Assert .ftype is deprecated - with tm.assert_produces_warning(FutureWarning): - assert result.ftype == "float64:sparse" - result = pd.concat( [Series(dtype="float64").astype("Sparse"), Series(dtype="float64")] ) @@ -301,10 +244,6 @@ def test_concat_empty_series_dtypes(self): expected = pd.SparseDtype(np.float64) assert result.dtype == expected - # GH 26705 - Assert .ftype is deprecated - with tm.assert_produces_warning(FutureWarning): - assert result.ftype == "float64:sparse" - result = pd.concat( [Series(dtype="float64").astype("Sparse"), Series(dtype="object")] ) @@ -312,10 +251,6 @@ def test_concat_empty_series_dtypes(self): expected = 
pd.SparseDtype("object") assert result.dtype == expected - # GH 26705 - Assert .ftype is deprecated - with tm.assert_produces_warning(FutureWarning): - assert result.ftype == "object:sparse" - def test_combine_first_dt64(self): from pandas.core.tools.datetimes import to_datetime @@ -330,99 +265,3 @@ def test_combine_first_dt64(self): rs = s0.combine_first(s1) xp = Series([datetime(2010, 1, 1), "2011"]) tm.assert_series_equal(rs, xp) - - -class TestTimeseries: - def test_append_concat(self): - rng = date_range("5/8/2012 1:45", periods=10, freq="5T") - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - - result = ts.append(ts) - result_df = df.append(df) - ex_index = DatetimeIndex(np.tile(rng.values, 2)) - tm.assert_index_equal(result.index, ex_index) - tm.assert_index_equal(result_df.index, ex_index) - - appended = rng.append(rng) - tm.assert_index_equal(appended, ex_index) - - appended = rng.append([rng, rng]) - ex_index = DatetimeIndex(np.tile(rng.values, 3)) - tm.assert_index_equal(appended, ex_index) - - # different index names - rng1 = rng.copy() - rng2 = rng.copy() - rng1.name = "foo" - rng2.name = "bar" - assert rng1.append(rng1).name == "foo" - assert rng1.append(rng2).name is None - - def test_append_concat_tz(self): - # see gh-2938 - rng = date_range("5/8/2012 1:45", periods=10, freq="5T", tz="US/Eastern") - rng2 = date_range("5/8/2012 2:35", periods=10, freq="5T", tz="US/Eastern") - rng3 = date_range("5/8/2012 1:45", periods=20, freq="5T", tz="US/Eastern") - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - ts2 = Series(np.random.randn(len(rng2)), rng2) - df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) - - result = ts.append(ts2) - result_df = df.append(df2) - tm.assert_index_equal(result.index, rng3) - tm.assert_index_equal(result_df.index, rng3) - - appended = rng.append(rng2) - tm.assert_index_equal(appended, rng3) - - def test_append_concat_tz_explicit_pytz(self): - # see gh-2938 - from pytz import timezone as timezone - - rng = date_range( - "5/8/2012 1:45", periods=10, freq="5T", tz=timezone("US/Eastern") - ) - rng2 = date_range( - "5/8/2012 2:35", periods=10, freq="5T", tz=timezone("US/Eastern") - ) - rng3 = date_range( - "5/8/2012 1:45", periods=20, freq="5T", tz=timezone("US/Eastern") - ) - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - ts2 = Series(np.random.randn(len(rng2)), rng2) - df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) - - result = ts.append(ts2) - result_df = df.append(df2) - tm.assert_index_equal(result.index, rng3) - tm.assert_index_equal(result_df.index, rng3) - - appended = rng.append(rng2) - tm.assert_index_equal(appended, rng3) - - def test_append_concat_tz_dateutil(self): - # see gh-2938 - rng = date_range( - "5/8/2012 1:45", periods=10, freq="5T", tz="dateutil/US/Eastern" - ) - rng2 = date_range( - "5/8/2012 2:35", periods=10, freq="5T", tz="dateutil/US/Eastern" - ) - rng3 = date_range( - "5/8/2012 1:45", periods=20, freq="5T", tz="dateutil/US/Eastern" - ) - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - ts2 = Series(np.random.randn(len(rng2)), rng2) - df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) - - result = ts.append(ts2) - result_df = df.append(df2) - tm.assert_index_equal(result.index, rng3) - tm.assert_index_equal(result_df.index, rng3) - - appended = rng.append(rng2) - 
tm.assert_index_equal(appended, rng3) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index fb2a8dde96e2b..c38e5708be09b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -7,10 +7,9 @@ from pandas._libs import lib from pandas._libs.tslib import iNaT -from pandas.compat import PY36 from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64tz_dtype -from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( @@ -27,8 +26,8 @@ period_range, timedelta_range, ) -from pandas.core.arrays import period_array -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays import IntervalArray, period_array class TestSeriesConstructors: @@ -53,8 +52,10 @@ class TestSeriesConstructors: ], ) def test_empty_constructor(self, constructor, check_index_type): - expected = Series() - result = constructor() + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + expected = Series() + result = constructor() + assert len(result.index) == 0 tm.assert_series_equal(result, expected, check_index_type=check_index_type) @@ -66,6 +67,14 @@ def test_invalid_dtype(self): with pytest.raises(TypeError, match=msg): Series([], name="time", dtype=dtype) + def test_invalid_compound_dtype(self): + # GH#13296 + c_dtype = np.dtype([("a", "i8"), ("b", "f4")]) + cdt_arr = np.array([(1, 0.4), (256, -13)], dtype=c_dtype) + + with pytest.raises(ValueError, match="Use DataFrame instead"): + Series(cdt_arr, index=["A", "B"]) + def test_scalar_conversion(self): # Pass in scalar is disabled @@ -77,8 +86,8 @@ def test_scalar_conversion(self): assert int(Series([1.0])) == 1 def test_constructor(self, datetime_series): - empty_series = Series() - + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + empty_series = Series() assert datetime_series.index.is_all_dates # Pass in Series @@ -95,7 +104,8 @@ def test_constructor(self, datetime_series): assert mixed[1] is np.NaN assert not empty_series.index.is_all_dates - assert not Series().index.is_all_dates + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + assert not Series().index.is_all_dates # exception raised is of type Exception with pytest.raises(Exception, match="Data must be 1-dimensional"): @@ -114,8 +124,9 @@ def test_constructor(self, datetime_series): @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) def test_constructor_empty(self, input_class): - empty = Series() - empty2 = Series(input_class()) + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + empty = Series() + empty2 = Series(input_class()) # these are Index() and RangeIndex() which don't compare type equal # but are just .equals @@ -133,8 +144,9 @@ def test_constructor_empty(self, input_class): if input_class is not list: # With index: - empty = Series(index=range(10)) - empty2 = Series(input_class(), index=range(10)) + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + empty = Series(index=range(10)) + empty2 = Series(input_class(), index=range(10)) tm.assert_series_equal(empty, empty2) # With index and dtype float64: @@ -166,7 +178,8 @@ def test_constructor_dtype_only(self, dtype, index): assert len(result) == 0 def test_constructor_no_data_index_order(self): - result = pd.Series(index=["b", "a", "c"]) + with 
tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + result = pd.Series(index=["b", "a", "c"]) assert result.index.tolist() == ["b", "a", "c"] def test_constructor_no_data_string_type(self): @@ -396,22 +409,6 @@ def test_constructor_categorical_string(self): result = Series(result, dtype="category") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] - ) - def test_categorical_ordered_none_deprecated(self, none, warning): - # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) - cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) - - cat = Categorical(list("abcdaba"), dtype=cdt1) - with tm.assert_produces_warning(warning, check_stacklevel=False): - Series(cat, dtype=cdt2) - - s = Series(cat) - with tm.assert_produces_warning(warning, check_stacklevel=False): - Series(s, dtype=cdt2) - def test_categorical_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either # the series or the categorical should not change the values in the @@ -632,7 +629,8 @@ def test_constructor_limit_copies(self, index): assert s._data.blocks[0].values is not index def test_constructor_pass_none(self): - s = Series(None, index=range(5)) + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + s = Series(None, index=range(5)) assert s.dtype == np.float64 s = Series(None, index=range(5), dtype=object) @@ -640,8 +638,9 @@ def test_constructor_pass_none(self): # GH 7431 # inference on the index - s = Series(index=np.array([None])) - expected = Series(index=Index([None])) + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + s = Series(index=np.array([None])) + expected = Series(index=Index([None])) tm.assert_series_equal(s, expected) def test_constructor_pass_nan_nat(self): @@ -782,7 +781,7 @@ def test_constructor_dtype_datetime64(self): dts.astype("int64") # invalid casting - msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]" + msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to \[int32\]" with pytest.raises(TypeError, match=msg): dts.astype("int32") @@ -968,16 +967,34 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET") tm.assert_series_equal(result, expected) - def test_construction_interval(self): + @pytest.mark.parametrize("interval_constructor", [IntervalIndex, IntervalArray]) + def test_construction_interval(self, interval_constructor): # construction from interval & array of intervals - index = IntervalIndex.from_breaks(np.arange(3), closed="right") - result = Series(index) - repr(result) - str(result) - tm.assert_index_equal(Index(result.values), index) + intervals = interval_constructor.from_breaks(np.arange(3), closed="right") + result = Series(intervals) + assert result.dtype == "interval[int64]" + tm.assert_index_equal(Index(result.values), Index(intervals)) - result = Series(index.values) - tm.assert_index_equal(Index(result.values), index) + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_infer_interval(self, data_constructor): + # GH 23563: consistent closed results in interval dtype + data = [pd.Interval(0, 1), pd.Interval(0, 2), None] + result = pd.Series(data_constructor(data)) + expected = pd.Series(IntervalArray(data)) + assert 
result.dtype == "interval[float64]" + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_interval_mixed_closed(self, data_constructor): + # GH 23563: mixed closed results in object dtype (not interval dtype) + data = [pd.Interval(0, 1, closed="both"), pd.Interval(0, 2, closed="neither")] + result = Series(data_constructor(data)) + assert result.dtype == object + assert result.tolist() == data def test_construction_consistency(self): @@ -994,17 +1011,16 @@ def test_construction_consistency(self): result = Series(s.values, dtype=s.dtype) tm.assert_series_equal(result, s) - def test_constructor_infer_period(self): + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_infer_period(self, data_constructor): data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None] - result = pd.Series(data) + result = pd.Series(data_constructor(data)) expected = pd.Series(period_array(data)) tm.assert_series_equal(result, expected) assert result.dtype == "Period[D]" - data = np.asarray(data, dtype=object) - tm.assert_series_equal(result, expected) - assert result.dtype == "Period[D]" - def test_constructor_period_incompatible_frequency(self): data = [pd.Period("2000", "D"), pd.Period("2001", "A")] result = pd.Series(data) @@ -1030,21 +1046,25 @@ def test_constructor_dict(self): pidx = tm.makePeriodIndex(100) d = {pidx[0]: 0, pidx[1]: 1} result = Series(d, index=pidx) - expected = Series(np.nan, pidx) + expected = Series(np.nan, pidx, dtype=np.float64) expected.iloc[0] = 0 expected.iloc[1] = 1 tm.assert_series_equal(result, expected) + def test_constructor_dict_list_value_explicit_dtype(self): + # GH 18625 + d = {"a": [[2], [3], [4]]} + result = Series(d, index=["a"], dtype="object") + expected = Series(d, index=["a"]) + tm.assert_series_equal(result, expected) + def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value d = {"b": 1, "a": 0, "c": 2} result = Series(d) - if PY36: - expected = Series([1, 0, 2], index=list("bac")) - else: - expected = Series([0, 1, 2], index=list("abc")) + expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) @@ -1086,6 +1106,14 @@ def create_data(constructor): tm.assert_series_equal(result_datetime, expected) tm.assert_series_equal(result_Timestamp, expected) + def test_constructor_mapping(self, non_mapping_dict_subclass): + # GH 29788 + ndm = non_mapping_dict_subclass({3: "three"}) + result = Series(ndm) + expected = Series(["three"], index=[3]) + + tm.assert_series_equal(result, expected) + def test_constructor_list_of_tuples(self): data = [(1, 1), (2, 2), (2, 3)] s = Series(data) @@ -1132,7 +1160,7 @@ def test_fromDict(self): def test_fromValue(self, datetime_series): - nans = Series(np.NaN, index=datetime_series.index) + nans = Series(np.NaN, index=datetime_series.index, dtype=np.float64) assert nans.dtype == np.float_ assert len(nans) == len(datetime_series) @@ -1203,7 +1231,7 @@ def test_constructor_dtype_timedelta64(self): td.astype("int64") # invalid casting - msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" r" \[int32\]" + msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to \[int32\]" with pytest.raises(TypeError, match=msg): td.astype("int32") diff --git 
a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py new file mode 100644 index 0000000000000..885b5bf0476f2 --- /dev/null +++ b/pandas/tests/series/test_cumulative.py @@ -0,0 +1,170 @@ +""" +Tests for Series cumulative operations. + +See also +-------- +tests.frame.test_cumulative +""" +from itertools import product + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def _check_accum_op(name, series, check_dtype=True): + func = getattr(np, name) + tm.assert_numpy_array_equal( + func(series).values, func(np.array(series)), check_dtype=check_dtype, + ) + + # with missing values + ts = series.copy() + ts[::2] = np.NaN + + result = func(ts)[1::2] + expected = func(np.array(ts.dropna())) + + tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) + + +class TestSeriesCumulativeOps: + def test_cumsum(self, datetime_series): + _check_accum_op("cumsum", datetime_series) + + def test_cumprod(self, datetime_series): + _check_accum_op("cumprod", datetime_series) + + def test_cummin(self, datetime_series): + tm.assert_numpy_array_equal( + datetime_series.cummin().values, + np.minimum.accumulate(np.array(datetime_series)), + ) + ts = datetime_series.copy() + ts[::2] = np.NaN + result = ts.cummin()[1::2] + expected = np.minimum.accumulate(ts.dropna()) + + tm.assert_series_equal(result, expected) + + def test_cummax(self, datetime_series): + tm.assert_numpy_array_equal( + datetime_series.cummax().values, + np.maximum.accumulate(np.array(datetime_series)), + ) + ts = datetime_series.copy() + ts[::2] = np.NaN + result = ts.cummax()[1::2] + expected = np.maximum.accumulate(ts.dropna()) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "US/Pacific"]) + def test_cummin_datetime64(self, tz): + s = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"] + ).tz_localize(tz) + ) + + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-1"] + ).tz_localize(tz) + ) + result = s.cummin(skipna=True) + tm.assert_series_equal(expected, result) + + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "2000-1-2", "2000-1-1", "2000-1-1", "2000-1-1"] + ).tz_localize(tz) + ) + result = s.cummin(skipna=False) + tm.assert_series_equal(expected, result) + + @pytest.mark.parametrize("tz", [None, "US/Pacific"]) + def test_cummax_datetime64(self, tz): + s = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"] + ).tz_localize(tz) + ) + + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-2", "NaT", "2000-1-3"] + ).tz_localize(tz) + ) + result = s.cummax(skipna=True) + tm.assert_series_equal(expected, result) + + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-3"] + ).tz_localize(tz) + ) + result = s.cummax(skipna=False) + tm.assert_series_equal(expected, result) + + def test_cummin_timedelta64(self): + s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "1 min"]) + ) + result = s.cummin(skipna=True) + tm.assert_series_equal(expected, result) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "2 min", "1 min", "1 min", "1 min"]) + ) + result = s.cummin(skipna=False) + tm.assert_series_equal(expected, result) + + def test_cummax_timedelta64(self): + s = pd.Series(pd.to_timedelta(["NaT", 
"2 min", "NaT", "1 min", "NaT", "3 min"])) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "NaT", "2 min", "NaT", "3 min"]) + ) + result = s.cummax(skipna=True) + tm.assert_series_equal(expected, result) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "2 min", "2 min", "2 min", "3 min"]) + ) + result = s.cummax(skipna=False) + tm.assert_series_equal(expected, result) + + def test_cummethods_bool(self): + # GH#6270 + + a = pd.Series([False, False, False, True, True, False, False]) + b = ~a + c = pd.Series([False] * len(b)) + d = ~c + methods = { + "cumsum": np.cumsum, + "cumprod": np.cumprod, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + } + args = product((a, b, c, d), methods) + for s, method in args: + expected = pd.Series(methods[method](s.values)) + result = getattr(s, method)() + tm.assert_series_equal(result, expected) + + e = pd.Series([False, True, np.nan, False]) + cse = pd.Series([0, 1, np.nan, 1], dtype=object) + cpe = pd.Series([False, 0, np.nan, 0]) + cmin = pd.Series([False, False, np.nan, False]) + cmax = pd.Series([False, True, np.nan, True]) + expecteds = {"cumsum": cse, "cumprod": cpe, "cummin": cmin, "cummax": cmax} + + for method in methods: + res = getattr(e, method)() + tm.assert_series_equal(res, expecteds[method]) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 1346f2fd57f10..b8be4ea137e3d 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -24,9 +24,9 @@ period_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import PeriodArray import pandas.core.common as com -import pandas.util.testing as tm class TestSeriesDatetimeValues: @@ -208,20 +208,18 @@ def compare(s, name): # test limited display api def get_dir(s): results = [r for r in s.dt.__dir__() if not r.startswith("_")] - return list(sorted(set(results))) + return sorted(set(results)) s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") results = get_dir(s) - tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) - ) + tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) s = Series( period_range("20130101", periods=5, freq="D", name="xxx").astype(object) ) results = get_dir(s) tm.assert_almost_equal( - results, list(sorted(set(ok_for_period + ok_for_period_methods))) + results, sorted(set(ok_for_period + ok_for_period_methods)) ) # 11295 @@ -229,9 +227,7 @@ def get_dir(s): s = Series(pd.date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") s = s.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") results = get_dir(s) - tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) - ) + tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) exp_values = pd.date_range( "2015-01-01", "2016-01-01", freq="T", tz="UTC" ).tz_convert("America/Chicago") @@ -344,6 +340,39 @@ def test_dt_namespace_accessor_categorical(self): expected = Series([2017, 2017, 2018, 2018], name="foo") tm.assert_series_equal(result, expected) + def test_dt_tz_localize_categorical(self, tz_aware_fixture): + # GH 27952 + tz = tz_aware_fixture + datetimes = pd.Series( + ["2019-01-01", "2019-01-01", "2019-01-02"], dtype="datetime64[ns]" + ) + categorical = datetimes.astype("category") + result = categorical.dt.tz_localize(tz) + expected = datetimes.dt.tz_localize(tz) + tm.assert_series_equal(result, expected) + + def 
test_dt_tz_convert_categorical(self, tz_aware_fixture): + # GH 27952 + tz = tz_aware_fixture + datetimes = pd.Series( + ["2019-01-01", "2019-01-01", "2019-01-02"], dtype="datetime64[ns, MET]" + ) + categorical = datetimes.astype("category") + result = categorical.dt.tz_convert(tz) + expected = datetimes.dt.tz_convert(tz) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("accessor", ["year", "month", "day"]) + def test_dt_other_accessors_categorical(self, accessor): + # GH 27952 + datetimes = pd.Series( + ["2018-01-01", "2018-01-01", "2019-01-02"], dtype="datetime64[ns]" + ) + categorical = datetimes.astype("category") + result = getattr(categorical.dt, accessor) + expected = getattr(datetimes.dt, accessor) + tm.assert_series_equal(result, expected) + def test_dt_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 s = Series(date_range("20130101", periods=5, freq="D")) @@ -398,7 +427,6 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): ] for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() - assert s.dt.weekday_name[day] == eng_name assert s.dt.day_name(locale=time_locale)[day] == name s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.day_name(locale=time_locale).iloc[-1]) @@ -471,7 +499,7 @@ def test_strftime(self): s.iloc[0] = pd.NaT result = s.dt.strftime("%Y/%m/%d") expected = Series( - ["NaT", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + [np.nan, "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] ) tm.assert_series_equal(result, expected) @@ -521,6 +549,20 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data", + [ + DatetimeIndex(["2019-01-01", pd.NaT]), + PeriodIndex(["2019-01-01", pd.NaT], dtype="period[D]"), + ], + ) + def test_strftime_nat(self, data): + # GH 29578 + s = Series(data) + result = s.dt.strftime("%Y-%m-%d") + expected = Series(["2019-01-01", np.nan]) + tm.assert_series_equal(result, expected) + def test_valid_dt_with_missing_values(self): from datetime import date, time diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 6ee120f3bec64..a57ec2ba05d54 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -8,7 +8,7 @@ from pandas._libs.tslibs import iNaT -from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel +from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd from pandas import ( @@ -20,7 +20,7 @@ Timestamp, date_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesDtypes: @@ -44,30 +44,11 @@ def test_astype(self, dtype): assert as_typed.dtype == dtype assert as_typed.name == s.name - def test_asobject_deprecated(self): - s = Series(np.random.randn(5), name="foo") - with tm.assert_produces_warning(FutureWarning): - o = s.asobject - assert isinstance(o, np.ndarray) - def test_dtype(self, datetime_series): assert datetime_series.dtype == np.dtype("float64") assert datetime_series.dtypes == np.dtype("float64") - # GH 26705 - Assert .ftype is deprecated - with tm.assert_produces_warning(FutureWarning): - assert datetime_series.ftype == "float64:dense" - - # GH 26705 - Assert .ftypes is deprecated - with tm.assert_produces_warning(FutureWarning): - assert datetime_series.ftypes == "float64:dense" - # GH18243 - Assert .get_ftype_counts is deprecated - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal( - 
datetime_series.get_ftype_counts(), Series(1, ["float64:dense"]) - ) - @pytest.mark.parametrize("value", [np.nan, np.inf]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) def test_astype_cast_nan_inf_int(self, dtype, value): @@ -224,7 +205,11 @@ def test_astype_dict_like(self, dtype_class): # GH16717 # if dtypes provided is empty, it should error - dt5 = dtype_class({}) + if dtype_class is Series: + dt5 = dtype_class({}, dtype=object) + else: + dt5 = dtype_class({}) + with pytest.raises(KeyError, match=msg): s.astype(dt5) @@ -234,17 +219,6 @@ def test_astype_categories_raises(self): with pytest.raises(TypeError, match="got an unexpected"): s.astype("category", categories=["a", "b"], ordered=True) - @pytest.mark.parametrize( - "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] - ) - def test_astype_category_ordered_none_deprecated(self, none, warning): - # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) - cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) - s = Series(list("abcdaba"), dtype=cdt1) - with tm.assert_produces_warning(warning, check_stacklevel=False): - s.astype(cdt2) - def test_astype_from_categorical(self): items = ["a", "b", "c", "a"] s = Series(items) @@ -299,7 +273,7 @@ def test_astype_categorical_to_other(self): expected = s tm.assert_series_equal(s.astype("category"), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) - msg = r"could not convert string to float|" r"invalid literal for float\(\)" + msg = r"could not convert string to float|invalid literal for float\(\)" with pytest.raises(ValueError, match=msg): s.astype("float64") @@ -377,6 +351,15 @@ def test_astype_categorical_to_categorical( result = s.astype("category") tm.assert_series_equal(result, expected) + def test_astype_bool_missing_to_categorical(self): + # GH-19182 + s = Series([True, False, np.nan]) + assert s.dtypes == np.object_ + + result = s.astype(CategoricalDtype(categories=[True, False])) + expected = Series(Categorical([True, False, np.nan], categories=[True, False])) + tm.assert_series_equal(result, expected) + def test_astype_categoricaldtype(self): s = Series(["a", "b", "a"]) result = s.astype(CategoricalDtype(["a", "b"], ordered=True)) @@ -418,29 +401,10 @@ def test_astype_empty_constructor_equality(self, dtype): "m", # Generic timestamps raise a ValueError. Already tested. 
): init_empty = Series([], dtype=dtype) - as_type_empty = Series([]).astype(dtype) + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + as_type_empty = Series([]).astype(dtype) tm.assert_series_equal(init_empty, as_type_empty) - @pytest.mark.filterwarnings("ignore::FutureWarning") - def test_complex(self): - # see gh-4819: complex access for ndarray compat - a = np.arange(5, dtype=np.float64) - b = Series(a + 4j * a) - - tm.assert_numpy_array_equal(a, np.real(b)) - tm.assert_numpy_array_equal(4 * a, np.imag(b)) - - b.real = np.arange(5) + 5 - tm.assert_numpy_array_equal(a + 5, np.real(b)) - tm.assert_numpy_array_equal(4 * a, np.imag(b)) - - def test_real_imag_deprecated(self): - # GH 18262 - s = pd.Series([1]) - with tm.assert_produces_warning(FutureWarning): - s.imag - s.real - def test_arg_for_errors_in_astype(self): # see gh-14878 s = Series([1, 2, 3]) @@ -501,11 +465,6 @@ def test_infer_objects_series(self): assert actual.dtype == "object" tm.assert_series_equal(actual, expected) - def test_is_homogeneous_type(self): - assert Series()._is_homogeneous_type - assert Series([1, 2])._is_homogeneous_type - assert Series(pd.Categorical([1, 2]))._is_homogeneous_type - @pytest.mark.parametrize( "data", [ @@ -518,3 +477,13 @@ def test_values_compatibility(self, data): result = pd.Series(data).values expected = np.array(data.astype(object)) tm.assert_numpy_array_equal(result, expected) + + def test_reindex_astype_order_consistency(self): + # GH 17444 + s = Series([1, 2, 3], index=[2, 0, 1]) + new_index = [0, 1, 2] + temp_dtype = "category" + new_dtype = str + s1 = s.reindex(new_index).astype(temp_dtype).astype(new_dtype) + s2 = s.astype(temp_dtype).reindex(new_index).astype(new_dtype) + tm.assert_series_equal(s1, s2) diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 4a914e4fb0f2c..3513db6177951 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -2,10 +2,11 @@ import pytest from pandas import Categorical, Series -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.construction import create_series_with_explicit_dtype -def test_value_counts_nunique(): +def test_nunique(): # basics.rst doc example series = Series(np.random.randn(500)) series[20:500] = np.nan @@ -70,7 +71,7 @@ def test_unique_data_ownership(): ) def test_is_unique(data, expected): # GH11946 / GH25180 - s = Series(data) + s = create_series_with_explicit_dtype(data, dtype_if_empty=object) assert s.is_unique is expected @@ -85,76 +86,7 @@ def __ne__(self, other): with capsys.disabled(): li = [Foo(i) for i in range(5)] - s = Series(li, index=[i for i in range(5)]) + s = Series(li, index=list(range(5))) s.is_unique captured = capsys.readouterr() assert len(captured.err) == 0 - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, False, False, True, True, False])), - ("last", Series([False, True, True, False, False, False, False])), - (False, Series([False, True, True, False, True, True, False])), - ], -) -def test_drop_duplicates(any_numpy_dtype, keep, expected): - tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) - - if tc.dtype == "bool": - pytest.skip("tested separately in test_drop_duplicates_bool") - - tm.assert_series_equal(tc.duplicated(keep=keep), expected) - tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - tm.assert_series_equal(sc, 
tc[~expected]) - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, True])), - ("last", Series([True, True, False, False])), - (False, Series([True, True, True, True])), - ], -) -def test_drop_duplicates_bool(keep, expected): - tc = Series([True, False, True, False]) - - tm.assert_series_equal(tc.duplicated(keep=keep), expected) - tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True], name="name")), - ("last", Series([True, True, False, False, False], name="name")), - (False, Series([True, True, True, False, True], name="name")), - ], -) -def test_duplicated_keep(keep, expected): - s = Series(["a", "b", "b", "c", "a"], name="name") - - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), - ], -) -def test_duplicated_nan_none(keep, expected): - s = Series([np.nan, 3, 3, None, np.nan], dtype=object) - - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index 187c5d90407ce..4c817ed2e2d59 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import NaT, Series, Timestamp +import pandas._testing as tm from pandas.core.internals.blocks import IntBlock -import pandas.util.testing as tm class TestSeriesInternals: @@ -242,10 +242,3 @@ def test_hasnans_unchached_for_series(): ser.iloc[-1] = np.nan assert ser.hasnans is True assert Series.hasnans.__doc__ == pd.Index.hasnans.__doc__ - - -def test_put_deprecated(): - # GH 18262 - s = pd.Series([1]) - with tm.assert_produces_warning(FutureWarning): - s.put(0, 0) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index f954e6fb4bf98..510c11a51ca38 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -1,4 +1,3 @@ -import collections from datetime import datetime from io import StringIO @@ -7,9 +6,9 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm -from pandas.io.common import _get_handle +from pandas.io.common import get_handle class TestSeriesToCSV: @@ -25,24 +24,6 @@ def read_csv(self, path, **kwargs): return out - @pytest.mark.parametrize("arg", ["path", "header", "both"]) - def test_to_csv_deprecation(self, arg, datetime_series): - # see gh-19715 - with tm.ensure_clean() as path: - if arg == "path": - kwargs = dict(path=path, header=False) - elif arg == "header": - kwargs = dict(path_or_buf=path) - else: # Both discrepancies match. - kwargs = dict(path=path) - - with tm.assert_produces_warning(FutureWarning): - datetime_series.to_csv(**kwargs) - - # Make sure roundtrip still works. 
- ts = self.read_csv(path) - tm.assert_series_equal(datetime_series, ts, check_names=False) - def test_from_csv(self, datetime_series, string_series): with tm.ensure_clean() as path: @@ -161,7 +142,7 @@ def test_to_csv_compression(self, s, encoding, compression): tm.assert_series_equal(s, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = _get_handle( + f, _handles = get_handle( filename, "w", compression=compression, encoding=encoding ) with f: @@ -234,15 +215,6 @@ def test_pickle_preserve_name(self): unpickled = self._pickle_roundtrip_name(tm.makeTimeSeries(name=n)) assert unpickled.name == n - def test_pickle_categorical_ordered_from_sentinel(self): - # GH 27295: can remove test when _ordered_from_sentinel is removed (GH 26403) - s = Series(["a", "b", "c", "a"], dtype="category") - result = tm.round_trip_pickle(s) - result = result.astype("category") - - tm.assert_series_equal(result, s) - assert result.dtype._ordered_from_sentinel is False - def _pickle_roundtrip_name(self, obj): with tm.ensure_clean() as path: @@ -266,15 +238,3 @@ class SubclassedFrame(DataFrame): assert isinstance(result, SubclassedFrame) expected = SubclassedFrame({"X": [1, 2, 3]}) tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "mapping", (dict, collections.defaultdict(list), collections.OrderedDict) - ) - def test_to_dict(self, mapping, datetime_series): - # GH16122 - tm.assert_series_equal( - Series(datetime_series.to_dict(mapping), name="ts"), datetime_series - ) - from_method = Series(datetime_series.to_dict(collections.Counter)) - from_constructor = Series(collections.Counter(datetime_series.items())) - tm.assert_series_equal(from_method, from_constructor) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 0751e1fb8b906..7b6d9210ed3d9 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -16,30 +16,12 @@ MultiIndex, NaT, Series, + Timedelta, Timestamp, date_range, isna, ) -from pandas.core.series import remove_na -import pandas.util.testing as tm - - -def _skip_if_no_pchip(): - try: - from scipy.interpolate import pchip_interpolate # noqa - except ImportError: - import pytest - - pytest.skip("scipy.interpolate.pchip missing") - - -def _skip_if_no_akima(): - try: - from scipy.interpolate import Akima1DInterpolator # noqa - except ImportError: - import pytest - - pytest.skip("scipy.interpolate.Akima1DInterpolator missing") +import pandas._testing as tm def _simple_ts(start, end, freq="D"): @@ -48,11 +30,6 @@ def _simple_ts(start, end, freq="D"): class TestSeriesMissingData: - def test_remove_na_deprecation(self): - # see gh-16971 - with tm.assert_produces_warning(FutureWarning): - remove_na(Series([])) - def test_timedelta_fillna(self): # GH 3371 s = Series( @@ -66,8 +43,7 @@ def test_timedelta_fillna(self): td = s.diff() # reg fillna - with tm.assert_produces_warning(FutureWarning): - result = td.fillna(0) + result = td.fillna(Timedelta(seconds=0)) expected = Series( [ timedelta(0), @@ -79,8 +55,10 @@ def test_timedelta_fillna(self): tm.assert_series_equal(result, expected) # interpreted as seconds, deprecated - with tm.assert_produces_warning(FutureWarning): - result = td.fillna(1) + with pytest.raises(TypeError, match="Passing integers to fillna"): + td.fillna(1) + + result = td.fillna(Timedelta(seconds=1)) expected = Series( [ timedelta(seconds=1), @@ -128,16 +106,14 @@ def test_timedelta_fillna(self): # ffill td[2] = np.nan result = td.ffill() - with 
tm.assert_produces_warning(FutureWarning): - expected = td.fillna(0) + expected = td.fillna(Timedelta(seconds=0)) expected[0] = np.nan tm.assert_series_equal(result, expected) # bfill td[2] = np.nan result = td.bfill() - with tm.assert_produces_warning(FutureWarning): - expected = td.fillna(0) + expected = td.fillna(Timedelta(seconds=0)) expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1) tm.assert_series_equal(result, expected) @@ -299,7 +275,7 @@ def test_datetime64_tz_fillna(self): ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz ) s = pd.Series(idx) - assert s.dtype == "datetime64[ns, {0}]".format(tz) + assert s.dtype == f"datetime64[ns, {tz}]" tm.assert_series_equal(pd.isna(s), null_loc) result = s.fillna(pd.Timestamp("2011-01-02 10:00")) @@ -479,6 +455,13 @@ def test_fillna_consistency(self): s2[1] = "foo" tm.assert_series_equal(s2, expected) + def test_where_sparse(self): + # GH#17198 make sure we dont get an AttributeError for sp_index + ser = pd.Series(pd.arrays.SparseArray([1, 2])) + result = ser.where(ser >= 2, 0) + expected = pd.Series(pd.arrays.SparseArray([0, 2])) + tm.assert_series_equal(result, expected) + def test_datetime64tz_fillna_round_issue(self): # GH 14872 @@ -519,11 +502,11 @@ def test_fillna_int(self): def test_fillna_raise(self): s = Series(np.random.randint(-100, 100, 50)) - msg = '"value" parameter must be a scalar or dict, but you passed a' ' "list"' + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): s.fillna([1, 2]) - msg = '"value" parameter must be a scalar or dict, but you passed a' ' "tuple"' + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): s.fillna((1, 2)) @@ -610,11 +593,11 @@ def test_fillna_categorical_raise(self): with pytest.raises(ValueError, match="fill value must be in categories"): s.fillna({1: "d", 3: "a"}) - msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "list"' + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): s.fillna(["a", "b"]) - msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "tuple"' + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): s.fillna(("a", "b")) @@ -716,7 +699,7 @@ def test_fillna(self, datetime_series): tm.assert_series_equal(result, expected) result = s1.fillna({}) tm.assert_series_equal(result, s1) - result = s1.fillna(Series(())) + result = s1.fillna(Series((), dtype=object)) tm.assert_series_equal(result, s1) result = s2.fillna(s1) tm.assert_series_equal(result, s2) @@ -840,7 +823,8 @@ def test_timedelta64_nan(self): # tm.assert_series_equal(selector, expected) def test_dropna_empty(self): - s = Series([]) + s = Series([], dtype=object) + assert len(s.dropna()) == 0 s.dropna(inplace=True) assert len(s) == 0 @@ -1097,7 +1081,6 @@ def test_interpolate_time_raises_for_non_timeseries(self): @td.skip_if_no_scipy def test_interpolate_pchip(self): - _skip_if_no_pchip() ser = Series(np.sort(np.random.uniform(size=100))) @@ -1111,7 +1094,6 @@ def test_interpolate_pchip(self): @td.skip_if_no_scipy def test_interpolate_akima(self): - _skip_if_no_akima() ser = Series([10, 11, 12, 13]) @@ -1169,7 +1151,7 @@ def test_interpolate_corners(self, kwargs): s = Series([np.nan, np.nan]) tm.assert_series_equal(s.interpolate(**kwargs), s) - s = Series([]).interpolate() + s = Series([], 
dtype=object).interpolate() tm.assert_series_equal(s.interpolate(**kwargs), s) def test_interpolate_index_values(self): @@ -1198,8 +1180,8 @@ def test_interpolate_index_values(self): def test_interpolate_non_ts(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) msg = ( - "time-weighted interpolation only works on Series or DataFrames" - " with a DatetimeIndex" + "time-weighted interpolation only works on Series or DataFrames " + "with a DatetimeIndex" ) with pytest.raises(ValueError, match=msg): s.interpolate(method="time") @@ -1302,7 +1284,7 @@ def test_interpolate_invalid_float_limit(self, nontemporal_method): def test_interp_invalid_method(self, invalid_method): s = Series([1, 3, np.nan, 12, np.nan, 25]) - msg = "method must be one of.* Got '{}' instead".format(invalid_method) + msg = f"method must be one of.* Got '{invalid_method}' instead" with pytest.raises(ValueError, match=msg): s.interpolate(method=invalid_method) @@ -1603,12 +1585,6 @@ def test_series_interpolate_intraday(self): tm.assert_numpy_array_equal(result.values, exp.values) - def test_nonzero_warning(self): - # GH 24048 - ser = pd.Series([1, 0, 3, 4]) - with tm.assert_produces_warning(FutureWarning): - ser.nonzero() - @pytest.mark.parametrize( "ind", [ @@ -1623,7 +1599,7 @@ def test_interp_non_timedelta_index(self, interp_methods_ind, ind): method, kwargs = interp_methods_ind if method == "pchip": - _skip_if_no_pchip() + pytest.importorskip("scipy") if method == "linear": result = df[0].interpolate(**kwargs) @@ -1632,9 +1608,9 @@ def test_interp_non_timedelta_index(self, interp_methods_ind, ind): else: expected_error = ( "Index column must be numeric or datetime type when " - "using {method} method other than linear. " + f"using {method} method other than linear. " "Try setting a numeric or datetime index column before " - "interpolating.".format(method=method) + "interpolating." ) with pytest.raises(ValueError, match=expected_error): df[0].interpolate(method=method, **kwargs) @@ -1651,7 +1627,7 @@ def test_interpolate_timedelta_index(self, interp_methods_ind): method, kwargs = interp_methods_ind if method == "pchip": - _skip_if_no_pchip() + pytest.importorskip("scipy") if method in {"linear", "pchip"}: result = df[0].interpolate(method=method, **kwargs) @@ -1661,3 +1637,14 @@ def test_interpolate_timedelta_index(self, interp_methods_ind): pytest.skip( "This interpolation method is not supported for Timedelta Index yet." 
) + + @pytest.mark.parametrize( + "ascending, expected_values", + [(True, [1, 2, 3, 9, 10]), (False, [10, 9, 3, 2, 1])], + ) + def test_interpolate_unsorted_index(self, ascending, expected_values): + # GH 21037 + ts = pd.Series(data=[10, 9, np.nan, 2, 1], index=[10, 9, 3, 2, 1]) + result = ts.sort_index(ascending=ascending).interpolate(method="index") + expected = pd.Series(data=expected_values, index=expected_values, dtype=float) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 7d212ee7cd667..bdd9f92d92d3f 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna +import pandas._testing as tm from pandas.core import ops import pandas.core.nanops as nanops -import pandas.util.testing as tm class TestSeriesLogicalOps: @@ -33,7 +33,7 @@ def test_logical_operators_bool_dtype_with_empty(self): s_tft = Series([True, False, True], index=index) s_fff = Series([False, False, False], index=index) - s_empty = Series([]) + s_empty = Series([], dtype=object) res = s_tft & s_empty expected = s_fff @@ -43,6 +43,42 @@ def test_logical_operators_bool_dtype_with_empty(self): expected = s_tft tm.assert_series_equal(res, expected) + @pytest.mark.parametrize( + "left, right, op, expected", + [ + ( + [True, False, np.nan], + [True, False, True], + operator.and_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.and_, + [True, False, False], + ), + ( + [True, False, np.nan], + [True, False, True], + operator.or_, + [True, False, False], + ), + ( + [True, False, True], + [True, False, np.nan], + operator.or_, + [True, False, True], + ), + ], + ) + def test_logical_operators_nans(self, left, right, op, expected): + # GH 13896 + result = op(Series(left), Series(right)) + expected = Series(expected) + + tm.assert_series_equal(result, expected) + def test_logical_operators_int_dtype_with_int_dtype(self): # GH#9016: support bitwise op for integer types @@ -372,11 +408,13 @@ def test_logical_ops_label_based(self): # filling # vs empty - result = a & Series([]) + empty = Series([], dtype=object) + + result = a & empty.copy() expected = Series([False, False, False], list("bca")) tm.assert_series_equal(result, expected) - result = a | Series([]) + result = a | empty.copy() expected = Series([True, False, True], list("bca")) tm.assert_series_equal(result, expected) @@ -392,7 +430,7 @@ def test_logical_ops_label_based(self): # identity # we would like s[s|e] == s to hold for any e, whether empty or not for e in [ - Series([]), + empty.copy(), Series([1], ["z"]), Series(np.nan, b.index), Series(np.nan, a.index), @@ -761,12 +799,12 @@ def test_ops_datetimelike_align(self): tm.assert_series_equal(result, expected) def test_operators_corner(self, datetime_series): - empty = Series([], index=Index([])) + empty = Series([], index=Index([]), dtype=np.float64) result = datetime_series + empty assert np.isnan(result).all() - result = empty + Series([], index=Index([])) + result = empty + empty.copy() assert len(result) == 0 # TODO: this returned NotImplemented earlier, what to do? 
diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index 4aeb211170d8f..03fee389542e3 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Period, Series, period_range +import pandas._testing as tm from pandas.core.arrays import PeriodArray -import pandas.util.testing as tm class TestSeriesPeriod: diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 9f881f5a5aa29..64a8c4569406e 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -8,14 +8,14 @@ Categorical, DataFrame, Index, + MultiIndex, Series, date_range, option_context, period_range, timedelta_range, ) -from pandas.core.index import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesRepr: @@ -62,7 +62,7 @@ def test_name_printing(self): s.name = None assert "Name:" not in repr(s) - s = Series(index=date_range("20010101", "20020101"), name="test") + s = Series(index=date_range("20010101", "20020101"), name="test", dtype=object) assert "Name: test" in repr(s) def test_repr(self, datetime_series, string_series, object_series): @@ -75,7 +75,7 @@ def test_repr(self, datetime_series, string_series, object_series): str(Series(tm.randn(1000), index=np.arange(1000, 0, step=-1))) # empty - str(Series()) + str(Series(dtype=object)) # with NaNs string_series[5:7] = np.NaN @@ -227,7 +227,7 @@ class County: name = "San Sebastián" state = "PR" - def __repr__(self): + def __repr__(self) -> str: return self.name + ", " + self.state cat = pd.Categorical([County() for _ in range(61)]) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 6b82f890e974b..73247bbf8b3d6 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -1,4 +1,4 @@ -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesSubclassing: @@ -32,4 +32,6 @@ def test_subclass_unstack(self): tm.assert_frame_equal(res, exp) def test_subclass_empty_repr(self): - assert "SubclassedSeries" in repr(tm.SubclassedSeries()) + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + sub_series = tm.SubclassedSeries() + assert "SubclassedSeries" in repr(sub_series) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 7154975c6c73b..a2d14f27d7b7a 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -7,25 +7,21 @@ from pandas._libs.tslib import iNaT from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime -from pandas.errors import NullFrequencyError import pandas.util._test_decorators as td import pandas as pd from pandas import ( DataFrame, - Index, + DatetimeIndex, NaT, Series, Timestamp, concat, date_range, - offsets, timedelta_range, to_datetime, ) -from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay, BMonthEnd @@ -42,277 +38,6 @@ def assert_range_equal(left, right): class TestTimeSeries: - def test_shift(self, datetime_series): - shifted = datetime_series.shift(1) - unshifted = shifted.shift(-1) - - tm.assert_index_equal(shifted.index, datetime_series.index) - tm.assert_index_equal(unshifted.index, datetime_series.index) - tm.assert_numpy_array_equal( - 
unshifted.dropna().values, datetime_series.values[:-1] - ) - - offset = BDay() - shifted = datetime_series.shift(1, freq=offset) - unshifted = shifted.shift(-1, freq=offset) - - tm.assert_series_equal(unshifted, datetime_series) - - unshifted = datetime_series.shift(0, freq=offset) - tm.assert_series_equal(unshifted, datetime_series) - - shifted = datetime_series.shift(1, freq="B") - unshifted = shifted.shift(-1, freq="B") - - tm.assert_series_equal(unshifted, datetime_series) - - # corner case - unshifted = datetime_series.shift(0) - tm.assert_series_equal(unshifted, datetime_series) - - # Shifting with PeriodIndex - ps = tm.makePeriodSeries() - shifted = ps.shift(1) - unshifted = shifted.shift(-1) - tm.assert_index_equal(shifted.index, ps.index) - tm.assert_index_equal(unshifted.index, ps.index) - tm.assert_numpy_array_equal(unshifted.dropna().values, ps.values[:-1]) - - shifted2 = ps.shift(1, "B") - shifted3 = ps.shift(1, BDay()) - tm.assert_series_equal(shifted2, shifted3) - tm.assert_series_equal(ps, shifted2.shift(-1, "B")) - - msg = "Given freq D does not match PeriodIndex freq B" - with pytest.raises(ValueError, match=msg): - ps.shift(freq="D") - - # legacy support - shifted4 = ps.shift(1, freq="B") - tm.assert_series_equal(shifted2, shifted4) - - shifted5 = ps.shift(1, freq=BDay()) - tm.assert_series_equal(shifted5, shifted4) - - # 32-bit taking - # GH 8129 - index = date_range("2000-01-01", periods=5) - for dtype in ["int32", "int64"]: - s1 = Series(np.arange(5, dtype=dtype), index=index) - p = s1.iloc[1] - result = s1.shift(periods=p) - expected = Series([np.nan, 0, 1, 2, 3], index=index) - tm.assert_series_equal(result, expected) - - # xref 8260 - # with tz - s = Series( - date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" - ) - result = s - s.shift() - - exp = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") - tm.assert_series_equal(result, exp) - - # incompat tz - s2 = Series(date_range("2000-01-01 09:00:00", periods=5, tz="CET"), name="foo") - msg = "DatetimeArray subtraction must have the same timezones or no timezones" - with pytest.raises(TypeError, match=msg): - s - s2 - - def test_shift2(self): - ts = Series( - np.random.randn(5), index=date_range("1/1/2000", periods=5, freq="H") - ) - - result = ts.shift(1, freq="5T") - exp_index = ts.index.shift(1, freq="5T") - tm.assert_index_equal(result.index, exp_index) - - # GH #1063, multiple of same base - result = ts.shift(1, freq="4H") - exp_index = ts.index + offsets.Hour(4) - tm.assert_index_equal(result.index, exp_index) - - idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) - msg = "Cannot shift with no freq" - with pytest.raises(NullFrequencyError, match=msg): - idx.shift(1) - - def test_shift_fill_value(self): - # GH #24128 - ts = Series( - [1.0, 2.0, 3.0, 4.0, 5.0], index=date_range("1/1/2000", periods=5, freq="H") - ) - - exp = Series( - [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("1/1/2000", periods=5, freq="H") - ) - # check that fill value works - result = ts.shift(1, fill_value=0.0) - tm.assert_series_equal(result, exp) - - exp = Series( - [0.0, 0.0, 1.0, 2.0, 3.0], index=date_range("1/1/2000", periods=5, freq="H") - ) - result = ts.shift(2, fill_value=0.0) - tm.assert_series_equal(result, exp) - - ts = pd.Series([1, 2, 3]) - res = ts.shift(2, fill_value=0) - assert res.dtype == ts.dtype - - def test_categorical_shift_fill_value(self): - ts = pd.Series(["a", "b", "c", "d"], dtype="category") - res = ts.shift(1, fill_value="a") - expected = pd.Series( - 
pd.Categorical( - ["a", "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False - ) - ) - tm.assert_equal(res, expected) - - # check for incorrect fill_value - msg = "'fill_value=f' is not present in this Categorical's categories" - with pytest.raises(ValueError, match=msg): - ts.shift(1, fill_value="f") - - def test_shift_dst(self): - # GH 13926 - dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") - s = Series(dates) - - res = s.shift(0) - tm.assert_series_equal(res, s) - assert res.dtype == "datetime64[ns, US/Eastern]" - - res = s.shift(1) - exp_vals = [NaT] + dates.astype(object).values.tolist()[:9] - exp = Series(exp_vals) - tm.assert_series_equal(res, exp) - assert res.dtype == "datetime64[ns, US/Eastern]" - - res = s.shift(-2) - exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT] - exp = Series(exp_vals) - tm.assert_series_equal(res, exp) - assert res.dtype == "datetime64[ns, US/Eastern]" - - for ex in [10, -10, 20, -20]: - res = s.shift(ex) - exp = Series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") - tm.assert_series_equal(res, exp) - assert res.dtype == "datetime64[ns, US/Eastern]" - - def test_tshift(self, datetime_series): - # PeriodIndex - ps = tm.makePeriodSeries() - shifted = ps.tshift(1) - unshifted = shifted.tshift(-1) - - tm.assert_series_equal(unshifted, ps) - - shifted2 = ps.tshift(freq="B") - tm.assert_series_equal(shifted, shifted2) - - shifted3 = ps.tshift(freq=BDay()) - tm.assert_series_equal(shifted, shifted3) - - msg = "Given freq M does not match PeriodIndex freq B" - with pytest.raises(ValueError, match=msg): - ps.tshift(freq="M") - - # DatetimeIndex - shifted = datetime_series.tshift(1) - unshifted = shifted.tshift(-1) - - tm.assert_series_equal(datetime_series, unshifted) - - shifted2 = datetime_series.tshift(freq=datetime_series.index.freq) - tm.assert_series_equal(shifted, shifted2) - - inferred_ts = Series( - datetime_series.values, Index(np.asarray(datetime_series.index)), name="ts" - ) - shifted = inferred_ts.tshift(1) - unshifted = shifted.tshift(-1) - tm.assert_series_equal(shifted, datetime_series.tshift(1)) - tm.assert_series_equal(unshifted, inferred_ts) - - no_freq = datetime_series[[0, 5, 7]] - msg = "Freq was not given and was not set in the index" - with pytest.raises(ValueError, match=msg): - no_freq.tshift() - - def test_truncate(self, datetime_series): - offset = BDay() - - ts = datetime_series[::3] - - start, end = datetime_series.index[3], datetime_series.index[6] - start_missing, end_missing = datetime_series.index[2], datetime_series.index[7] - - # neither specified - truncated = ts.truncate() - tm.assert_series_equal(truncated, ts) - - # both specified - expected = ts[1:3] - - truncated = ts.truncate(start, end) - tm.assert_series_equal(truncated, expected) - - truncated = ts.truncate(start_missing, end_missing) - tm.assert_series_equal(truncated, expected) - - # start specified - expected = ts[1:] - - truncated = ts.truncate(before=start) - tm.assert_series_equal(truncated, expected) - - truncated = ts.truncate(before=start_missing) - tm.assert_series_equal(truncated, expected) - - # end specified - expected = ts[:3] - - truncated = ts.truncate(after=end) - tm.assert_series_equal(truncated, expected) - - truncated = ts.truncate(after=end_missing) - tm.assert_series_equal(truncated, expected) - - # corner case, empty series returned - truncated = ts.truncate(after=datetime_series.index[0] - offset) - assert len(truncated) == 0 - - truncated = ts.truncate(before=datetime_series.index[-1] + offset) 
- assert len(truncated) == 0 - - msg = "Truncate: 1999-12-31 00:00:00 must be after 2000-02-14 00:00:00" - with pytest.raises(ValueError, match=msg): - ts.truncate( - before=datetime_series.index[-1] + offset, - after=datetime_series.index[0] - offset, - ) - - def test_truncate_nonsortedindex(self): - # GH 17935 - - s = pd.Series(["a", "b", "c", "d", "e"], index=[5, 3, 2, 9, 0]) - msg = "truncate requires a sorted index" - - with pytest.raises(ValueError, match=msg): - s.truncate(before=3, after=9) - - rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") - ts = pd.Series(np.random.randn(len(rng)), index=rng) - msg = "truncate requires a sorted index" - - with pytest.raises(ValueError, match=msg): - ts.sort_values(ascending=False).truncate(before="2011-11", after="2011-12") - def test_asfreq(self): ts = Series( [0.0, 1.0, 2.0], @@ -346,65 +71,11 @@ def test_asfreq(self): def test_asfreq_datetimeindex_empty_series(self): # GH 14320 - expected = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"])).asfreq("H") - result = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"]), data=[3]).asfreq( - "H" - ) + index = pd.DatetimeIndex(["2016-09-29 11:00"]) + expected = Series(index=index, dtype=object).asfreq("H") + result = Series([3], index=index.copy()).asfreq("H") tm.assert_index_equal(expected.index, result.index) - def test_pct_change(self, datetime_series): - rs = datetime_series.pct_change(fill_method=None) - tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) - - rs = datetime_series.pct_change(2) - filled = datetime_series.fillna(method="pad") - tm.assert_series_equal(rs, filled / filled.shift(2) - 1) - - rs = datetime_series.pct_change(fill_method="bfill", limit=1) - filled = datetime_series.fillna(method="bfill", limit=1) - tm.assert_series_equal(rs, filled / filled.shift(1) - 1) - - rs = datetime_series.pct_change(freq="5D") - filled = datetime_series.fillna(method="pad") - tm.assert_series_equal( - rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) - ) - - def test_pct_change_shift_over_nas(self): - s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - - chg = s.pct_change() - expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) - tm.assert_series_equal(chg, expected) - - @pytest.mark.parametrize( - "freq, periods, fill_method, limit", - [ - ("5B", 5, None, None), - ("3B", 3, None, None), - ("3B", 3, "bfill", None), - ("7B", 7, "pad", 1), - ("7B", 7, "bfill", 3), - ("14B", 14, None, None), - ], - ) - def test_pct_change_periods_freq( - self, freq, periods, fill_method, limit, datetime_series - ): - # GH 7292 - rs_freq = datetime_series.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - rs_periods = datetime_series.pct_change( - periods, fill_method=fill_method, limit=limit - ) - tm.assert_series_equal(rs_freq, rs_periods) - - empty_ts = Series(index=datetime_series.index) - rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) - tm.assert_series_equal(rs_freq, rs_periods) - def test_autocorr(self, datetime_series): # Just run the function corr1 = datetime_series.autocorr() @@ -447,12 +118,12 @@ def test_first_last_valid(self, datetime_series): assert ts.last_valid_index() is None assert ts.first_valid_index() is None - ser = Series([], index=[]) + ser = Series([], index=[], dtype=object) assert ser.last_valid_index() is None assert ser.first_valid_index() is None # GH12800 - empty = Series() + empty = Series(dtype=object) assert 
empty.last_valid_index() is None assert empty.first_valid_index() is None @@ -466,7 +137,9 @@ def test_first_last_valid(self, datetime_series): assert ts.last_valid_index().freq == ts.index.freq def test_mpl_compat_hack(self, datetime_series): - result = datetime_series[:, np.newaxis] + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + # GH#30588 multi-dimensional indexing deprecated + result = datetime_series[:, np.newaxis] expected = datetime_series.values[:, np.newaxis] tm.assert_almost_equal(result, expected) @@ -594,7 +267,7 @@ def test_asfreq_keep_index_name(self): # GH #9854 index_name = "bar" index = pd.date_range("20130101", periods=20, name=index_name) - df = pd.DataFrame([x for x in range(20)], columns=["foo"], index=index) + df = pd.DataFrame(list(range(20)), columns=["foo"], index=index) assert index_name == df.index.name assert index_name == df.asfreq("10D").index.name @@ -721,6 +394,7 @@ def test_at_time(self): expected = ts[(rng.hour == 9) & (rng.minute == 30)] exp_df = df[(rng.hour == 9) & (rng.minute == 30)] + # FIXME: dont leave commented-out # expected.index = date_range('1/1/2000', '1/4/2000') tm.assert_series_equal(result, expected) @@ -829,10 +503,7 @@ def test_between_time_raises(self): def test_between_time_types(self): # GH11818 rng = date_range("1/1/2000", "1/5/2000", freq="5min") - msg = ( - r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\]" - " to a time" - ) + msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time" with pytest.raises(ValueError, match=msg): rng.indexer_between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) @@ -1030,10 +701,6 @@ def test_from_M8_structured(self): assert isinstance(s[0], Timestamp) assert s[0] == dates[0][0] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = Series.from_array(arr["Date"], Index([0])) - assert s[0] == dates[0][0] - def test_get_level_values_box(self): from pandas import MultiIndex @@ -1063,14 +730,12 @@ def test_asarray_tz_naive(self): # This shouldn't produce a warning. 
ser = pd.Series(pd.date_range("2000", periods=2)) expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") - with tm.assert_produces_warning(None): - result = np.asarray(ser) + result = np.asarray(ser) tm.assert_numpy_array_equal(result, expected) # optionally, object - with tm.assert_produces_warning(None): - result = np.asarray(ser, dtype=object) + result = np.asarray(ser, dtype=object) expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) tm.assert_numpy_array_equal(result, expected) @@ -1079,15 +744,12 @@ def test_asarray_tz_aware(self): tz = "US/Central" ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") - # We warn by default and return an ndarray[M8[ns]] - with tm.assert_produces_warning(FutureWarning): - result = np.asarray(ser) + result = np.asarray(ser, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) # Old behavior with no warning - with tm.assert_produces_warning(None): - result = np.asarray(ser, dtype="M8[ns]") + result = np.asarray(ser, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -1095,7 +757,6 @@ def test_asarray_tz_aware(self): expected = np.array( [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] ) - with tm.assert_produces_warning(None): - result = np.asarray(ser, dtype=object) + result = np.asarray(ser, dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py index c16e2864b131f..a363f927d10a9 100644 --- a/pandas/tests/series/test_timezones.py +++ b/pandas/tests/series/test_timezones.py @@ -11,8 +11,8 @@ from pandas._libs.tslibs import conversion, timezones from pandas import DatetimeIndex, Index, NaT, Series, Timestamp +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm class TestSeriesTimezones: @@ -33,21 +33,6 @@ def test_series_tz_localize(self): with pytest.raises(TypeError, match="Already tz-aware"): ts.tz_localize("US/Eastern") - @pytest.mark.filterwarnings("ignore::FutureWarning") - def test_tz_localize_errors_deprecation(self): - # GH 22644 - tz = "Europe/Warsaw" - n = 60 - rng = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") - ts = Series(rng) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - with pytest.raises(ValueError): - ts.dt.tz_localize(tz, errors="foo") - # make sure errors='coerce' gets mapped correctly to nonexistent - result = ts.dt.tz_localize(tz, errors="coerce") - expected = ts.dt.tz_localize(tz, nonexistent="NaT") - tm.assert_series_equal(result, expected) - def test_series_tz_localize_ambiguous_bool(self): # make sure that we are correctly accepting bool values as ambiguous @@ -104,7 +89,7 @@ def test_series_tz_localize_nonexistent(self, tz, method, exp): @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_series_tz_localize_empty(self, tzstr): # GH#2248 - ser = Series() + ser = Series(dtype=object) ser2 = ser.tz_localize("utc") assert ser2.index.tz == pytz.utc diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 8144a3931b9b8..ece7f1f21ab23 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -5,7 +5,8 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.arrays import SparseArray UNARY_UFUNCS = [np.positive, 
np.floor, np.exp] BINARY_UFUNCS = [np.add, np.logaddexp] # dunder op @@ -33,7 +34,7 @@ def test_unary_ufunc(ufunc, sparse): array = np.random.randint(0, 10, 10, dtype="int64") array[::2] = 0 if sparse: - array = pd.SparseArray(array, dtype=pd.SparseDtype("int64", 0)) + array = SparseArray(array, dtype=pd.SparseDtype("int64", 0)) index = list(string.ascii_letters[:10]) name = "name" @@ -51,8 +52,8 @@ def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -79,8 +80,8 @@ def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): # * ufunc(Index, Series) dispatches to Series (returns a Series) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -110,8 +111,8 @@ def test_binary_ufunc_with_series( # with alignment between the indices a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. 
series = pd.Series(a1, name=name) @@ -149,7 +150,7 @@ def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): # * ufunc(Series, scalar) == ufunc(scalar, Series) array, _ = arrays_for_binary_ufunc if sparse: - array = pd.SparseArray(array) + array = SparseArray(array) other = 2 series = pd.Series(array, name="name") @@ -183,8 +184,8 @@ def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ a2[a2 == 0] = 1 if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) s1 = pd.Series(a1) s2 = pd.Series(a2) @@ -209,7 +210,7 @@ def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): array, _ = arrays_for_binary_ufunc if sparse: - array = pd.SparseArray(array) + array = SparseArray(array) series = pd.Series(array, name="name") result = np.modf(series) @@ -251,7 +252,7 @@ def __add__(self, other): @pytest.mark.parametrize( "values", [ - pd.array([1, 3, 2]), + pd.array([1, 3, 2], dtype="int64"), pd.array([1, 10, 0], dtype="Sparse[int]"), pd.to_datetime(["2000", "2010", "2001"]), pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), @@ -282,10 +283,10 @@ def __add__(self, other): other = getattr(other, "value", other) return type(self)(self.value + other) - def __eq__(self, other): + def __eq__(self, other) -> bool: return type(other) is Thing and self.value == other.value - def __repr__(self): + def __repr__(self) -> str: return "Thing({})".format(self.value) s = pd.Series([Thing(1), Thing(2)]) @@ -299,7 +300,5 @@ def test_outer(): s = pd.Series([1, 2, 3]) o = np.array([1, 2, 3]) - with tm.assert_produces_warning(FutureWarning): - result = np.subtract.outer(s, o) - expected = np.array([[0, -1, -2], [1, 0, -1], [2, 1, 0]], dtype=np.dtype("int64")) - tm.assert_numpy_array_equal(result, expected) + with pytest.raises(NotImplementedError): + np.subtract.outer(s, o) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 9dd88fd5dd25b..2b46f86d49c5e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -10,6 +10,13 @@ from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_complex_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype as CDT import pandas as pd @@ -23,49 +30,49 @@ Timestamp, compat, ) +import pandas._testing as tm +from pandas.conftest import BYTES_DTYPES, STRING_DTYPES import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com -from pandas.core.sorting import safe_sort -import pandas.util.testing as tm class TestFactorize: def test_basic(self): - labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) + codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object)) - labels, uniques = algos.factorize( + codes, uniques = algos.factorize( ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True ) exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = 
algos.factorize(list(reversed(range(5)))) + codes, uniques = algos.factorize(list(reversed(range(5)))) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array([4, 3, 2, 1, 0], dtype=np.int64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(range(5))), sort=True) + codes, uniques = algos.factorize(list(reversed(range(5))), sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.0)))) + codes, uniques = algos.factorize(list(reversed(np.arange(5.0)))) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) + codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) @@ -73,16 +80,16 @@ def test_mixed(self): # doc example reshaping.rst x = Series(["A", "A", np.nan, "B", 3.14, np.inf]) - labels, uniques = algos.factorize(x) + codes, uniques = algos.factorize(x) exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = Index(["A", "B", 3.14, np.inf]) tm.assert_index_equal(uniques, exp) - labels, uniques = algos.factorize(x, sort=True) + codes, uniques = algos.factorize(x, sort=True) exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = Index([3.14, np.inf, "A", "B"]) tm.assert_index_equal(uniques, exp) @@ -92,16 +99,16 @@ def test_datelike(self): v1 = Timestamp("20130101 09:00:00.00004") v2 = Timestamp("20130101") x = Series([v1, v1, v1, v2, v2, v1]) - labels, uniques = algos.factorize(x) + codes, uniques = algos.factorize(x) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = DatetimeIndex([v1, v2]) tm.assert_index_equal(uniques, exp) - labels, uniques = algos.factorize(x, sort=True) + codes, uniques = algos.factorize(x, sort=True) exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = DatetimeIndex([v2, v1]) tm.assert_index_equal(uniques, exp) @@ -111,28 +118,28 @@ def test_datelike(self): x = Series([v1, v1, v1, v2, v2, v1]) # periods are not 'sorted' as they are converted back into an index - labels, uniques = algos.factorize(x) + codes, uniques = algos.factorize(x) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) - labels, uniques = algos.factorize(x, sort=True) + codes, uniques = algos.factorize(x, sort=True) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) 
tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) # GH 5986 v1 = pd.to_timedelta("1 day 1 min") v2 = pd.to_timedelta("1 day") x = Series([v1, v2, v1, v1, v2, v2, v1]) - labels, uniques = algos.factorize(x) + codes, uniques = algos.factorize(x) exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2])) - labels, uniques = algos.factorize(x, sort=True) + codes, uniques = algos.factorize(x, sort=True) exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1])) def test_factorize_nan(self): @@ -159,7 +166,7 @@ def test_factorize_nan(self): tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) @pytest.mark.parametrize( - "data,expected_label,expected_level", + "data, expected_codes, expected_uniques", [ ( [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"], @@ -174,14 +181,14 @@ def test_factorize_nan(self): ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]), ], ) - def test_factorize_tuple_list(self, data, expected_label, expected_level): + def test_factorize_tuple_list(self, data, expected_codes, expected_uniques): # GH9454 - result = pd.factorize(data) + codes, uniques = pd.factorize(data) - tm.assert_numpy_array_equal(result[0], np.array(expected_label, dtype=np.intp)) + tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp)) - expected_level_array = com.asarray_tuplesafe(expected_level, dtype=object) - tm.assert_numpy_array_equal(result[1], expected_level_array) + expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object) + tm.assert_numpy_array_equal(uniques, expected_uniques_array) def test_complex_sorting(self): # gh 12666 - check no segfault @@ -198,58 +205,58 @@ def test_complex_sorting(self): def test_float64_factorize(self, writable): data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64) data.setflags(write=writable) - exp_labels = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp) - exp_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64) + expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp) + expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_uint64_factorize(self, writable): data = np.array([2 ** 64 - 1, 1, 2 ** 64 - 1], dtype=np.uint64) data.setflags(write=writable) - exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64) + expected_codes = np.array([0, 1, 0], dtype=np.intp) + expected_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_int64_factorize(self, writable): - data = np.array([2 ** 63 - 1, -2 ** 63, 2 ** 63 - 1], dtype=np.int64) + data = np.array([2 ** 63 - 1, -(2 ** 63), 2 ** 63 - 1], dtype=np.int64) data.setflags(write=writable) - exp_labels = 
np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64) + expected_codes = np.array([0, 1, 0], dtype=np.intp) + expected_uniques = np.array([2 ** 63 - 1, -(2 ** 63)], dtype=np.int64) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_string_factorize(self, writable): data = np.array(["a", "c", "a", "b", "c"], dtype=object) data.setflags(write=writable) - exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp) - exp_uniques = np.array(["a", "c", "b"], dtype=object) + expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp) + expected_uniques = np.array(["a", "c", "b"], dtype=object) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_object_factorize(self, writable): data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object) data.setflags(write=writable) - exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) - exp_uniques = np.array(["a", "c", "b"], dtype=object) + expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) + expected_uniques = np.array(["a", "c", "b"], dtype=object) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. data = np.array([2 ** 63, 1, 2 ** 63], dtype=np.uint64) - with tm.assert_produces_warning(expected_warning=FutureWarning): + with pytest.raises(TypeError, match="got an unexpected keyword"): algos.factorize(data, order=True) with tm.assert_produces_warning(False): algos.factorize(data) @@ -258,36 +265,36 @@ def test_deprecate_order(self): "data", [ np.array([0, 1, 0], dtype="u8"), - np.array([-2 ** 63, 1, -2 ** 63], dtype="i8"), + np.array([-(2 ** 63), 1, -(2 ** 63)], dtype="i8"), np.array(["__nan__", "foo", "__nan__"], dtype="object"), ], ) def test_parametrized_factorize_na_value_default(self, data): # arrays that include the NA default for that type, but isn't used. 
- l, u = algos.factorize(data) + codes, uniques = algos.factorize(data) expected_uniques = data[[0, 1]] - expected_labels = np.array([0, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(l, expected_labels) - tm.assert_numpy_array_equal(u, expected_uniques) + expected_codes = np.array([0, 1, 0], dtype=np.intp) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) @pytest.mark.parametrize( "data, na_value", [ (np.array([0, 1, 0, 2], dtype="u8"), 0), (np.array([1, 0, 1, 2], dtype="u8"), 1), - (np.array([-2 ** 63, 1, -2 ** 63, 0], dtype="i8"), -2 ** 63), - (np.array([1, -2 ** 63, 1, 0], dtype="i8"), 1), + (np.array([-(2 ** 63), 1, -(2 ** 63), 0], dtype="i8"), -(2 ** 63)), + (np.array([1, -(2 ** 63), 1, 0], dtype="i8"), 1), (np.array(["a", "", "a", "b"], dtype=object), "a"), (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()), (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)), ], ) def test_parametrized_factorize_na_value(self, data, na_value): - l, u = algos._factorize_array(data, na_value=na_value) + codes, uniques = algos._factorize_array(data, na_value=na_value) expected_uniques = data[[1, 3]] - expected_labels = np.array([-1, 0, -1, 1], dtype=np.intp) - tm.assert_numpy_array_equal(l, expected_labels) - tm.assert_numpy_array_equal(u, expected_uniques) + expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("na_sentinel", [-1, -10, 100]) @@ -306,14 +313,14 @@ def test_parametrized_factorize_na_value(self, data, na_value): ids=["numpy_array", "extension_array"], ) def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): - labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) + codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: - expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp) - expected_uniques = safe_sort(uniques) + expected_codes = np.array([1, 0, na_sentinel, 1], dtype=np.intp) + expected_uniques = algos.safe_sort(uniques) else: - expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp) + expected_codes = np.array([0, 1, na_sentinel, 0], dtype=np.intp) expected_uniques = uniques - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) if isinstance(data, np.ndarray): tm.assert_numpy_array_equal(uniques, expected_uniques) else: @@ -353,6 +360,35 @@ def test_on_index_object(self): tm.assert_almost_equal(result, expected) + def test_dtype_preservation(self, any_numpy_dtype): + # GH 15442 + if any_numpy_dtype in (BYTES_DTYPES + STRING_DTYPES): + pytest.skip("skip string dtype") + elif is_integer_dtype(any_numpy_dtype): + data = [1, 2, 2] + uniques = [1, 2] + elif is_float_dtype(any_numpy_dtype): + data = [1, 2, 2] + uniques = [1.0, 2.0] + elif is_complex_dtype(any_numpy_dtype): + data = [complex(1, 0), complex(2, 0), complex(2, 0)] + uniques = [complex(1, 0), complex(2, 0)] + elif is_bool_dtype(any_numpy_dtype): + data = [True, True, False] + uniques = [True, False] + elif is_object_dtype(any_numpy_dtype): + data = ["A", "B", "B"] + uniques = ["A", "B"] + else: + # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere + data = [1, 2, 2] + uniques = [1, 2] + + result = Series(data, dtype=any_numpy_dtype).unique() + expected = np.array(uniques, dtype=any_numpy_dtype) + + 
tm.assert_numpy_array_equal(result, expected) + def test_datetime64_dtype_array_returned(self): # GH 9431 expected = np_array_datetime64_compat( @@ -731,7 +767,7 @@ def test_same_object_is_in(self): # with similar behavior, then we at least should # fall back to usual python's behavior: "a in [a] == True" class LikeNan: - def __eq__(self, other): + def __eq__(self, other) -> bool: return False def __hash__(self): @@ -776,7 +812,7 @@ def test_no_cast(self): result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) - @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) + @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])]) def test_empty(self, empty): # see gh-16991 vals = Index(["a", "b"]) @@ -1366,6 +1402,19 @@ class TestGroupVarFloat32(GroupVarTestMixin): class TestHashTable: + def test_string_hashtable_set_item_signature(self): + # GH#30419 fix typing in StringHashTable.set_item to prevent segfault + tbl = ht.StringHashTable() + + tbl.set_item("key", 1) + assert tbl.get_item("key") == 1 + + with pytest.raises(TypeError, match="'key' has incorrect type"): + # key arg typed as string, not object + tbl.set_item(4, 6) + with pytest.raises(TypeError, match="'val' has incorrect type"): + tbl.get_item(4) + def test_lookup_nan(self, writable): xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) # GH 21688 ensure we can deal with readonly memory views diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 65b2dab1b02a8..a8a0fcea7182c 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -8,7 +8,8 @@ import pandas as pd from pandas import Series, Timestamp -from pandas.core import common as com, ops +from pandas.core import ops +import pandas.core.common as com def test_get_callable_name(): diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index fb0511f8902f7..ee006233c4c1b 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,29 +8,18 @@ import numpy as np # noqa import pytest -from pandas.compat import PY36 - from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm def import_module(name): # we *only* want to skip if the module is truly not available # and NOT just an actual import error because of pandas changes - if PY36: - try: - return importlib.import_module(name) - except ModuleNotFoundError: # noqa - pytest.skip("skipping as {} not available".format(name)) - - else: - try: - return importlib.import_module(name) - except ImportError as e: - if "No module named" in str(e) and name in str(e): - pytest.skip("skipping as {} not available".format(name)) - raise + try: + return importlib.import_module(name) + except ModuleNotFoundError: # noqa + pytest.skip("skipping as {} not available".format(name)) @pytest.fixture @@ -66,6 +55,10 @@ def test_oo_optimizable(): @tm.network # Cython import warning @pytest.mark.filterwarnings("ignore:can't:ImportWarning") +@pytest.mark.filterwarnings( + # patsy needs to update their imports + "ignore:Using or importing the ABCs from 'collections:DeprecationWarning" +) def test_statsmodels(): statsmodels = import_module("statsmodels") # noqa @@ -113,10 +106,7 @@ def test_pandas_datareader(): # importing from pandas, Cython import warning -@pytest.mark.filterwarnings("ignore:The 'warn':DeprecationWarning") -@pytest.mark.filterwarnings("ignore:pandas.util:DeprecationWarning") @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") 
-@pytest.mark.skip(reason="gh-25778: geopandas stack issue") def test_geopandas(): geopandas = import_module("geopandas") # noqa diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index 531c511e8c02d..fa2142444ed92 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -39,22 +39,6 @@ def test_catch_oob(): pass -def test_error_rename(): - # see gh-12665 - from pandas.errors import ParserError - from pandas.io.common import CParserError - - try: - raise CParserError() - except ParserError: - pass - - try: - raise ParserError() - except CParserError: - pass - - class Foo: @classmethod def classmethod(cls): diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 1974f712b13ee..fadab5d821470 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -5,9 +5,9 @@ from numpy.random import randn import pytest +import pandas._testing as tm from pandas.core.api import DataFrame from pandas.core.computation import expressions as expr -import pandas.util.testing as tm _frame = DataFrame(randn(10000, 4), columns=list("ABCD"), dtype="float64") _frame2 = DataFrame(randn(100, 4), columns=list("ABCD"), dtype="float64") @@ -261,9 +261,9 @@ def testit(): def test_bool_ops_raise_on_arithmetic(self, op_str, opname): df = DataFrame({"a": np.random.rand(10) > 0.5, "b": np.random.rand(10) > 0.5}) - msg = "operator %r not implemented for bool dtypes" + msg = f"operator {repr(op_str)} not implemented for bool dtypes" f = getattr(operator, opname) - err_msg = re.escape(msg % op_str) + err_msg = re.escape(msg) with pytest.raises(NotImplementedError, match=err_msg): f(df, df) diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 8940a82b33777..129dc275c4d5a 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -4,7 +4,7 @@ from pandas._libs import join as _join from pandas import Categorical, DataFrame, Index, merge -import pandas.util.testing as tm +import pandas._testing as tm class TestIndexer: diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 77841f0bb9f0d..f839aa198d03f 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -4,7 +4,7 @@ from pandas._libs import lib, writers as libwriters from pandas import Index -import pandas.util.testing as tm +import pandas._testing as tm class TestMisc: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 79c9fe2b60bd9..5382ad84bcca2 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2,7 +2,6 @@ from io import StringIO import itertools from itertools import product -from warnings import catch_warnings, simplefilter import numpy as np from numpy.random import randn @@ -12,9 +11,8 @@ from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype import pandas as pd -from pandas import DataFrame, Series, Timestamp, isna -from pandas.core.index import Index, MultiIndex -import pandas.util.testing as tm +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna +import pandas._testing as tm AGG_FUNCTIONS = [ "sum", @@ -121,7 +119,8 @@ def test_append_index(self): (1.2, tz.localize(datetime.datetime(2011, 1, 2)), "B"), (1.3, tz.localize(datetime.datetime(2011, 1, 3)), "C"), ] - + expected_tuples + + expected_tuples, + dtype=object, ), None, ) @@ -209,11 +208,6 @@ def test_reindex(self): reindexed = self.frame.loc[[("foo", "one"), ("bar", "one")]] tm.assert_frame_equal(reindexed, expected) - with 
catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - reindexed = self.frame.ix[[("foo", "one"), ("bar", "one")]] - tm.assert_frame_equal(reindexed, expected) - def test_reindex_preserve_levels(self): new_index = self.ymd.index[::10] chunk = self.ymd.reindex(new_index) @@ -222,11 +216,6 @@ def test_reindex_preserve_levels(self): chunk = self.ymd.loc[new_index] assert chunk.index is new_index - with catch_warnings(record=True): - simplefilter("ignore", FutureWarning) - chunk = self.ymd.ix[new_index] - assert chunk.index is new_index - ymdT = self.ymd.T chunk = ymdT.reindex(columns=new_index) assert chunk.columns is new_index @@ -257,7 +246,7 @@ def test_repr_name_coincide(self): assert lines[2].startswith("a 0 foo") def test_delevel_infer_dtype(self): - tuples = [tuple for tuple in product(["foo", "bar"], [10, 20], [1.0, 1.1])] + tuples = list(product(["foo", "bar"], [10, 20], [1.0, 1.1])) index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index) deleveled = df.reset_index() @@ -358,6 +347,49 @@ def test_unstack(self): # test that int32 work self.ymd.astype(np.int32).unstack() + @pytest.mark.parametrize( + "result_rows,result_columns,index_product,expected_row", + [ + ( + [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]], + ["ix1", "ix2", "col1", "col2", "col3", "col4"], + 2, + [None, None, 30.0, None], + ), + ( + [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]], + ["ix1", "ix2", "col1", "col2", "col3"], + 2, + [None, None, 30.0], + ), + ( + [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]], + ["ix1", "ix2", "col1", "col2", "col3"], + None, + [None, None, 30.0], + ), + ], + ) + def test_unstack_partial( + self, result_rows, result_columns, index_product, expected_row + ): + # check for regressions on this issue: + # https://github.com/pandas-dev/pandas/issues/19351 + # make sure DataFrame.unstack() works when its run on a subset of the DataFrame + # and the Index levels contain values that are not present in the subset + result = pd.DataFrame(result_rows, columns=result_columns).set_index( + ["ix1", "ix2"] + ) + result = result.iloc[1:2].unstack("ix2") + expected = pd.DataFrame( + [expected_row], + columns=pd.MultiIndex.from_product( + [result_columns[2:], [index_product]], names=[None, "ix2"] + ), + index=pd.Index([2], name="ix1"), + ) + tm.assert_frame_equal(result, expected) + def test_unstack_multiple_no_empty_columns(self): index = MultiIndex.from_tuples( [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)] @@ -540,6 +572,17 @@ def test_stack_unstack_wrong_level_name(self, method): with pytest.raises(KeyError, match="does not match index name"): getattr(s, method)("mistake") + def test_unused_level_raises(self): + # GH 20410 + mi = MultiIndex( + levels=[["a_lot", "onlyone", "notevenone"], [1970, ""]], + codes=[[1, 0], [1, 0]], + ) + df = DataFrame(-1, index=range(3), columns=mi) + + with pytest.raises(KeyError, match="notevenone"): + df["notevenone"] + def test_unstack_level_name(self): result = self.frame.unstack("second") expected = self.frame.unstack(level=1) @@ -1316,6 +1359,30 @@ def test_mixed_depth_drop(self): ) tm.assert_frame_equal(expected, result) + def test_drop_multiindex_other_level_nan(self): + # GH 12754 + df = ( + DataFrame( + { + "A": ["one", "one", "two", "two"], + "B": [np.nan, 0.0, 1.0, 2.0], + "C": ["a", "b", "c", "c"], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.drop("c", 
level="C") + expected = DataFrame( + [2, 1], + columns=["D"], + index=pd.MultiIndex.from_tuples( + [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] + ), + ) + tm.assert_frame_equal(result, expected) + def test_drop_nonunique(self): df = DataFrame( [ @@ -1484,12 +1551,26 @@ def test_frame_dict_constructor_empty_series(self): s2 = Series( [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)]) ) - s3 = Series() + s3 = Series(dtype=object) # it works! DataFrame({"foo": s1, "bar": s2, "baz": s3}) DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2}) + @pytest.mark.parametrize("d", [4, "d"]) + def test_empty_frame_groupby_dtypes_consistency(self, d): + # GH 20888 + group_keys = ["a", "b", "c"] + df = DataFrame({"a": [1], "b": [2], "c": [3], "d": [d]}) + + g = df[df.a == 2].groupby(group_keys) + result = g.first().index + expected = MultiIndex( + levels=[[1], [2], [3]], codes=[[], [], []], names=["a", "b", "c"] + ) + + tm.assert_index_equal(result, expected) + def test_multiindex_na_repr(self): # only an issue with long columns df3 = DataFrame( @@ -1932,6 +2013,15 @@ def test_repeat(self): m_df = Series(data, index=m_idx) assert m_df.repeat(3).shape == (3 * len(data),) + def test_subsets_multiindex_dtype(self): + # GH 20757 + data = [["x", 1]] + columns = [("a", "b", np.nan), ("a", "c", 0.0)] + df = DataFrame(data, columns=pd.MultiIndex.from_tuples(columns)) + expected = df.dtypes.a.b + result = df.a.b.dtypes + tm.assert_series_equal(result, expected) + class TestSorted(Base): """ everything you wanted to test about sorting """ @@ -2220,6 +2310,14 @@ def test_sort_index_and_reconstruction_doc_example(self): tm.assert_frame_equal(result, expected) + def test_sort_index_non_existent_label_multiindex(self): + # GH 12261 + df = DataFrame(0, columns=[], index=pd.MultiIndex.from_product([[], []])) + df.loc["b", "2"] = 1 + df.loc["a", "3"] = 1 + result = df.sort_index().index.is_monotonic + assert result is True + def test_sort_index_reorder_on_ops(self): # 15687 df = DataFrame( diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 49d1777df0751..2c5d028ebe42e 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -11,9 +11,9 @@ import pandas as pd from pandas import Series, isna +import pandas._testing as tm from pandas.core.arrays import DatetimeArray import pandas.core.nanops as nanops -import pandas.util.testing as tm use_bn = nanops._USE_BOTTLENECK has_c16 = hasattr(np, "complex128") @@ -24,7 +24,7 @@ def setup_method(self, method): np.random.seed(11235) nanops._USE_BOTTLENECK = False - arr_shape = (11, 7, 5) + arr_shape = (11, 7) self.arr_float = np.random.randn(*arr_shape) self.arr_float1 = np.random.randn(*arr_shape) @@ -68,21 +68,21 @@ def setup_method(self, method): self.arr_nan_infj = self.arr_inf * 1j self.arr_complex_nan_infj = np.vstack([self.arr_complex, self.arr_nan_infj]) - self.arr_float_2d = self.arr_float[:, :, 0] - self.arr_float1_2d = self.arr_float1[:, :, 0] + self.arr_float_2d = self.arr_float + self.arr_float1_2d = self.arr_float1 - self.arr_nan_2d = self.arr_nan[:, :, 0] - self.arr_float_nan_2d = self.arr_float_nan[:, :, 0] - self.arr_float1_nan_2d = self.arr_float1_nan[:, :, 0] - self.arr_nan_float1_2d = self.arr_nan_float1[:, :, 0] + self.arr_nan_2d = self.arr_nan + self.arr_float_nan_2d = self.arr_float_nan + self.arr_float1_nan_2d = self.arr_float1_nan + self.arr_nan_float1_2d = self.arr_nan_float1 - self.arr_float_1d = self.arr_float[:, 0, 0] - self.arr_float1_1d = self.arr_float1[:, 0, 0] 
+ self.arr_float_1d = self.arr_float[:, 0] + self.arr_float1_1d = self.arr_float1[:, 0] - self.arr_nan_1d = self.arr_nan[:, 0, 0] - self.arr_float_nan_1d = self.arr_float_nan[:, 0, 0] - self.arr_float1_nan_1d = self.arr_float1_nan[:, 0, 0] - self.arr_nan_float1_1d = self.arr_nan_float1[:, 0, 0] + self.arr_nan_1d = self.arr_nan[:, 0] + self.arr_float_nan_1d = self.arr_float_nan[:, 0] + self.arr_float1_nan_1d = self.arr_float1_nan[:, 0] + self.arr_nan_float1_1d = self.arr_nan_float1[:, 0] def teardown_method(self, method): nanops._USE_BOTTLENECK = use_bn @@ -151,7 +151,7 @@ def check_fun_data( targarval, check_dtype=True, empty_targfunc=None, - **kwargs + **kwargs, ): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: @@ -186,7 +186,7 @@ def check_fun_data( targarval2, check_dtype=check_dtype, empty_targfunc=empty_targfunc, - **kwargs + **kwargs, ) def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): @@ -203,7 +203,7 @@ def check_fun(self, testfunc, targfunc, testar, empty_targfunc=None, **kwargs): testarval, targarval, empty_targfunc=empty_targfunc, - **kwargs + **kwargs, ) def check_funs( @@ -215,7 +215,7 @@ def check_funs( allow_date=True, allow_tdelta=True, allow_obj=True, - **kwargs + **kwargs, ): self.check_fun(testfunc, targfunc, "arr_float", **kwargs) self.check_fun(testfunc, targfunc, "arr_float_nan", **kwargs) @@ -302,7 +302,7 @@ def test_nanmean_overflow(self): # In the previous implementation mean can overflow for int dtypes, it # is now consistent with numpy - for a in [2 ** 55, -2 ** 55, 20150515061816532]: + for a in [2 ** 55, -(2 ** 55), 20150515061816532]: s = Series(a, index=range(500), dtype=np.int64) result = s.mean() np_result = s.values.mean() @@ -476,7 +476,7 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_2d, self.arr_float1_2d, min_periods=len(self.arr_float_2d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) @@ -486,7 +486,7 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_2d, self.arr_float1_nan_2d, min_periods=len(self.arr_float_2d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -500,13 +500,13 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_2d, self.arr_nan_float1_2d, min_periods=len(self.arr_float_2d) - 1, - **kwargs + **kwargs, ) res25 = checkfun( self.arr_float_2d, self.arr_float1_2d, min_periods=len(self.arr_float_2d) + 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) @@ -521,7 +521,7 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_1d, self.arr_float1_1d, min_periods=len(self.arr_float_1d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) @@ -531,7 +531,7 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_1d, self.arr_float1_nan_1d, min_periods=len(self.arr_float_1d) - 1, - **kwargs + **kwargs, ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -545,13 +545,13 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): self.arr_float_nan_1d, self.arr_nan_float1_1d, min_periods=len(self.arr_float_1d) - 1, - **kwargs + **kwargs, ) res25 = checkfun( self.arr_float_1d, self.arr_float1_1d, min_periods=len(self.arr_float_1d) + 1, - **kwargs + **kwargs, ) 
tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) @@ -598,6 +598,14 @@ def test_nancorr_spearman(self): targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman") + @td.skip_if_no_scipy + def test_invalid_method(self): + targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + msg = "Unkown method 'foo', expected one of 'kendall', 'spearman'" + with pytest.raises(ValueError, match=msg): + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="foo") + def test_nancov(self): targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1] targ1 = np.cov(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] @@ -704,46 +712,6 @@ def test__has_infs(self): self.check_bool(nanops._has_infs, val.astype("f4"), correct) self.check_bool(nanops._has_infs, val.astype("f2"), correct) - def test__isfinite(self): - pairs = [ - ("arr_complex", False), - ("arr_int", False), - ("arr_bool", False), - ("arr_str", False), - ("arr_utf", False), - ("arr_complex", False), - ("arr_complex_nan", True), - ("arr_nan_nanj", True), - ("arr_nan_infj", True), - ("arr_complex_nan_infj", True), - ] - pairs_float = [ - ("arr_float", False), - ("arr_nan", True), - ("arr_float_nan", True), - ("arr_nan_nan", True), - ("arr_float_inf", True), - ("arr_inf", True), - ("arr_nan_inf", True), - ("arr_float_nan_inf", True), - ("arr_nan_nan_inf", True), - ] - - func1 = lambda x: np.any(nanops._isfinite(x).ravel()) - - # TODO: unused? - # func2 = lambda x: np.any(nanops._isfinite(x).values.ravel()) - - for arr, correct in pairs: - val = getattr(self, arr) - self.check_bool(func1, val, correct) - - for arr, correct in pairs_float: - val = getattr(self, arr) - self.check_bool(func1, val, correct) - self.check_bool(func1, val.astype("f4"), correct) - self.check_bool(func1, val.astype("f2"), correct) - def test__bn_ok_dtype(self): assert nanops._bn_ok_dtype(self.arr_float.dtype, "test") assert nanops._bn_ok_dtype(self.arr_complex.dtype, "test") diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index cd154ed5fe570..ce527214e55e7 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -5,7 +5,7 @@ from pandas.compat._optional import VERSIONS, import_optional_dependency -import pandas.util.testing as tm +import pandas._testing as tm def test_import_optional(): diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py index 97086f8ab1e85..08a5581886522 100644 --- a/pandas/tests/test_register_accessor.py +++ b/pandas/tests/test_register_accessor.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @contextlib.contextmanager @@ -45,7 +45,8 @@ def test_register(obj, registrar): with ensure_removed(obj, "mine"): before = set(dir(obj)) registrar("mine")(MyAccessor) - assert obj([]).mine.prop == "item" + o = obj([]) if obj is not pd.Series else obj([], dtype=object) + assert o.mine.prop == "item" after = set(dir(obj)) assert (before ^ after) == {"mine"} assert "mine" in obj._accessors @@ -88,4 +89,4 @@ def __init__(self, data): raise AttributeError("whoops") with pytest.raises(AttributeError, match="whoops"): - pd.Series([]).bad + pd.Series([], dtype=object).bad diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 
32aeb7b186827..98297474243e4 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -6,16 +6,16 @@ import pytest from pandas import DataFrame, MultiIndex, Series, array, concat, merge -from pandas.core import common as com +import pandas._testing as tm +from pandas.core.algorithms import safe_sort +import pandas.core.common as com from pandas.core.sorting import ( decons_group_index, get_group_index, is_int64_overflow_possible, lexsort_indexer, nargsort, - safe_sort, ) -import pandas.util.testing as tm class TestSorting: @@ -314,27 +314,27 @@ def verify_order(df): def test_decons(): - def testit(label_list, shape): - group_index = get_group_index(label_list, shape, sort=True, xnull=True) - label_list2 = decons_group_index(group_index, shape) + def testit(codes_list, shape): + group_index = get_group_index(codes_list, shape, sort=True, xnull=True) + codes_list2 = decons_group_index(group_index, shape) - for a, b in zip(label_list, label_list2): + for a, b in zip(codes_list, codes_list2): tm.assert_numpy_array_equal(a, b) shape = (4, 5, 6) - label_list = [ + codes_list = [ np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), ] - testit(label_list, shape) + testit(codes_list, shape) shape = (10000, 10000) - label_list = [ + codes_list = [ np.tile(np.arange(10000, dtype=np.int64), 5), np.tile(np.arange(10000, dtype=np.int64), 5), ] - testit(label_list, shape) + testit(codes_list, shape) class TestSafeSort: @@ -355,42 +355,42 @@ def test_basic_sort(self): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("verify", [True, False]) - def test_labels(self, verify): + def test_codes(self, verify): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) - labels = [0, 1, 1, 2, 3, 0, -1, 4] - result, result_labels = safe_sort(values, labels, verify=verify) - expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) + codes = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) # na_sentinel - labels = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=99, verify=verify) - expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) + codes = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=99, verify=verify) + expected_codes = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) - labels = [] - result, result_labels = safe_sort(values, labels, verify=verify) - expected_labels = np.array([], dtype=np.intp) + codes = [] + result, result_codes = safe_sort(values, codes, verify=verify) + expected_codes = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) @pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_labels_out_of_bound(self, na_sentinel): + def test_codes_out_of_bound(self, na_sentinel): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) # 
out of bound indices - labels = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=na_sentinel) - expected_labels = np.array( + codes = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_codes = safe_sort(values, codes, na_sentinel=na_sentinel) + expected_codes = np.array( [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp ) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) def test_mixed_integer(self): values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) @@ -399,12 +399,12 @@ def test_mixed_integer(self): tm.assert_numpy_array_equal(result, expected) values = np.array(["b", 1, 0, "a"], dtype=object) - labels = [0, 1, 2, 3, 0, -1, 1] - result, result_labels = safe_sort(values, labels) + codes = [0, 1, 2, 3, 0, -1, 1] + result, result_codes = safe_sort(values, codes) expected = np.array([0, 1, "a", "b"], dtype=object) - expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) + expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - tm.assert_numpy_array_equal(result_labels, expected_labels) + tm.assert_numpy_array_equal(result_codes, expected_codes) def test_mixed_integer_from_list(self): values = ["b", 1, 0, "a", 0, "b"] @@ -428,10 +428,10 @@ def test_exceptions(self): safe_sort(values=1) with pytest.raises(TypeError, match="Only list-like objects or None"): - safe_sort(values=[0, 1, 2], labels=1) + safe_sort(values=[0, 1, 2], codes=1) with pytest.raises(ValueError, match="values should be unique"): - safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) + safe_sort(values=[0, 1, 2, 1], codes=[0, 1]) def test_extension_array(self): # a = array([1, 3, np.nan, 2], dtype='Int64') @@ -443,12 +443,12 @@ def test_extension_array(self): @pytest.mark.parametrize("verify", [True, False]) @pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_extension_array_labels(self, verify, na_sentinel): + def test_extension_array_codes(self, verify, na_sentinel): a = array([1, 3, 2], dtype="Int64") - result, labels = safe_sort( + result, codes = safe_sort( a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify ) expected_values = array([1, 2, 3], dtype="Int64") - expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp) + expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f5d28ec82d1d4..7f3375070d7d9 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -8,8 +8,8 @@ from pandas._libs import lib from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna +import pandas._testing as tm import pandas.core.strings as strings -import pandas.util.testing as tm def assert_series_or_index_equal(left, right): @@ -202,9 +202,9 @@ def test_api_mi_raises(self): assert not hasattr(mi, "str") @pytest.mark.parametrize("dtype", [object, "category"]) - @pytest.mark.parametrize("box", [Series, Index]) - def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): + def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype): # one instance of parametrized fixture + box = index_or_series inferred_dtype, values = any_skipna_inferred_dtype t = 
box(values, dtype=dtype) # explicit dtype to avoid casting @@ -236,13 +236,17 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): assert not hasattr(t, "str") @pytest.mark.parametrize("dtype", [object, "category"]) - @pytest.mark.parametrize("box", [Series, Index]) def test_api_per_method( - self, box, dtype, any_allowed_skipna_inferred_dtype, any_string_method + self, + index_or_series, + dtype, + any_allowed_skipna_inferred_dtype, + any_string_method, ): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, # and raise on all others + box = index_or_series # one instance of each parametrized fixture inferred_dtype, values = any_allowed_skipna_inferred_dtype @@ -292,10 +296,8 @@ def test_api_per_method( else: # GH 23011, GH 23163 msg = ( - "Cannot use .str.{name} with values of inferred dtype " - "{inferred_dtype!r}.".format( - name=method_name, inferred_dtype=inferred_dtype - ) + f"Cannot use .str.{method_name} with values of " + f"inferred dtype {repr(inferred_dtype)}." ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) @@ -325,17 +327,18 @@ def test_iter(self): strs = "google", "wikimedia", "wikipedia", "wikitravel" ds = Series(strs) - for s in ds.str: - # iter must yield a Series - assert isinstance(s, Series) + with tm.assert_produces_warning(FutureWarning): + for s in ds.str: + # iter must yield a Series + assert isinstance(s, Series) - # indices of each yielded Series should be equal to the index of - # the original Series - tm.assert_index_equal(s.index, ds.index) + # indices of each yielded Series should be equal to the index of + # the original Series + tm.assert_index_equal(s.index, ds.index) - for el in s: - # each element of the series is either a basestring/str or nan - assert isinstance(el, str) or isna(el) + for el in s: + # each element of the series is either a basestring/str or nan + assert isinstance(el, str) or isna(el) # desired behavior is to iterate until everything would be nan on the # next iter so make sure the last element of the iterator was 'l' in @@ -347,8 +350,9 @@ def test_iter_empty(self): i, s = 100, 1 - for i, s in enumerate(ds.str): - pass + with tm.assert_produces_warning(FutureWarning): + for i, s in enumerate(ds.str): + pass # nothing to iterate over so nothing defined values should remain # unchanged @@ -358,8 +362,9 @@ def test_iter_empty(self): def test_iter_single_element(self): ds = Series(["a"]) - for i, s in enumerate(ds.str): - pass + with tm.assert_produces_warning(FutureWarning): + for i, s in enumerate(ds.str): + pass assert not i tm.assert_series_equal(ds, s) @@ -369,16 +374,17 @@ def test_iter_object_try_string(self): i, s = 100, "h" - for i, s in enumerate(ds.str): - pass + with tm.assert_produces_warning(FutureWarning): + for i, s in enumerate(ds.str): + pass assert i == 100 assert s == "h" - @pytest.mark.parametrize("box", [Series, Index]) @pytest.mark.parametrize("other", [None, Series, Index]) - def test_str_cat_name(self, box, other): + def test_str_cat_name(self, index_or_series, other): # GH 21053 + box = index_or_series values = ["a", "b"] if other: other = other(values) @@ -387,8 +393,8 @@ def test_str_cat_name(self, box, other): result = box(values, name="name").str.cat(other, sep=",") assert result.name == "name" - @pytest.mark.parametrize("box", [Series, Index]) - def test_str_cat(self, box): + def test_str_cat(self, index_or_series): + box = index_or_series # test_cat above tests "str_cat" from ndarray; # here 
testing "str.cat" from Series/Indext to ndarray/list s = box(["a", "a", "b", "b", "c", np.nan]) @@ -427,9 +433,9 @@ def test_str_cat(self, box): with pytest.raises(ValueError, match=rgx): s.str.cat(list(z)) - @pytest.mark.parametrize("box", [Series, Index]) - def test_str_cat_raises_intuitive_error(self, box): + def test_str_cat_raises_intuitive_error(self, index_or_series): # GH 11334 + box = index_or_series s = box(["a", "b", "c", "d"]) message = "Did you mean to supply a `sep` keyword?" with pytest.raises(ValueError, match=message): @@ -440,8 +446,11 @@ def test_str_cat_raises_intuitive_error(self, box): @pytest.mark.parametrize("sep", ["", None]) @pytest.mark.parametrize("dtype_target", ["object", "category"]) @pytest.mark.parametrize("dtype_caller", ["object", "category"]) - @pytest.mark.parametrize("box", [Series, Index]) - def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep): + def test_str_cat_categorical( + self, index_or_series, dtype_caller, dtype_target, sep + ): + box = index_or_series + s = Index(["a", "a", "b", "a"], dtype=dtype_caller) s = s if box == Index else Series(s, index=s) t = Index(["b", "a", "b", "c"], dtype=dtype_target) @@ -494,8 +503,8 @@ def test_str_cat_wrong_dtype_raises(self, box, data): # need to use outer and na_rep, as otherwise Index would not raise s.str.cat(t, join="outer", na_rep="-") - @pytest.mark.parametrize("box", [Series, Index]) - def test_str_cat_mixed_inputs(self, box): + def test_str_cat_mixed_inputs(self, index_or_series): + box = index_or_series s = Index(["a", "b", "c", "d"]) s = s if box == Index else Series(s, index=s) @@ -596,9 +605,10 @@ def test_str_cat_mixed_inputs(self, box): s.str.cat(iter([t.values, list(s)])) @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) - @pytest.mark.parametrize("box", [Series, Index]) - def test_str_cat_align_indexed(self, box, join): + def test_str_cat_align_indexed(self, index_or_series, join): # https://github.com/pandas-dev/pandas/issues/18657 + box = index_or_series + s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"]) t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"]) sa, ta = s.align(t, join=join) @@ -656,10 +666,14 @@ def test_str_cat_align_mixed_inputs(self, join): with pytest.raises(ValueError, match=rgx): s.str.cat([t, z], join=join) - @pytest.mark.parametrize("box", [Series, Index]) - @pytest.mark.parametrize("other", [Series, Index]) - def test_str_cat_all_na(self, box, other): + index_or_series2 = [Series, Index] # type: ignore + # List item 0 has incompatible type "Type[Series]"; expected "Type[PandasObject]" + # See GH#29725 + + @pytest.mark.parametrize("other", index_or_series2) + def test_str_cat_all_na(self, index_or_series, other): # GH 24044 + box = index_or_series # check that all NaNs in caller / target work s = Index(["a", "b", "c", "d"]) @@ -731,7 +745,10 @@ def test_count(self): tm.assert_series_equal(result, exp) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_count(mixed, "a") xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) tm.assert_numpy_array_equal(rs, xp) @@ -755,14 +772,14 @@ def test_contains(self): expected = np.array([False, np.nan, False, False, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) - values = ["foo", "xyz", "fooommm__foo", "mmm_"] + values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], 
dtype=object) result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) # case insensitive using regex - values = ["Foo", "xYz", "fOOomMm__fOo", "MMM_"] + values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object) result = strings.str_contains(values, "FOO|mmm", case=False) expected = np.array([True, False, True, True]) tm.assert_numpy_array_equal(result, expected) @@ -773,7 +790,10 @@ def test_contains(self): tm.assert_numpy_array_equal(result, expected) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_contains(mixed, "o") xp = np.array( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], @@ -869,7 +889,10 @@ def test_endswith(self): tm.assert_series_equal(result, exp.fillna(False).astype(bool)) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_endswith(mixed, "f") xp = np.array( [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan], @@ -1802,7 +1825,7 @@ def test_extractall_same_as_extract_subject_index(self): def test_empty_str_methods(self): empty_str = empty = Series(dtype=object) - empty_int = Series(dtype=int) + empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) empty_bytes = Series(dtype=object) @@ -1853,15 +1876,16 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.get(0)) tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) - tm.assert_series_equal(empty_str, empty.str.isalnum()) - tm.assert_series_equal(empty_str, empty.str.isalpha()) - tm.assert_series_equal(empty_str, empty.str.isdigit()) - tm.assert_series_equal(empty_str, empty.str.isspace()) - tm.assert_series_equal(empty_str, empty.str.islower()) - tm.assert_series_equal(empty_str, empty.str.isupper()) - tm.assert_series_equal(empty_str, empty.str.istitle()) - tm.assert_series_equal(empty_str, empty.str.isnumeric()) - tm.assert_series_equal(empty_str, empty.str.isdecimal()) + # ismethods should always return boolean (GH 29624) + tm.assert_series_equal(empty_bool, empty.str.isalnum()) + tm.assert_series_equal(empty_bool, empty.str.isalpha()) + tm.assert_series_equal(empty_bool, empty.str.isdigit()) + tm.assert_series_equal(empty_bool, empty.str.isspace()) + tm.assert_series_equal(empty_bool, empty.str.islower()) + tm.assert_series_equal(empty_bool, empty.str.isupper()) + tm.assert_series_equal(empty_bool, empty.str.istitle()) + tm.assert_series_equal(empty_bool, empty.str.isnumeric()) + tm.assert_series_equal(empty_bool, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) @@ -2833,7 +2857,8 @@ def test_partition_index(self): result = values.str.partition("_", expand=False) exp = Index( np.array( - [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None] + [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None], + dtype=object, ) ) tm.assert_index_equal(result, exp) @@ -2842,7 +2867,8 @@ def test_partition_index(self): result = values.str.rpartition("_", expand=False) 
exp = Index( np.array( - [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None] + [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None], + dtype=object, ) ) tm.assert_index_equal(result, exp) @@ -2946,23 +2972,17 @@ def test_partition_with_name(self): assert res.nlevels == 1 tm.assert_index_equal(res, exp) - def test_partition_deprecation(self): + def test_partition_sep_kwarg(self): # GH 22676; depr kwarg "pat" in favor of "sep" values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - # str.partition - # using sep -> no warning expected = values.str.partition(sep="_") - with tm.assert_produces_warning(FutureWarning): - result = values.str.partition(pat="_") - tm.assert_frame_equal(result, expected) + result = values.str.partition("_") + tm.assert_frame_equal(result, expected) - # str.rpartition - # using sep -> no warning expected = values.str.rpartition(sep="_") - with tm.assert_produces_warning(FutureWarning): - result = values.str.rpartition(pat="_") - tm.assert_frame_equal(result, expected) + result = values.str.rpartition("_") + tm.assert_frame_equal(result, expected) def test_pipe_failures(self): # #2119 @@ -3488,10 +3508,13 @@ def test_casefold(self): def test_string_array(any_string_method): + method_name, args, kwargs = any_string_method + if method_name == "decode": + pytest.skip("decode requires bytes.") + data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) b = Series(data, dtype="string") - method_name, args, kwargs = any_string_method expected = getattr(a.str, method_name)(*args, **kwargs) result = getattr(b.str, method_name)(*args, **kwargs) @@ -3502,8 +3525,51 @@ def test_string_array(any_string_method): ): assert result.dtype == "string" result = result.astype(object) + + elif expected.dtype == "object" and lib.is_bool_array( + expected.values, skipna=True + ): + assert result.dtype == "boolean" + result = result.astype(object) + + elif expected.dtype == "float" and expected.isna().any(): + assert result.dtype == "Int64" + result = result.astype("float") + elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == "string") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "method,expected", + [ + ("count", [2, None]), + ("find", [0, None]), + ("index", [0, None]), + ("rindex", [2, None]), + ], +) +def test_string_array_numeric_integer_array(method, expected): + s = Series(["aba", None], dtype="string") + result = getattr(s.str, method)("a") + expected = Series(expected, dtype="Int64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "method,expected", + [ + ("isdigit", [False, None, True]), + ("isalpha", [True, None, False]), + ("isalnum", [True, None, True]), + ("isdigit", [False, None, True]), + ], +) +def test_string_array_boolean_array(method, expected): + s = Series(["a", None, "1"], dtype="string") + result = getattr(s.str, method)() + expected = Series(expected, dtype="boolean") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index d2a9e1dc94bb5..465296a6f9e51 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -6,8 +6,8 @@ from pandas._libs.tslib import iNaT +import pandas._testing as tm import pandas.core.algorithms as algos -import pandas.util.testing as tm @pytest.fixture(params=[True, False]) diff --git a/pandas/tests/tools/test_numeric.py 
b/pandas/tests/tools/test_numeric.py index 55f83e492e2cc..2fd39d5a7b703 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_numeric.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, to_numeric -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[None, "ignore", "raise", "coerce"]) @@ -567,6 +567,24 @@ def test_downcast_limits(dtype, downcast, min_max): assert series.dtype == dtype +@pytest.mark.parametrize( + "ser,expected", + [ + ( + pd.Series([0, 9223372036854775808]), + pd.Series([0, 9223372036854775808], dtype=np.uint64), + ) + ], +) +def test_downcast_uint64(ser, expected): + # see gh-14422: + # BUG: to_numeric doesn't work uint64 numbers + + result = pd.to_numeric(ser, downcast="unsigned") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "data,exp_data", [ diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 50844aabb2c88..c4660417599a8 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -8,8 +8,8 @@ from pandas.compat import is_platform_windows from pandas import DatetimeIndex, Index, Series, Timestamp, date_range, period_range +import pandas._testing as tm from pandas.core.tools.datetimes import to_datetime -import pandas.util.testing as tm import pandas.tseries.frequencies as frequencies import pandas.tseries.offsets as offsets @@ -468,7 +468,7 @@ def test_series_datetime_index(freq): @pytest.mark.parametrize( "offset_func", [ - frequencies.get_offset, + frequencies._get_offset, lambda freq: date_range("2011-01-01", periods=5, freq=freq), ], ) @@ -528,8 +528,8 @@ def test_legacy_offset_warnings(offset_func, freq): def test_ms_vs_capital_ms(): - left = frequencies.get_offset("ms") - right = frequencies.get_offset("MS") + left = frequencies._get_offset("ms") + right = frequencies._get_offset("MS") assert left == offsets.Milli() assert right == offsets.MonthBegin() diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index c122f92ed228c..5b4a7c74b1af1 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -3,7 +3,7 @@ import pytest from pandas import DatetimeIndex, offsets, to_datetime -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.holiday import ( AbstractHolidayCalendar, diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 06869fcd7a4f8..a2c146dbd65e8 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -3,7 +3,7 @@ import pytest from pytz import utc -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.holiday import ( MO, @@ -238,7 +238,7 @@ class TestCalendar(AbstractHolidayCalendar): rules = [] calendar = get_calendar("TestCalendar") - assert TestCalendar == calendar.__class__ + assert TestCalendar == type(calendar) def test_factory(): diff --git a/pandas/tests/tseries/offsets/common.py b/pandas/tests/tseries/offsets/common.py index fbf4454109ec0..71953fd095882 100644 --- a/pandas/tests/tseries/offsets/common.py +++ b/pandas/tests/tseries/offsets/common.py @@ -13,18 +13,14 @@ def assert_offset_equal(offset, base, expected): assert actual_apply == expected except AssertionError: raise AssertionError( - "\nExpected: 
{expected}\nActual: {actual}\nFor Offset: {offset})" - "\nAt Date: {base}".format( - expected=expected, actual=actual, offset=offset, base=base - ) + f"\nExpected: {expected}\nActual: {actual}\nFor Offset: {offset})" + f"\nAt Date: {base}" ) -def assert_onOffset(offset, date, expected): - actual = offset.onOffset(date) +def assert_is_on_offset(offset, date, expected): + actual = offset.is_on_offset(date) assert actual == expected, ( - "\nExpected: {expected}\nActual: {actual}\nFor Offset: {offset})" - "\nAt Date: {date}".format( - expected=expected, actual=actual, offset=offset, date=date - ) + f"\nExpected: {expected}\nActual: {actual}\nFor Offset: {offset})" + f"\nAt Date: {date}" ) diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index 8b1aaafb94e0b..5686119593e18 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -9,11 +9,12 @@ from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG from pandas import Timestamp +import pandas._testing as tm from pandas.tseries.frequencies import get_offset from pandas.tseries.offsets import FY5253, FY5253Quarter -from .common import assert_offset_equal, assert_onOffset +from .common import assert_is_on_offset, assert_offset_equal from .test_offsets import Base, WeekDay @@ -50,9 +51,11 @@ def test_get_offset_name(): def test_get_offset(): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset("gibberish") + with tm.assert_produces_warning(FutureWarning): + get_offset("gibberish") with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset("QS-JAN-B") + with tm.assert_produces_warning(FutureWarning): + get_offset("QS-JAN-B") pairs = [ ("RE-N-DEC-MON", makeFY5253NearestEndMonth(weekday=0, startingMonth=12)), @@ -78,11 +81,11 @@ def test_get_offset(): ] for name, expected in pairs: - offset = get_offset(name) - assert ( - offset == expected - ), "Expected {name!r} to yield {expected!r} (actual: {offset!r})".format( - name=name, expected=expected, offset=offset + with tm.assert_produces_warning(FutureWarning): + offset = get_offset(name) + assert offset == expected, ( + f"Expected {repr(name)} to yield {repr(expected)} " + f"(actual: {repr(offset)})" ) @@ -129,9 +132,9 @@ class TestFY5253LastOfMonth(Base): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) def test_apply(self): offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8, weekday=WeekDay.SAT) @@ -254,9 +257,9 @@ def test_get_year_end(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) def test_apply(self): date_seq_nem_8_sat = [ @@ -330,16 +333,16 @@ def test_apply(self): class TestFY5253LastOfMonthQuarter(Base): - def test_isAnchored(self): + def test_is_anchored(self): assert makeFY5253LastOfMonthQuarter( startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).isAnchored() + ).is_anchored() assert makeFY5253LastOfMonthQuarter( weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 - ).isAnchored() + ).is_anchored() assert not makeFY5253LastOfMonthQuarter( 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).isAnchored() + ).is_anchored() def 
test_equality(self): assert makeFY5253LastOfMonthQuarter( @@ -492,9 +495,9 @@ def test_offset(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) def test_year_has_extra_week(self): # End of long Q1 @@ -597,9 +600,9 @@ class TestFY5253NearestEndMonthQuarter(Base): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) def test_offset(self): offset = makeFY5253NearestEndMonthQuarter( @@ -653,7 +656,7 @@ def test_fy5253_last_onoffset(): # GH#18877 dates on the year-end but not normalized to midnight offset = FY5253(n=-5, startingMonth=5, variation="last", weekday=0) ts = Timestamp("1984-05-28 06:29:43.955911354+0200", tz="Europe/San_Marino") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -662,7 +665,7 @@ def test_fy5253_nearest_onoffset(): # GH#18877 dates on the year-end but not normalized to midnight offset = FY5253(n=3, startingMonth=7, variation="nearest", weekday=2) ts = Timestamp("2032-07-28 00:12:59.035729419+0000", tz="Africa/Dakar") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -673,7 +676,7 @@ def test_fy5253qtr_onoffset_nearest(): offset = FY5253Quarter( n=3, qtr_with_extra_week=1, startingMonth=2, variation="nearest", weekday=0 ) - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -685,5 +688,5 @@ def test_fy5253qtr_onoffset_last(): ) ts = Timestamp("2011-01-26 19:03:40.331096129+0200", tz="Africa/Windhoek") slow = (ts + offset) - offset == ts - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) assert fast == slow diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 81aff4211440e..2f00a58fe80be 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,5 +1,5 @@ from datetime import date, datetime, time as dt_time, timedelta -from typing import Type +from typing import Dict, List, Optional, Tuple, Type import numpy as np import pytest @@ -20,17 +20,19 @@ from pandas._libs.tslibs.offsets import ApplyTypeError import pandas.compat as compat from pandas.compat.numpy import np_datetime64_compat +from pandas.errors import PerformanceWarning -from pandas.core.indexes.datetimes import DatetimeIndex, _to_M8, date_range +import pandas._testing as tm +from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.series import Series -import pandas.util.testing as tm from pandas.io.pickle import read_pickle -from pandas.tseries.frequencies import _offset_map, get_offset +from pandas.tseries.frequencies import _get_offset, _offset_map from pandas.tseries.holiday import USFederalHolidayCalendar import pandas.tseries.offsets as offsets from pandas.tseries.offsets import ( FY5253, + BaseOffset, BDay, BMonthBegin, BMonthEnd, @@ -42,7 +44,10 @@ CBMonthBegin, CBMonthEnd, CDay, + CustomBusinessDay, CustomBusinessHour, + CustomBusinessMonthBegin, + CustomBusinessMonthEnd, DateOffset, Day, Easter, @@ -62,7 +67,7 @@ YearEnd, ) -from .common import assert_offset_equal, 
assert_onOffset +from .common import assert_is_on_offset, assert_offset_equal class WeekDay: @@ -76,24 +81,14 @@ class WeekDay: SUN = 6 -#### -# Misc function tests -#### - - -def test_to_M8(): - valb = datetime(2007, 10, 1) - valu = _to_M8(valb) - assert isinstance(valu, np.datetime64) - - ##### # DateOffset Tests ##### +_ApplyCases = List[Tuple[BaseOffset, Dict[datetime, datetime]]] class Base: - _offset = None # type: Type[DateOffset] + _offset: Optional[Type[DateOffset]] = None d = Timestamp(datetime(2008, 1, 2)) timezones = [ @@ -331,7 +326,7 @@ def test_offset_freqstr(self, offset_types): freqstr = offset.freqstr if freqstr not in ("", "", "LWOM-SAT"): - code = get_offset(freqstr) + code = _get_offset(freqstr) assert offset.rule_code == code def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=False): @@ -356,7 +351,7 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=Fals ts = Timestamp(dt) + Nano(5) if ( - offset_s.__class__.__name__ == "DateOffset" + type(offset_s).__name__ == "DateOffset" and (funcname == "apply" or normalize) and ts.nanosecond > 0 ): @@ -393,7 +388,7 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=Fals ts = Timestamp(dt, tz=tz) + Nano(5) if ( - offset_s.__class__.__name__ == "DateOffset" + type(offset_s).__name__ == "DateOffset" and (funcname == "apply" or normalize) and ts.nanosecond > 0 ): @@ -551,24 +546,24 @@ def test_rollback(self, offset_types): offset_types, "rollback", dt, expected, normalize=True ) - def test_onOffset(self, offset_types): + def test_is_on_offset(self, offset_types): dt = self.expecteds[offset_types.__name__] offset_s = self._get_offset(offset_types) - assert offset_s.onOffset(dt) + assert offset_s.is_on_offset(dt) - # when normalize=True, onOffset checks time is 00:00:00 + # when normalize=True, is_on_offset checks time is 00:00:00 if issubclass(offset_types, Tick): # normalize=True disallowed for Tick subclasses GH#21427 return offset_n = self._get_offset(offset_types, normalize=True) - assert not offset_n.onOffset(dt) + assert not offset_n.is_on_offset(dt) if offset_types in (BusinessHour, CustomBusinessHour): # In default BusinessHour (9:00-17:00), normalized time # cannot be in business hour range return date = datetime(dt.year, dt.month, dt.day) - assert offset_n.onOffset(date) + assert offset_n.is_on_offset(date) def test_add(self, offset_types, tz_naive_fixture): tz = tz_naive_fixture @@ -605,6 +600,46 @@ def test_add(self, offset_types, tz_naive_fixture): assert isinstance(result, Timestamp) assert result == expected_localize + def test_add_empty_datetimeindex(self, offset_types, tz_naive_fixture): + # GH#12724, GH#30336 + offset_s = self._get_offset(offset_types) + + dti = DatetimeIndex([], tz=tz_naive_fixture) + + warn = None + if isinstance( + offset_s, + ( + Easter, + WeekOfMonth, + LastWeekOfMonth, + CustomBusinessDay, + BusinessHour, + CustomBusinessHour, + CustomBusinessMonthBegin, + CustomBusinessMonthEnd, + FY5253, + FY5253Quarter, + ), + ): + # We don't have an optimized apply_index + warn = PerformanceWarning + + with tm.assert_produces_warning(warn): + result = dti + offset_s + tm.assert_index_equal(result, dti) + with tm.assert_produces_warning(warn): + result = offset_s + dti + tm.assert_index_equal(result, dti) + + dta = dti._data + with tm.assert_produces_warning(warn): + result = dta + offset_s + tm.assert_equal(result, dta) + with tm.assert_produces_warning(warn): + result = offset_s + dta + tm.assert_equal(result, dta) + def 
test_pickle_v0_15_2(self, datapath): offsets = { "DateOffset": DateOffset(years=1), @@ -620,6 +655,27 @@ def test_pickle_v0_15_2(self, datapath): # tm.assert_dict_equal(offsets, read_pickle(pickle_path)) + def test_onOffset_deprecated(self, offset_types): + # GH#30340 use idiomatic naming + off = self._get_offset(offset_types) + + ts = Timestamp.now() + with tm.assert_produces_warning(FutureWarning): + result = off.onOffset(ts) + + expected = off.is_on_offset(ts) + assert result == expected + + def test_isAnchored_deprecated(self, offset_types): + # GH#30340 use idiomatic naming + off = self._get_offset(offset_types) + + with tm.assert_produces_warning(FutureWarning): + result = off.isAnchored() + + expected = off.is_anchored() + assert result == expected + class TestDateOffset(Base): def setup_method(self, method): @@ -643,8 +699,8 @@ def test_constructor(self): assert (self.d + DateOffset(2)) == datetime(2008, 1, 4) - assert not DateOffset(2).isAnchored() - assert DateOffset(1).isAnchored() + assert not DateOffset(2).is_anchored() + assert DateOffset(1).is_anchored() d = datetime(2008, 1, 31) assert (d + DateOffset(months=1)) == datetime(2008, 2, 29) @@ -732,16 +788,16 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - def test_onOffset(self): + def test_is_on_offset(self): tests = [ (BDay(), datetime(2008, 1, 1), True), (BDay(), datetime(2008, 1, 5), False), ] for offset, d, expected in tests: - assert_onOffset(offset, d, expected) + assert_is_on_offset(offset, d, expected) - apply_cases = [] + apply_cases: _ApplyCases = [] apply_cases.append( ( BDay(), @@ -1227,10 +1283,10 @@ def test_normalize(self, case): ) @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, cases = case for dt, expected in cases.items(): - assert offset.onOffset(dt) == expected + assert offset.is_on_offset(dt) == expected opening_time_cases = [] # opening time should be affected by sign of n, not by n's value and @@ -2426,7 +2482,7 @@ def test_normalize(self, norm_cases): for dt, expected in cases.items(): assert offset.apply(dt) == expected - def test_onOffset(self): + def test_is_on_offset(self): tests = [] tests.append( @@ -2445,7 +2501,7 @@ def test_onOffset(self): for offset, cases in tests: for dt, expected in cases.items(): - assert offset.onOffset(dt) == expected + assert offset.is_on_offset(dt) == expected apply_cases = [] apply_cases.append( @@ -2625,11 +2681,11 @@ def test_roll_date_object(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, d, expected = case - assert_onOffset(offset, d, expected) + assert_is_on_offset(offset, d, expected) - apply_cases = [] + apply_cases: _ApplyCases = [] apply_cases.append( ( CDay(), @@ -2725,8 +2781,8 @@ def test_apply_large_n(self): def test_apply_corner(self): msg = ( - "Only know how to combine trading day with datetime, datetime64" - " or timedelta" + "Only know how to combine trading day " + "with datetime, datetime64 or timedelta" ) with pytest.raises(ApplyTypeError, match=msg): CDay().apply(BMonthEnd()) @@ -2872,11 +2928,11 @@ def test_roll_date_object(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, d, expected = case - assert_onOffset(offset, d, expected) + assert_is_on_offset(offset, d, expected) - apply_cases = [] + apply_cases: _ApplyCases = [] 
apply_cases.append( ( CBMonthEnd(), @@ -3021,11 +3077,11 @@ def test_roll_date_object(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) - apply_cases = [] + apply_cases: _ApplyCases = [] apply_cases.append( ( CBMonthBegin(), @@ -3138,11 +3194,11 @@ def test_corner(self): with pytest.raises(ValueError, match="Day must be"): Week(weekday=-1) - def test_isAnchored(self): - assert Week(weekday=0).isAnchored() - assert not Week().isAnchored() - assert not Week(2, weekday=2).isAnchored() - assert not Week(2).isAnchored() + def test_is_anchored(self): + assert Week(weekday=0).is_anchored() + assert not Week().is_anchored() + assert not Week(2, weekday=2).is_anchored() + assert not Week(2).is_anchored() offset_cases = [] # not business week @@ -3206,7 +3262,7 @@ def test_offset(self, case): assert_offset_equal(offset, base, expected) @pytest.mark.parametrize("weekday", range(7)) - def test_onOffset(self, weekday): + def test_is_on_offset(self, weekday): offset = Week(weekday=weekday) for day in range(1, 8): @@ -3216,7 +3272,7 @@ def test_onOffset(self, weekday): expected = True else: expected = False - assert_onOffset(offset, date, expected) + assert_is_on_offset(offset, date, expected) class TestWeekOfMonth(Base): @@ -3313,10 +3369,10 @@ def test_offset(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): week, weekday, dt, expected = case offset = WeekOfMonth(week=week, weekday=weekday) - assert offset.onOffset(dt) == expected + assert offset.is_on_offset(dt) == expected class TestLastWeekOfMonth(Base): @@ -3390,10 +3446,10 @@ def test_offset(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): weekday, dt, expected = case offset = LastWeekOfMonth(weekday=weekday) - assert offset.onOffset(dt) == expected + assert offset.is_on_offset(dt) == expected class TestSemiMonthEnd(Base): @@ -3600,9 +3656,9 @@ def test_apply_index(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): dt, expected = case - assert_onOffset(SemiMonthEnd(), dt, expected) + assert_is_on_offset(SemiMonthEnd(), dt, expected) @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) def test_vectorized_offset_addition(self, klass): @@ -3864,9 +3920,9 @@ def test_apply_index(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): dt, expected = case - assert_onOffset(SemiMonthBegin(), dt, expected) + assert_is_on_offset(SemiMonthBegin(), dt, expected) @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) def test_vectorized_offset_addition(self, klass): @@ -3949,9 +4005,9 @@ def test_get_offset_name(self): def test_get_offset(): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset("gibberish") + _get_offset("gibberish") with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset("QS-JAN-B") + _get_offset("QS-JAN-B") pairs = [ ("B", BDay()), @@ -3966,11 +4022,10 @@ def test_get_offset(): ] for name, expected in pairs: - offset = get_offset(name) - assert ( - offset == expected - ), "Expected {name!r} to yield {expected!r} (actual: {offset!r})".format( - name=name, expected=expected, offset=offset + 
offset = _get_offset(name) + assert offset == expected, ( + f"Expected {repr(name)} to yield {repr(expected)} " + f"(actual: {repr(offset)})" ) @@ -3978,7 +4033,7 @@ def test_get_offset_legacy(): pairs = [("w@Sat", Week(weekday=5))] for name, expected in pairs: with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset(name) + _get_offset(name) class TestOffsetAliases: @@ -3994,17 +4049,17 @@ def test_alias_equality(self): def test_rule_code(self): lst = ["M", "MS", "BM", "BMS", "D", "B", "H", "T", "S", "L", "U"] for k in lst: - assert k == get_offset(k).rule_code + assert k == _get_offset(k).rule_code # should be cached - this is kind of an internals test... assert k in _offset_map - assert k == (get_offset(k) * 3).rule_code + assert k == (_get_offset(k) * 3).rule_code suffix_lst = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] base = "W" for v in suffix_lst: alias = "-".join([base, v]) - assert alias == get_offset(alias).rule_code - assert alias == (get_offset(alias) * 5).rule_code + assert alias == _get_offset(alias).rule_code + assert alias == (_get_offset(alias) * 5).rule_code suffix_lst = [ "JAN", @@ -4024,8 +4079,8 @@ def test_rule_code(self): for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) - assert alias == get_offset(alias).rule_code - assert alias == (get_offset(alias) * 5).rule_code + assert alias == _get_offset(alias).rule_code + assert alias == (_get_offset(alias) * 5).rule_code lst = ["M", "D", "B", "H", "T", "S", "L", "U"] for k in lst: @@ -4078,7 +4133,7 @@ def test_str_for_named_is_name(self): names += ["WOM-" + week + day for week in ("1", "2", "3", "4") for day in days] _offset_map.clear() for name in names: - offset = get_offset(name) + offset = _get_offset(name) assert offset.freqstr == name @@ -4168,9 +4223,9 @@ def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): def _make_timestamp(self, string, hrs_offset, tz): if hrs_offset >= 0: - offset_string = "{hrs:02d}00".format(hrs=hrs_offset) + offset_string = f"{hrs_offset:02d}00" else: - offset_string = "-{hrs:02d}00".format(hrs=-1 * hrs_offset) + offset_string = f"-{(hrs_offset * -1):02}00" return Timestamp(string + offset_string).tz_convert(tz) def test_springforward_plural(self): @@ -4251,7 +4306,7 @@ def test_valid_default_arguments(offset_types): cls() -@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) def test_valid_month_attributes(kwd, month_classes): # GH#18226 cls = month_classes @@ -4260,14 +4315,14 @@ def test_valid_month_attributes(kwd, month_classes): cls(**{kwd: 3}) -@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) def test_valid_relativedelta_kwargs(kwd): # Check that all the arguments specified in liboffsets.relativedelta_kwds # are in fact valid relativedelta keyword args DateOffset(**{kwd: 1}) -@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(liboffsets.relativedelta_kwds)) def test_valid_tick_attributes(kwd, tick_classes): # GH#18226 cls = tick_classes @@ -4303,34 +4358,34 @@ def test_tick_normalize_raises(tick_classes): def test_weeks_onoffset(): # GH#18510 Week with weekday = None, normalize = False should always - # be onOffset + # be is_on_offset offset = Week(n=2, weekday=None) ts = Timestamp("1862-01-13 09:03:34.873477378+0210", tz="Africa/Lusaka") - fast = 
offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow # negative n offset = Week(n=2, weekday=None) ts = Timestamp("1856-10-24 16:18:36.556360110-0717", tz="Pacific/Easter") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow def test_weekofmonth_onoffset(): # GH#18864 - # Make sure that nanoseconds don't trip up onOffset (and with it apply) + # Make sure that nanoseconds don't trip up is_on_offset (and with it apply) offset = WeekOfMonth(n=2, week=2, weekday=0) ts = Timestamp("1916-05-15 01:14:49.583410462+0422", tz="Asia/Qyzylorda") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow # negative n offset = WeekOfMonth(n=-3, week=1, weekday=0) ts = Timestamp("1980-12-08 03:38:52.878321185+0500", tz="Asia/Oral") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -4340,14 +4395,14 @@ def test_last_week_of_month_on_offset(): offset = LastWeekOfMonth(n=4, weekday=6) ts = Timestamp("1917-05-27 20:55:27.084284178+0200", tz="Europe/Warsaw") slow = (ts + offset) - offset == ts - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) assert fast == slow # negative n offset = LastWeekOfMonth(n=-4, weekday=5) ts = Timestamp("2005-08-27 05:01:42.799392561-0500", tz="America/Rainy_River") slow = (ts + offset) - offset == ts - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) assert fast == slow diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 351f0f9ad3b5b..716d3ff3faf1c 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -90,11 +90,11 @@ @given(gen_random_datetime, gen_yqm_offset) def test_on_offset_implementations(dt, offset): assume(not offset.normalize) - # check that the class-specific implementations of onOffset match + # check that the class-specific implementations of is_on_offset match # the general case definition: # (dt + offset) - offset == dt compare = (dt + offset) - offset - assert offset.onOffset(dt) == (compare == dt) + assert offset.is_on_offset(dt) == (compare == dt) @pytest.mark.xfail( diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index 98a3631c8e63a..297e5c3178379 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -8,7 +8,7 @@ import pytest from pandas import Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries import offsets from pandas.tseries.offsets import Hour, Micro, Milli, Minute, Nano, Second @@ -284,7 +284,7 @@ def test_tick_equalities(cls): @pytest.mark.parametrize("cls", tick_classes) def test_tick_offset(cls): - assert not cls().isAnchored() + assert not cls().is_anchored() @pytest.mark.parametrize("cls", tick_classes) diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_yqm_offsets.py index 12a524d82fcf5..79a0e0f2c25eb 100644 --- a/pandas/tests/tseries/offsets/test_yqm_offsets.py +++ b/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -23,7 +23,7 @@ YearEnd, ) -from .common import assert_offset_equal, assert_onOffset +from .common import assert_is_on_offset, assert_offset_equal from .test_offsets import Base # 
-------------------------------------------------------------------- @@ -85,7 +85,7 @@ def test_on_offset(offset): if not (m == 11 and d == 31) ] for date in dates: - res = offset.onOffset(date) + res = offset.is_on_offset(date) slow_version = date == (date + offset) - offset assert res == slow_version @@ -247,9 +247,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBMonthBegin(Base): @@ -335,9 +335,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBMonthEnd(Base): @@ -424,9 +424,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) # -------------------------------------------------------------------- @@ -442,10 +442,10 @@ def test_repr(self): expected = "" assert repr(QuarterBegin(startingMonth=1)) == expected - def test_isAnchored(self): - assert QuarterBegin(startingMonth=1).isAnchored() - assert QuarterBegin().isAnchored() - assert not QuarterBegin(2, startingMonth=1).isAnchored() + def test_is_anchored(self): + assert QuarterBegin(startingMonth=1).is_anchored() + assert QuarterBegin().is_anchored() + assert not QuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -552,10 +552,10 @@ def test_repr(self): expected = "" assert repr(QuarterEnd(startingMonth=1)) == expected - def test_isAnchored(self): - assert QuarterEnd(startingMonth=1).isAnchored() - assert QuarterEnd().isAnchored() - assert not QuarterEnd(2, startingMonth=1).isAnchored() + def test_is_anchored(self): + assert QuarterEnd(startingMonth=1).is_anchored() + assert QuarterEnd().is_anchored() + assert not QuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -683,9 +683,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBQuarterBegin(Base): @@ -699,10 +699,10 @@ def test_repr(self): expected = "" assert repr(BQuarterBegin(startingMonth=1)) == expected - def test_isAnchored(self): - assert BQuarterBegin(startingMonth=1).isAnchored() - assert BQuarterBegin().isAnchored() - assert not BQuarterBegin(2, startingMonth=1).isAnchored() + def test_is_anchored(self): + assert BQuarterBegin(startingMonth=1).is_anchored() + assert BQuarterBegin().is_anchored() + assert not BQuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -824,10 +824,10 @@ def test_repr(self): expected = "" assert repr(BQuarterEnd(startingMonth=1)) == expected - def test_isAnchored(self): - assert BQuarterEnd(startingMonth=1).isAnchored() - assert BQuarterEnd().isAnchored() - assert not BQuarterEnd(2, startingMonth=1).isAnchored() + def test_is_anchored(self): + assert BQuarterEnd(startingMonth=1).is_anchored() + assert BQuarterEnd().is_anchored() + assert not 
BQuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -951,9 +951,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) # -------------------------------------------------------------------- @@ -1109,9 +1109,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestYearEnd(Base): @@ -1186,9 +1186,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestYearEndDiffMonth(Base): @@ -1258,9 +1258,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBYearBegin(Base): @@ -1404,9 +1404,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBYearEndLagged(Base): @@ -1459,6 +1459,6 @@ def test_roll(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 5cf2165993cd7..a40fcd725d604 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -9,7 +9,7 @@ from pandas.compat.numpy import np_array_datetime64_compat from pandas import Timestamp -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 6c30e2b6c7a1c..2beeae85de683 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -8,7 +8,7 @@ from pandas._libs.tslibs import conversion, timezones, tzconversion from pandas import Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm def _compare_utc_to_local(tz_didx): diff --git a/pandas/tests/tslibs/test_fields.py b/pandas/tests/tslibs/test_fields.py index cd729956a027c..943f4207df543 100644 --- a/pandas/tests/tslibs/test_fields.py +++ b/pandas/tests/tslibs/test_fields.py @@ -2,7 +2,7 @@ from pandas._libs.tslibs import fields -import pandas.util.testing as tm +import pandas._testing as tm def test_fields_readonly(): diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index a6e7aee46b485..a58f227c20c7f 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -59,9 +59,7 @@ def test_parsers_iso8601_invalid(date_str): def 
test_parsers_iso8601_invalid_offset_invalid(): date_str = "2001-01-01 12-34-56" - msg = "Timezone hours offset out of range " 'in datetime string "{s}"'.format( - s=date_str - ) + msg = f'Timezone hours offset out of range in datetime string "{date_str}"' with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 0bc30347b3fa9..36f7ada7326bf 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -11,7 +11,7 @@ from pandas._libs.tslibs.parsing import parse_time_string import pandas.util._test_decorators as td -import pandas.util.testing as tm +import pandas._testing as tm def test_parse_time_string(): diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index f430e2893ca33..b8048891e4876 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Index, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm def _assert_almost_equal_both(a, b, **kwargs): @@ -39,9 +39,7 @@ def _assert_not_almost_equal(a, b, **kwargs): """ try: tm.assert_almost_equal(a, b, **kwargs) - msg = ( - "{a} and {b} were approximately equal when they shouldn't have been" - ).format(a=a, b=b) + msg = f"{a} and {b} were approximately equal when they shouldn't have been" pytest.fail(msg=msg) except AssertionError: pass @@ -248,13 +246,12 @@ def test_assert_almost_equal_value_mismatch(): [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], ) def test_assert_almost_equal_class_mismatch(a, b, klass1, klass2): - msg = """numpy array are different + + msg = f"""numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format( - klass1=klass1, klass2=klass2 - ) +\\[right\\]: {klass2}""" with pytest.raises(AssertionError, match=msg): tm.assert_almost_equal(a, b) diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py index 44400498ddc64..8957e7a172666 100644 --- a/pandas/tests/util/test_assert_categorical_equal.py +++ b/pandas/tests/util/test_assert_categorical_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import Categorical -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -77,13 +77,11 @@ def test_categorical_equal_ordered_mismatch(): @pytest.mark.parametrize("obj", ["index", "foo", "pandas"]) def test_categorical_equal_object_override(obj): data = [1, 2, 3, 4] - msg = """{obj} are different + msg = f"""{obj} are different Attribute "ordered" are different \\[left\\]: False -\\[right\\]: True""".format( - obj=obj - ) +\\[right\\]: True""" c1 = Categorical(data, ordered=False) c2 = Categorical(data, ordered=True) diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index cecf9273004d7..0547323b882f6 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -1,8 +1,8 @@ import numpy as np import pytest +import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray -import pandas.util.testing as tm @pytest.mark.parametrize( @@ -96,7 +96,7 @@ def test_assert_extension_array_equal_non_extension_array(side): numpy_array = np.arange(5) extension_array = 
SparseArray(numpy_array) - msg = "{side} is not an ExtensionArray".format(side=side) + msg = f"{side} is not an ExtensionArray" args = ( (numpy_array, extension_array) if side == "left" diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index b46a8460a28b2..23c845f2b2795 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[True, False]) @@ -80,7 +80,7 @@ def test_frame_equal_row_order_mismatch(check_like, obj_fixture): df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, index=["c", "b", "a"]) if not check_like: # Do not ignore row-column orderings. - msg = "{obj}.index are different".format(obj=obj_fixture) + msg = f"{obj_fixture}.index are different" with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(df1, df2, check_like=check_like, obj=obj_fixture) else: @@ -95,7 +95,7 @@ def test_frame_equal_row_order_mismatch(check_like, obj_fixture): ], ) def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): - msg = "{obj} are different".format(obj=obj_fixture) + msg = f"{obj_fixture} are different" with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(df1, df2, obj=obj_fixture) @@ -149,13 +149,11 @@ def test_empty_dtypes(check_dtype): def test_frame_equal_index_mismatch(obj_fixture): - msg = """{obj}\\.index are different + msg = f"""{obj_fixture}\\.index are different -{obj}\\.index values are different \\(33\\.33333 %\\) +{obj_fixture}\\.index values are different \\(33\\.33333 %\\) \\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""".format( - obj=obj_fixture - ) +\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "d"]) @@ -165,13 +163,11 @@ def test_frame_equal_index_mismatch(obj_fixture): def test_frame_equal_columns_mismatch(obj_fixture): - msg = """{obj}\\.columns are different + msg = f"""{obj_fixture}\\.columns are different -{obj}\\.columns values are different \\(50\\.0 %\\) +{obj_fixture}\\.columns values are different \\(50\\.0 %\\) \\[left\\]: Index\\(\\['A', 'B'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""".format( - obj=obj_fixture - ) +\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) @@ -181,13 +177,12 @@ def test_frame_equal_columns_mismatch(obj_fixture): def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): - msg = """{obj}\\.iloc\\[:, 1\\] are different + obj = obj_fixture + msg = f"""{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) are different -{obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) values are different \\(33\\.33333 %\\) \\[left\\]: \\[4, 5, 6\\] -\\[right\\]: \\[4, 5, 7\\]""".format( - obj=obj_fixture - ) +\\[right\\]: \\[4, 5, 7\\]""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) @@ -202,18 +197,18 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): ( DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), DataFrame({"A": ["á", 
"à", "ä"], "E": ["é", "è", "e̊"]}), - """{obj}\\.iloc\\[:, 1\\] are different + """{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) are different -{obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) values are different \\(33\\.33333 %\\) \\[left\\]: \\[é, è, ë\\] \\[right\\]: \\[é, è, e̊\\]""", ), ( DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), - """{obj}\\.iloc\\[:, 0\\] are different + """{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) are different -{obj}\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) +{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) values are different \\(100\\.0 %\\) \\[left\\]: \\[á, à, ä\\] \\[right\\]: \\[a, a, a\\]""", ), diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 8c3f242f0c96b..bbbeebcec2569 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, Index, MultiIndex, NaT -import pandas.util.testing as tm +import pandas._testing as tm def test_index_equal_levels_mismatch(): @@ -135,11 +135,6 @@ def test_index_equal_level_values_mismatch(check_exact, check_less_precise): [(None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT)], ) def test_index_equal_names(name1, name2): - msg = """Index are different - -Attribute "names" are different -\\[left\\]: \\[{name1}\\] -\\[right\\]: \\[{name2}\\]""" idx1 = Index([1, 2, 3], name=name1) idx2 = Index([1, 2, 3], name=name2) @@ -149,7 +144,11 @@ def test_index_equal_names(name1, name2): else: name1 = "'x'" if name1 == "x" else name1 name2 = "'x'" if name2 == "x" else name2 - msg = msg.format(name1=name1, name2=name2) + msg = f"""Index are different + +Attribute "names" are different +\\[left\\]: \\[{name1}\\] +\\[right\\]: \\[{name2}\\]""" with pytest.raises(AssertionError, match=msg): tm.assert_index_equal(idx1, idx2) diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py index b264b484a04ab..96f2973a1528c 100644 --- a/pandas/tests/util/test_assert_interval_array_equal.py +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import interval_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/util/test_assert_numpy_array_equal.py b/pandas/tests/util/test_assert_numpy_array_equal.py index 53bcedf3a16f1..c8ae9ebdd8651 100644 --- a/pandas/tests/util/test_assert_numpy_array_equal.py +++ b/pandas/tests/util/test_assert_numpy_array_equal.py @@ -2,7 +2,7 @@ import pytest from pandas import Timestamp -import pandas.util.testing as tm +import pandas._testing as tm def test_assert_numpy_array_equal_shape_mismatch(): @@ -28,13 +28,11 @@ def test_assert_numpy_array_equal_bad_type(): [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], ) def test_assert_numpy_array_equal_class_mismatch(a, b, klass1, klass2): - msg = """numpy array are different + msg = f"""numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format( - klass1=klass1, klass2=klass2 - ) +\\[right\\]: {klass2}""" with pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(a, b) diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py 
index c681817896903..87765c909938d 100644 --- a/pandas/tests/util/test_assert_produces_warning.py +++ b/pandas/tests/util/test_assert_produces_warning.py @@ -2,7 +2,7 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm def f(): diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 0a6047c4662ba..eaf0824f52927 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import Categorical, DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm def _assert_series_equal_both(a, b, **kwargs): diff --git a/pandas/tests/util/test_deprecate.py b/pandas/tests/util/test_deprecate.py index 8fbc8037ed7c5..ee4f7e3f34f2e 100644 --- a/pandas/tests/util/test_deprecate.py +++ b/pandas/tests/util/test_deprecate.py @@ -4,7 +4,7 @@ from pandas.util._decorators import deprecate -import pandas.util.testing as tm +import pandas._testing as tm def new_func(): diff --git a/pandas/tests/util/test_deprecate_kwarg.py b/pandas/tests/util/test_deprecate_kwarg.py index c17c48197ccf7..b165e9fba0e4f 100644 --- a/pandas/tests/util/test_deprecate_kwarg.py +++ b/pandas/tests/util/test_deprecate_kwarg.py @@ -2,7 +2,7 @@ from pandas.util._decorators import deprecate_kwarg -import pandas.util.testing as tm +import pandas._testing as tm @deprecate_kwarg("old", "new") diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index df3c7fe9c9936..c915edad4bb8e 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -5,9 +5,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples from pandas.util import hash_array, hash_pandas_object -import pandas.util.testing as tm @pytest.fixture( @@ -207,7 +207,7 @@ def test_multiindex_objects(): Series(["a", np.nan, "c"]), Series(["a", None, "c"]), Series([True, False, True]), - Series(), + Series(dtype=object), Index([1, 2, 3]), Index([True, False, True]), DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), @@ -353,3 +353,24 @@ def test_hash_collisions(): result = hash_array(np.asarray(hashes, dtype=object), "utf8") tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0)) + + +def test_hash_with_tuple(): + # GH#28969 array containing a tuple raises on call to arr.astype(str) + # apparently a numpy bug github.com/numpy/numpy/issues/9441 + + df = pd.DataFrame({"data": [tuple("1"), tuple("2")]}) + result = hash_pandas_object(df) + expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]}) + result = hash_pandas_object(df2) + expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + # require that the elements of such tuples are themselves hashable + + df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]}) + with pytest.raises(TypeError, match="unhashable type: 'list'"): + hash_pandas_object(df3) diff --git a/pandas/tests/util/test_move.py b/pandas/tests/util/test_move.py deleted file mode 100644 index 0e28dd2dd9d71..0000000000000 --- a/pandas/tests/util/test_move.py +++ /dev/null @@ -1,44 +0,0 @@ -import pytest - -from pandas.util._move import BadMove, move_into_mutable_buffer, 
stolenbuf - - -def test_cannot_create_instance_of_stolen_buffer(): - # Stolen buffers need to be created through the smart constructor - # "move_into_mutable_buffer," which has a bunch of checks in it. - - msg = "cannot create 'pandas.util._move.stolenbuf' instances" - with pytest.raises(TypeError, match=msg): - stolenbuf() - - -def test_more_than_one_ref(): - # Test case for when we try to use "move_into_mutable_buffer" - # when the object being moved has other references. - - b = b"testing" - - with pytest.raises(BadMove, match="testing") as e: - - def handle_success(type_, value, tb): - assert value.args[0] is b - return type(e).handle_success(e, type_, value, tb) # super - - e.handle_success = handle_success - move_into_mutable_buffer(b) - - -def test_exactly_one_ref(): - # Test case for when the object being moved has exactly one reference. - - b = b"testing" - - # We need to pass an expression on the stack to ensure that there are - # not extra references hanging around. We cannot rewrite this test as - # buf = b[:-3] - # as_stolen_buf = move_into_mutable_buffer(buf) - # because then we would have more than one reference to buf. - as_stolen_buf = move_into_mutable_buffer(b[:-3]) - - # Materialize as byte-array to show that it is mutable. - assert bytearray(as_stolen_buf) == b"test" diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 60124c8e943ad..6a19adef728e4 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -4,7 +4,7 @@ import pandas.compat as compat -import pandas.util.testing as tm +import pandas._testing as tm def test_rands(): diff --git a/pandas/tests/util/test_validate_args.py b/pandas/tests/util/test_validate_args.py index 1f1365d62c64e..746d859b3322e 100644 --- a/pandas/tests/util/test_validate_args.py +++ b/pandas/tests/util/test_validate_args.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from pandas.util._validators import validate_args @@ -22,10 +20,8 @@ def test_bad_arg_length_max_value_single(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"argument \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -40,10 +36,8 @@ def test_bad_arg_length_max_value_multiple(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"arguments \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -54,15 +48,11 @@ def test_bad_arg_length_max_value_multiple(): def test_not_all_defaults(i): bad_arg = "foo" msg = ( - "the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + f"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" ) - compat_args = OrderedDict() - compat_args["foo"] = 2 - compat_args["bar"] = -1 - compat_args["baz"] = 3 - + compat_args = {"foo": 2, "bar": -1, "baz": 3} arg_vals = (1, -1, 3) with pytest.raises(ValueError, match=msg): @@ -73,8 +63,5 @@ def test_validation(): # No exceptions should be 
raised. validate_args(_fname, (None,), 2, dict(out=None)) - compat_args = OrderedDict() - compat_args["axis"] = 1 - compat_args["out"] = None - + compat_args = {"axis": 1, "out": None} validate_args(_fname, (1, None), 2, compat_args) diff --git a/pandas/tests/util/test_validate_args_and_kwargs.py b/pandas/tests/util/test_validate_args_and_kwargs.py index 396056466bb81..941ba86c61319 100644 --- a/pandas/tests/util/test_validate_args_and_kwargs.py +++ b/pandas/tests/util/test_validate_args_and_kwargs.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from pandas.util._validators import validate_args_and_kwargs @@ -17,10 +15,8 @@ def test_invalid_total_length_max_length_one(): actual_length = len(kwargs) + len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"argument \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -37,10 +33,8 @@ def test_invalid_total_length_max_length_multiple(): actual_length = len(kwargs) + len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"arguments \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -52,13 +46,11 @@ def test_missing_args_or_kwargs(args, kwargs): bad_arg = "bar" min_fname_arg_count = 2 - compat_args = OrderedDict() - compat_args["foo"] = -5 - compat_args[bad_arg] = 1 + compat_args = {"foo": -5, bad_arg: 1} msg = ( - r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + fr"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" ) with pytest.raises(ValueError, match=msg): @@ -68,17 +60,11 @@ def test_missing_args_or_kwargs(args, kwargs): def test_duplicate_argument(): min_fname_arg_count = 2 - compat_args = OrderedDict() - compat_args["foo"] = None - compat_args["bar"] = None - compat_args["baz"] = None - + compat_args = {"foo": None, "bar": None, "baz": None} kwargs = {"foo": None, "bar": None} args = (None,) # duplicate value for "foo" - msg = r"{fname}\(\) got multiple values for keyword " r"argument '{arg}'".format( - fname=_fname, arg="foo" - ) + msg = fr"{_fname}\(\) got multiple values for keyword argument 'foo'" with pytest.raises(TypeError, match=msg): validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) @@ -86,10 +72,7 @@ def test_duplicate_argument(): def test_validation(): # No exceptions should be raised. 
- compat_args = OrderedDict() - compat_args["foo"] = 1 - compat_args["bar"] = None - compat_args["baz"] = -2 + compat_args = {"foo": 1, "bar": None, "baz": -2} kwargs = {"baz": -2} args = (1, None) diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index ec9f3948403de..a7b6d8f98cc60 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from pandas.util._validators import validate_bool_kwarg, validate_kwargs @@ -11,14 +9,10 @@ def test_bad_kwarg(): good_arg = "f" bad_arg = good_arg + "o" - compat_args = OrderedDict() - compat_args[good_arg] = "foo" - compat_args[bad_arg + "o"] = "bar" + compat_args = {good_arg: "foo", bad_arg + "o": "bar"} kwargs = {good_arg: "foo", bad_arg: "bar"} - msg = r"{fname}\(\) got an unexpected " r"keyword argument '{arg}'".format( - fname=_fname, arg=bad_arg - ) + msg = fr"{_fname}\(\) got an unexpected keyword argument '{bad_arg}'" with pytest.raises(TypeError, match=msg): validate_kwargs(_fname, kwargs, compat_args) @@ -28,14 +22,11 @@ def test_bad_kwarg(): def test_not_all_none(i): bad_arg = "foo" msg = ( - r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + fr"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" ) - compat_args = OrderedDict() - compat_args["foo"] = 1 - compat_args["bar"] = "s" - compat_args["baz"] = None + compat_args = {"foo": 1, "bar": "s", "baz": None} kwarg_keys = ("foo", "bar", "baz") kwarg_vals = (2, "s", None) @@ -48,10 +39,7 @@ def test_not_all_none(i): def test_validation(): # No exceptions should be raised. - compat_args = OrderedDict() - compat_args["f"] = None - compat_args["b"] = 1 - compat_args["ba"] = "s" + compat_args = {"f": None, "b": 1, "ba": "s"} kwargs = dict(f=None, b=1) validate_kwargs(_fname, kwargs, compat_args) @@ -60,9 +48,9 @@ def test_validation(): @pytest.mark.parametrize("name", ["inplace", "copy"]) @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_kwarg_fail(name, value): - msg = 'For argument "%s" expected type bool, received type %s' % ( - name, - type(value).__name__, + msg = ( + f'For argument "{name}" expected type bool,' + f" received type {type(value).__name__}" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/window/common.py b/pandas/tests/window/common.py index 1dfc0f34b2b8d..6aeada3152dbb 100644 --- a/pandas/tests/window/common.py +++ b/pandas/tests/window/common.py @@ -3,7 +3,8 @@ import numpy as np from numpy.random import randn -from pandas import DataFrame, Series, bdate_range +from pandas import DataFrame, Series, bdate_range, notna +import pandas._testing as tm N, K = 100, 10 @@ -21,3 +22,365 @@ def _create_data(self): self.rng = bdate_range(datetime(2009, 1, 1), periods=N) self.series = Series(arr.copy(), index=self.rng) self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) + + +# create the data only once as we are not setting it +def _create_consistency_data(): + def create_series(): + return [ + Series(dtype=object), + Series([np.nan]), + Series([np.nan, np.nan]), + Series([3.0]), + Series([np.nan, 3.0]), + Series([3.0, np.nan]), + Series([1.0, 3.0]), + Series([2.0, 2.0]), + Series([3.0, 1.0]), + Series( + [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] + ), + Series( + [ + np.nan, + 5.0, + 5.0, + 5.0, + np.nan, 
+ np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + np.nan, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series(range(10)), + Series(range(20, 0, -2)), + ] + + def create_dataframes(): + return [ + DataFrame(), + DataFrame(columns=["a"]), + DataFrame(columns=["a", "a"]), + DataFrame(columns=["a", "b"]), + DataFrame(np.arange(10).reshape((5, 2))), + DataFrame(np.arange(25).reshape((5, 5))), + DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), + ] + [DataFrame(s) for s in create_series()] + + def is_constant(x): + values = x.values.ravel() + return len(set(values[notna(values)])) == 1 + + def no_nans(x): + return x.notna().all().all() + + # data is a tuple(object, is_constant, no_nans) + data = create_series() + create_dataframes() + + return [(x, is_constant(x), no_nans(x)) for x in data] + + +_consistency_data = _create_consistency_data() + + +class ConsistencyBase(Base): + base_functions = [ + (lambda v: Series(v).count(), None, "count"), + (lambda v: Series(v).max(), None, "max"), + (lambda v: Series(v).min(), None, "min"), + (lambda v: Series(v).sum(), None, "sum"), + (lambda v: Series(v).mean(), None, "mean"), + (lambda v: Series(v).std(), 1, "std"), + (lambda v: Series(v).cov(Series(v)), None, "cov"), + (lambda v: Series(v).corr(Series(v)), None, "corr"), + (lambda v: Series(v).var(), 1, "var"), + # restore once GH 8086 is fixed + # lambda v: Series(v).skew(), 3, 'skew'), + # (lambda v: Series(v).kurt(), 4, 'kurt'), + # restore once GH 8084 is fixed + # lambda v: Series(v).quantile(0.3), None, 'quantile'), + (lambda v: Series(v).median(), None, "median"), + (np.nanmax, 1, "max"), + (np.nanmin, 1, "min"), + (np.nansum, 1, "sum"), + (np.nanmean, 1, "mean"), + (lambda v: np.nanstd(v, ddof=1), 1, "std"), + (lambda v: np.nanvar(v, ddof=1), 1, "var"), + (np.nanmedian, 1, "median"), + ] + no_nan_functions = [ + (np.max, None, "max"), + (np.min, None, "min"), + (np.sum, None, "sum"), + (np.mean, None, "mean"), + (lambda v: np.std(v, ddof=1), 1, "std"), + (lambda v: np.var(v, ddof=1), 1, "var"), + (np.median, None, "median"), + ] + + def _create_data(self): + super()._create_data() + self.data = _consistency_data + + def _test_moments_consistency_mock_mean(self, mean, mock_mean): + for (x, is_constant, no_nans) in self.data: + mean_x = mean(x) + # check that correlation of a series with itself is either 1 or NaN + + if mock_mean: + # check that mean equals mock_mean + expected = mock_mean(x) + tm.assert_equal(mean_x, expected.astype("float64")) + + def _test_moments_consistency_is_constant(self, min_periods, count, mean, corr): + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = corr(x, x) + + if is_constant: + exp = x.max() if isinstance(x, Series) else x.max().max() + + # check mean of constant series + expected = x * np.nan + 
expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) + + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) + + def _test_moments_consistency_var_debiasing_factors( + self, var_biased=None, var_unbiased=None, var_debiasing_factors=None + ): + for (x, is_constant, no_nans) in self.data: + if var_unbiased and var_biased and var_debiasing_factors: + # check variance debiasing factors + var_unbiased_x = var_unbiased(x) + var_biased_x = var_biased(x) + var_debiasing_factors_x = var_debiasing_factors(x) + tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) + + def _test_moments_consistency( + self, + min_periods, + count, + mean, + corr, + var_unbiased=None, + std_unbiased=None, + cov_unbiased=None, + var_biased=None, + std_biased=None, + cov_biased=None, + ): + + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + + for (std, var, cov) in [ + (std_biased, var_biased, cov_biased), + (std_unbiased, var_unbiased, cov_unbiased), + ]: + + # check that var(x), std(x), and cov(x) are all >= 0 + var_x = var(x) + std_x = std(x) + assert not (var_x < 0).any().any() + assert not (std_x < 0).any().any() + if cov: + cov_x_x = cov(x, x) + assert not (cov_x_x < 0).any().any() + + # check that var(x) == cov(x, x) + tm.assert_equal(var_x, cov_x_x) + + # check that var(x) == std(x)^2 + tm.assert_equal(var_x, std_x * std_x) + + if var is var_biased: + # check that biased var(x) == mean(x^2) - mean(x)^2 + mean_x2 = mean(x * x) + tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) + + if is_constant: + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if var is var_unbiased: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) + + if isinstance(x, Series): + for (y, is_constant, no_nans) in self.data: + if not x.isna().equals(y.isna()): + # can only easily test two Series with similar + # structure + continue + + # check that cor(x, y) is symmetric + corr_x_y = corr(x, y) + corr_y_x = corr(y, x) + tm.assert_equal(corr_x_y, corr_y_x) + + if cov: + # check that cov(x, y) is symmetric + cov_x_y = cov(x, y) + cov_y_x = cov(y, x) + tm.assert_equal(cov_x_y, cov_y_x) + + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + var_x_plus_y = var(x + y) + var_y = var(y) + tm.assert_equal( + cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) + ) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + std_y = std(y) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + + if cov is cov_biased: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_y = mean(y) + mean_x_times_y = mean(x * y) + tm.assert_equal( + cov_x_y, mean_x_times_y - (mean_x * mean_y) + ) + + def _check_pairwise_moment(self, dispatch, name, **kwargs): + def get_result(obj, obj2=None): + return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) + + result = get_result(self.frame) + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) + expected = get_result(self.frame[1], self.frame[5]) + tm.assert_series_equal(result, expected, check_names=False) + + +def ew_func(A, B, com, name, **kwargs): + return getattr(A.ewm(com, **kwargs), name)(B) + + +def check_binary_ew(name, A, B): + + result = ew_func(A=A, B=B, com=20, name=name, min_periods=5) + assert np.isnan(result.values[:14]).all() + assert not 
np.isnan(result.values[14:]).any() + + +def check_binary_ew_min_periods(name, min_periods, A, B): + # GH 7898 + result = ew_func(A, B, 20, name=name, min_periods=min_periods) + # binary functions (ewmcov, ewmcorr) with bias=False require at + # least two values + assert np.isnan(result.values[:11]).all() + assert not np.isnan(result.values[11:]).any() + + # check series of length 0 + empty = Series([], dtype=np.float64) + result = ew_func(empty, empty, 50, name=name, min_periods=min_periods) + tm.assert_series_equal(result, empty) + + # check series of length 1 + result = ew_func( + Series([1.0]), Series([1.0]), 50, name=name, min_periods=min_periods + ) + tm.assert_series_equal(result, Series([np.NaN])) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 7ea4be25ca2a6..fb46ca51ace58 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -1,5 +1,7 @@ import pytest +import pandas.util._test_decorators as td + @pytest.fixture(params=[True, False]) def raw(request): @@ -47,3 +49,41 @@ def center(request): @pytest.fixture(params=[None, 1]) def min_periods(request): return request.param + + +@pytest.fixture(params=[True, False]) +def parallel(request): + """parallel keyword argument for numba.jit""" + return request.param + + +@pytest.fixture(params=[True, False]) +def nogil(request): + """nogil keyword argument for numba.jit""" + return request.param + + +@pytest.fixture(params=[True, False]) +def nopython(request): + """nopython keyword argument for numba.jit""" + return request.param + + +@pytest.fixture( + params=[pytest.param("numba", marks=td.skip_if_no("numba", "0.46.0")), "cython"] +) +def engine(request): + """engine keyword argument for rolling.apply""" + return request.param + + +@pytest.fixture( + params=[ + pytest.param(("numba", True), marks=td.skip_if_no("numba", "0.46.0")), + ("cython", True), + ("cython", False), + ] +) +def engine_and_raw(request): + """engine and raw keyword arguments for rolling.apply""" + return request.param diff --git a/pandas/tests/window/moments/conftest.py b/pandas/tests/window/moments/conftest.py new file mode 100644 index 0000000000000..2002f4d0bff43 --- /dev/null +++ b/pandas/tests/window/moments/conftest.py @@ -0,0 +1,20 @@ +import numpy as np +from numpy.random import randn +import pytest + +from pandas import Series + + +@pytest.fixture +def binary_ew_data(): + A = Series(randn(50), index=np.arange(50)) + B = A[2:] + randn(48) + + A[:10] = np.NaN + B[-10:] = np.NaN + return A, B + + +@pytest.fixture(params=[0, 1, 2]) +def min_periods(request): + return request.param diff --git a/pandas/tests/window/moments/test_moments_ewm.py b/pandas/tests/window/moments/test_moments_ewm.py new file mode 100644 index 0000000000000..599761259e041 --- /dev/null +++ b/pandas/tests/window/moments/test_moments_ewm.py @@ -0,0 +1,439 @@ +import numpy as np +from numpy.random import randn +import pytest + +import pandas as pd +from pandas import DataFrame, Series, concat +import pandas._testing as tm +from pandas.tests.window.common import ( + Base, + ConsistencyBase, + check_binary_ew, + check_binary_ew_min_periods, + ew_func, +) + + +@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") +class TestMoments(Base): + def setup_method(self, method): + self._create_data() + + def test_ewma(self): + self._check_ew(name="mean") + + vals = pd.Series(np.zeros(1000)) + vals[5] = 1 + result = vals.ewm(span=100, adjust=False).mean().sum() + assert np.abs(result - 1) < 1e-2 + + 
@pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) + def test_ewma_cases(self, adjust, ignore_na): + # try adjust/ignore_na args matrix + + s = Series([1.0, 2.0, 4.0, 8.0]) + + if adjust: + expected = Series([1.0, 1.6, 2.736842, 4.923077]) + else: + expected = Series([1.0, 1.333333, 2.222222, 4.148148]) + + result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() + tm.assert_series_equal(result, expected) + + def test_ewma_nan_handling(self): + s = Series([1.0] + [np.nan] * 5 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([1.0] * len(s))) + + s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) + + # GH 7603 + s0 = Series([np.nan, 1.0, 101.0]) + s1 = Series([1.0, np.nan, 101.0]) + s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) + s3 = Series([1.0, np.nan, 101.0, 50.0]) + com = 2.0 + alpha = 1.0 / (1.0 + com) + + def simple_wma(s, w): + return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") + + for (s, adjust, ignore_na, w) in [ + (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), + (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), + (s0, False, False, [np.nan, (1.0 - alpha), alpha]), + (s0, False, True, [np.nan, (1.0 - alpha), alpha]), + (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), + (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), + (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), + (s1, False, True, [(1.0 - alpha), np.nan, alpha]), + ( + s2, + True, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], + ), + (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), + ( + s2, + False, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], + ), + (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), + (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), + (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), + ( + s3, + False, + False, + [ + (1.0 - alpha) ** 3, + np.nan, + (1.0 - alpha) * alpha, + alpha * ((1.0 - alpha) ** 2 + alpha), + ], + ), + ( + s3, + False, + True, + [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], + ), + ]: + expected = simple_wma(s, Series(w)) + result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() + + tm.assert_series_equal(result, expected) + if ignore_na is False: + # check that ignore_na defaults to False + result = s.ewm(com=com, adjust=adjust).mean() + tm.assert_series_equal(result, expected) + + def test_ewmvar(self): + self._check_ew(name="var") + + def test_ewmvol(self): + self._check_ew(name="vol") + + def test_ewma_span_com_args(self): + A = self.series.ewm(com=9.5).mean() + B = self.series.ewm(span=20).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20) + with pytest.raises(ValueError): + self.series.ewm().mean() + + def test_ewma_halflife_arg(self): + A = self.series.ewm(com=13.932726172912965).mean() + B = self.series.ewm(halflife=10.0).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm() + + def test_ewm_alpha(self): + # GH 10789 + s = Series(self.arr) + a = 
s.ewm(alpha=0.61722699889169674).mean() + b = s.ewm(com=0.62014947789973052).mean() + c = s.ewm(span=2.240298955799461).mean() + d = s.ewm(halflife=0.721792864318).mean() + tm.assert_series_equal(a, b) + tm.assert_series_equal(a, c) + tm.assert_series_equal(a, d) + + def test_ewm_alpha_arg(self): + # GH 10789 + s = self.series + with pytest.raises(ValueError): + s.ewm() + with pytest.raises(ValueError): + s.ewm(com=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(span=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(halflife=10.0, alpha=0.5) + + def test_ewm_domain_checks(self): + # GH 12492 + s = Series(self.arr) + msg = "comass must satisfy: comass >= 0" + with pytest.raises(ValueError, match=msg): + s.ewm(com=-0.1) + s.ewm(com=0.0) + s.ewm(com=0.1) + + msg = "span must satisfy: span >= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(span=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.0) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.9) + s.ewm(span=1.0) + s.ewm(span=1.1) + + msg = "halflife must satisfy: halflife > 0" + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=0.0) + s.ewm(halflife=0.1) + + msg = "alpha must satisfy: 0 < alpha <= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=0.0) + s.ewm(alpha=0.1) + s.ewm(alpha=1.0) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=1.1) + + @pytest.mark.parametrize("method", ["mean", "vol", "var"]) + def test_ew_empty_series(self, method): + vals = pd.Series([], dtype=np.float64) + + ewm = vals.ewm(3) + result = getattr(ewm, method)() + tm.assert_almost_equal(result, vals) + + def _check_ew(self, name=None, preserve_nan=False): + series_result = getattr(self.series.ewm(com=10), name)() + assert isinstance(series_result, Series) + + frame_result = getattr(self.frame.ewm(com=10), name)() + assert type(frame_result) == DataFrame + + result = getattr(self.series.ewm(com=10), name)() + if preserve_nan: + assert result[self._nan_locs].isna().all() + + @pytest.mark.parametrize("min_periods", [0, 1]) + @pytest.mark.parametrize("name", ["mean", "var", "vol"]) + def test_ew_min_periods(self, min_periods, name): + # excluding NaNs correctly + arr = randn(50) + arr[:10] = np.NaN + arr[-10:] = np.NaN + s = Series(arr) + + # check min_periods + # GH 7898 + result = getattr(s.ewm(com=50, min_periods=2), name)() + assert result[:11].isna().all() + assert not result[11:].isna().any() + + result = getattr(s.ewm(com=50, min_periods=min_periods), name)() + if name == "mean": + assert result[:10].isna().all() + assert not result[10:].isna().any() + else: + # ewm.std, ewm.vol, ewm.var (with bias=False) require at least + # two values + assert result[:11].isna().all() + assert not result[11:].isna().any() + + # check series of length 0 + result = getattr( + Series(dtype=object).ewm(com=50, min_periods=min_periods), name + )() + tm.assert_series_equal(result, Series(dtype="float64")) + + # check series of length 1 + result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() + if name == "mean": + tm.assert_series_equal(result, Series([1.0])) + else: + # ewm.std, ewm.vol, ewm.var with bias=False require at least + # two values + tm.assert_series_equal(result, Series([np.NaN])) + + # pass in ints + result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() + assert result2.dtype == np.float_ + + +class 
TestEwmMomentsConsistency(ConsistencyBase): + def setup_method(self, method): + self._create_data() + + def test_ewmcov_pairwise(self): + self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) + + @pytest.mark.parametrize("name", ["cov", "corr"]) + def test_ewm_corr_cov(self, name, min_periods, binary_ew_data): + A, B = binary_ew_data + + check_binary_ew(name="corr", A=A, B=B) + check_binary_ew_min_periods("corr", min_periods, A, B) + + def test_ewmcorr_pairwise(self): + self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) + + @pytest.mark.parametrize("name", ["cov", "corr"]) + def test_different_input_array_raise_exception(self, name, binary_ew_data): + + A, _ = binary_ew_data + msg = "Input arrays must be of the same type!" + # exception raised is Exception + with pytest.raises(Exception, match=msg): + ew_func(A, randn(50), 20, name=name, min_periods=5) + + @pytest.mark.slow + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) + def test_ewm_consistency(self, min_periods, adjust, ignore_na): + def _weights(s, com, adjust, ignore_na): + if isinstance(s, DataFrame): + if not len(s.columns): + return DataFrame(index=s.index, columns=s.columns) + w = concat( + [ + _weights( + s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na + ) + for i, _ in enumerate(s.columns) + ], + axis=1, + ) + w.index = s.index + w.columns = s.columns + return w + + w = Series(np.nan, index=s.index) + alpha = 1.0 / (1.0 + com) + if ignore_na: + w[s.notna()] = _weights( + s[s.notna()], com=com, adjust=adjust, ignore_na=False + ) + elif adjust: + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + w.iat[i] = pow(1.0 / (1.0 - alpha), i) + else: + sum_wts = 0.0 + prev_i = -1 + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + if prev_i == -1: + w.iat[i] = 1.0 + else: + w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) + sum_wts += w.iat[i] + prev_i = i + return w + + def _variance_debiasing_factors(s, com, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + cum_sum = weights.cumsum().fillna(method="ffill") + cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") + numerator = cum_sum * cum_sum + denominator = numerator - cum_sum_sq + denominator[denominator <= 0.0] = np.nan + return numerator / denominator + + def _ewma(s, com, min_periods, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + result = ( + s.multiply(weights) + .cumsum() + .divide(weights.cumsum()) + .fillna(method="ffill") + ) + result[ + s.expanding().count() < (max(min_periods, 1) if min_periods else 1) + ] = np.nan + return result + + com = 3.0 + self._test_moments_consistency_mock_mean( + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + mock_mean=lambda x: _ewma( + x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + corr=lambda x, y: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(y), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + 
).var(bias=False) + ), + var_biased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=True) + ), + var_debiasing_factors=lambda x: ( + _variance_debiasing_factors( + x, com=com, adjust=adjust, ignore_na=ignore_na + ) + ), + ) + # test consistency between different ewm* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + corr=lambda x, y: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(y), + var_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=False) + ), + std_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=False) + ), + cov_unbiased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=False) + ), + var_biased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=True) + ), + std_biased=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=True), + cov_biased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=True) + ), + ) diff --git a/pandas/tests/window/moments/test_moments_expanding.py b/pandas/tests/window/moments/test_moments_expanding.py new file mode 100644 index 0000000000000..4596552d8f255 --- /dev/null +++ b/pandas/tests/window/moments/test_moments_expanding.py @@ -0,0 +1,409 @@ +import warnings + +import numpy as np +from numpy.random import randn +import pytest + +from pandas import DataFrame, Index, MultiIndex, Series, isna, notna +import pandas._testing as tm +from pandas.tests.window.common import ConsistencyBase + + +class TestExpandingMomentsConsistency(ConsistencyBase): + def setup_method(self, method): + self._create_data() + + def test_expanding_apply_args_kwargs(self, raw): + def mean_w_arg(x, const): + return np.mean(x) + const + + df = DataFrame(np.random.rand(20, 3)) + + expected = df.expanding().apply(np.mean, raw=raw) + 20.0 + + result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) + tm.assert_frame_equal(result, expected) + + result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) + tm.assert_frame_equal(result, expected) + + def test_expanding_corr(self): + A = self.series.dropna() + B = (A + randn(len(A)))[:-5] + + result = A.expanding().corr(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_count(self): + result = self.series.expanding().count() + tm.assert_almost_equal( + result, self.series.rolling(window=len(self.series)).count() + ) + + def test_expanding_quantile(self): + result = self.series.expanding().quantile(0.5) + + rolling_result = self.series.rolling( + window=len(self.series), min_periods=1 + ).quantile(0.5) + + tm.assert_almost_equal(result, rolling_result) + + def test_expanding_cov(self): + A = self.series + B = (A + randn(len(A)))[:-5] + + result = A.expanding().cov(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_cov_pairwise(self): + result = self.frame.expanding().corr() + + rolling_result = self.frame.rolling( + 
window=len(self.frame), min_periods=1 + ).corr() + + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_corr_pairwise(self): + result = self.frame.expanding().corr() + + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_cov_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().cov(s2) + expected = Series([None, None, 2.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().cov(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = s1.expanding().cov(s2) + expected = Series([None, None, None, 4.5]) + tm.assert_series_equal(result, expected) + + def test_expanding_corr_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().corr(s2) + expected = Series([None, None, 1.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().corr(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = s1.expanding().corr(s2) + expected = Series([None, None, None, 1.0]) + tm.assert_series_equal(result, expected) + + def test_expanding_cov_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) + df1a = DataFrame( + [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") + ) + # TODO: xref gh-15826 + # .loc is not preserving the names + result1 = df1.expanding().cov(df2, pairwise=True).loc[2] + result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] + result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-3.0, -6.0], [-5.0, -10.0]], + columns=Index(["A", "B"], name="foo"), + index=Index(["X", "Y"], name="foo"), + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) + + def test_expanding_corr_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame( + [[1, 2], [3, 2], [3, 4]], + columns=["A", "B"], + index=Index(range(3), name="bar"), + ) + df1a = DataFrame( + [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], + columns=["X", "Y"], + index=Index(range(3), name="bar"), + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] + ) + result1 = df1.expanding().corr(df2, pairwise=True).loc[2] + result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] + result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, 
expected) + + @pytest.mark.parametrize("has_min_periods", [True, False]) + @pytest.mark.parametrize( + "func,static_comp", + [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], + ids=["sum", "mean", "max", "min"], + ) + def test_expanding_func(self, func, static_comp, has_min_periods): + def expanding_func(x, min_periods=1, center=False, axis=0): + exp = x.expanding(min_periods=min_periods, center=center, axis=axis) + return getattr(exp, func)() + + self._check_expanding(expanding_func, static_comp, preserve_nan=False) + self._check_expanding_has_min_periods( + expanding_func, static_comp, has_min_periods + ) + + @pytest.mark.parametrize("has_min_periods", [True, False]) + def test_expanding_apply(self, raw, has_min_periods): + def expanding_mean(x, min_periods=1): + + exp = x.expanding(min_periods=min_periods) + result = exp.apply(lambda x: x.mean(), raw=raw) + return result + + # TODO(jreback), needed to add preserve_nan=False + # here to make this pass + self._check_expanding(expanding_mean, np.mean, preserve_nan=False) + self._check_expanding_has_min_periods(expanding_mean, np.mean, has_min_periods) + + def test_expanding_apply_empty_series(self, raw): + ser = Series([], dtype=np.float64) + tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) + + def test_expanding_apply_min_periods_0(self, raw): + # GH 8080 + s = Series([None, None, None]) + result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) + expected = Series([1.0, 2.0, 3.0]) + tm.assert_series_equal(result, expected) + + def _check_expanding(self, func, static_comp, preserve_nan=True): + + series_result = func(self.series) + assert isinstance(series_result, Series) + frame_result = func(self.frame) + assert isinstance(frame_result, DataFrame) + + result = func(self.series) + tm.assert_almost_equal(result[10], static_comp(self.series[:11])) + + if preserve_nan: + assert result.iloc[self._nan_locs].isna().all() + + def _check_expanding_has_min_periods(self, func, static_comp, has_min_periods): + ser = Series(randn(50)) + + if has_min_periods: + result = func(ser, min_periods=30) + assert result[:29].isna().all() + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + # min_periods is working correctly + result = func(ser, min_periods=15) + assert isna(result.iloc[13]) + assert notna(result.iloc[14]) + + ser2 = Series(randn(20)) + result = func(ser2, min_periods=5) + assert isna(result[3]) + assert notna(result[4]) + + # min_periods=0 + result0 = func(ser, min_periods=0) + result1 = func(ser, min_periods=1) + tm.assert_almost_equal(result0, result1) + else: + result = func(ser) + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + @pytest.mark.parametrize( + "f", + [ + lambda x: x.expanding().count(), + lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), + lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), + lambda x: x.expanding(min_periods=5).max(), + lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).sum(), + lambda x: x.expanding(min_periods=5).mean(), + lambda x: x.expanding(min_periods=5).std(), + lambda x: x.expanding(min_periods=5).var(), + lambda x: x.expanding(min_periods=5).skew(), + lambda x: x.expanding(min_periods=5).kurt(), + lambda x: x.expanding(min_periods=5).quantile(0.5), + lambda x: x.expanding(min_periods=5).median(), + lambda x: x.expanding(min_periods=5).apply(sum, raw=False), + lambda x: x.expanding(min_periods=5).apply(sum, raw=True), + ], + ) + def 
test_moment_functions_zero_length(self, f): + # GH 8056 + s = Series(dtype=np.float64) + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") + df2_expected = df2 + + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + + @pytest.mark.parametrize( + "f", + [ + lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), + lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), + ], + ) + def test_moment_functions_zero_length_pairwise(self, f): + + df1 = DataFrame() + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") + + df1_expected = DataFrame( + index=MultiIndex.from_product([df1.index, df1.columns]), columns=Index([]) + ) + df2_expected = DataFrame( + index=MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + + @pytest.mark.slow + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + def test_expanding_consistency(self, min_periods): + + # suppress warnings about empty slices, as we are deliberately testing + # with empty/0-length Series/DataFrames + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) + + # test consistency between different expanding_* moments + self._test_moments_consistency_mock_mean( + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() + / x.expanding().count(), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + var_debiasing_factors=lambda x: ( + x.expanding().count() + / (x.expanding().count() - 1.0).replace(0.0, np.nan) + ), + ) + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), + cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), + cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( + y, ddof=0 + ), + ) + + # test consistency between expanding_xyz() and either (a) + # expanding_apply of Series.xyz(), or (b) expanding_apply of + # np.nanxyz() + for (x, is_constant, no_nans) in self.data: + functions = self.base_functions + + # GH 8269 + if no_nans: + functions = self.base_functions + self.no_nan_functions + for (f, require_min_periods, name) in functions: + expanding_f = 
getattr(x.expanding(min_periods=min_periods), name) + + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): + continue + + if name == "count": + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding(min_periods=0).apply( + func=f, raw=True + ) + else: + if name in ["cov", "corr"]: + expanding_f_result = expanding_f(pairwise=False) + else: + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding( + min_periods=min_periods + ).apply(func=f, raw=True) + + # GH 9422 + if name in ["sum", "prod"]: + tm.assert_equal(expanding_f_result, expanding_apply_f_result) diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/moments/test_moments_rolling.py similarity index 52% rename from pandas/tests/window/test_moments.py rename to pandas/tests/window/moments/test_moments_rolling.py index 3d6cd7d10bd10..9acb4ffcb40b8 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -9,10 +9,10 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Index, Series, concat, isna, notna +from pandas import DataFrame, Index, Series, isna, notna +import pandas._testing as tm from pandas.core.window.common import _flex_binary_moment -from pandas.tests.window.common import Base -import pandas.util.testing as tm +from pandas.tests.window.common import Base, ConsistencyBase import pandas.tseries.offsets as offsets @@ -108,7 +108,7 @@ def test_cmov_window_corner(self): assert np.isnan(result).all() # empty - vals = pd.Series([]) + vals = pd.Series([], dtype=object) result = vals.rolling(5, center=True, win_type="boxcar").mean() assert len(result) == 0 @@ -119,64 +119,95 @@ def test_cmov_window_corner(self): assert len(result) == 5 @td.skip_if_no_scipy - def test_cmov_window_frame(self): + @pytest.mark.parametrize( + "f,xp", + [ + ( + "mean", + [ + [np.nan, np.nan], + [np.nan, np.nan], + [9.252, 9.392], + [8.644, 9.906], + [8.87, 10.208], + [6.81, 8.588], + [7.792, 8.644], + [9.05, 7.824], + [np.nan, np.nan], + [np.nan, np.nan], + ], + ), + ( + "std", + [ + [np.nan, np.nan], + [np.nan, np.nan], + [3.789706, 4.068313], + [3.429232, 3.237411], + [3.589269, 3.220810], + [3.405195, 2.380655], + [3.281839, 2.369869], + [3.676846, 1.801799], + [np.nan, np.nan], + [np.nan, np.nan], + ], + ), + ( + "var", + [ + [np.nan, np.nan], + [np.nan, np.nan], + [14.36187, 16.55117], + [11.75963, 10.48083], + [12.88285, 10.37362], + [11.59535, 5.66752], + [10.77047, 5.61628], + [13.51920, 3.24648], + [np.nan, np.nan], + [np.nan, np.nan], + ], + ), + ( + "sum", + [ + [np.nan, np.nan], + [np.nan, np.nan], + [46.26, 46.96], + [43.22, 49.53], + [44.35, 51.04], + [34.05, 42.94], + [38.96, 43.22], + [45.25, 39.12], + [np.nan, np.nan], + [np.nan, np.nan], + ], + ), + ], + ) + def test_cmov_window_frame(self, f, xp): # Gh 8238 - vals = np.array( - [ - [12.18, 3.64], - [10.18, 9.16], - [13.24, 14.61], - [4.51, 8.11], - [6.15, 11.44], - [9.14, 6.21], - [11.31, 10.67], - [2.94, 6.51], - [9.42, 8.39], - [12.44, 7.34], - ] - ) - - xp = np.array( - [ - [np.nan, np.nan], - [np.nan, np.nan], - [9.252, 9.392], - [8.644, 9.906], - [8.87, 10.208], - [6.81, 8.588], - [7.792, 8.644], - [9.05, 7.824], - [np.nan, np.nan], - [np.nan, np.nan], - ] + df = DataFrame( + np.array( + [ + [12.18, 3.64], + [10.18, 9.16], + [13.24, 14.61], + [4.51, 8.11], + [6.15, 11.44], + [9.14, 6.21], + [11.31, 10.67], + [2.94, 6.51], + [9.42, 8.39], + [12.44, 7.34], + ] + ) ) + xp = 
DataFrame(np.array(xp)) - # DataFrame - rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).mean() - tm.assert_frame_equal(DataFrame(xp), rs) + roll = df.rolling(5, win_type="boxcar", center=True) + rs = getattr(roll, f)() - # invalid method - with pytest.raises(AttributeError): - (DataFrame(vals).rolling(5, win_type="boxcar", center=True).std()) - - # sum - xp = np.array( - [ - [np.nan, np.nan], - [np.nan, np.nan], - [46.26, 46.96], - [43.22, 49.53], - [44.35, 51.04], - [34.05, 42.94], - [38.96, 43.22], - [45.25, 39.12], - [np.nan, np.nan], - [np.nan, np.nan], - ] - ) - - rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).sum() - tm.assert_frame_equal(DataFrame(xp), rs) + tm.assert_frame_equal(xp, rs) @td.skip_if_no_scipy def test_cmov_window_na_min_periods(self): @@ -643,64 +674,6 @@ def f(x): self._check_moment_func(np.mean, name="apply", func=f, raw=raw) - expected = Series([]) - result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) - tm.assert_series_equal(result, expected) - - # gh-8080 - s = Series([None, None, None]) - result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 2.0]) - tm.assert_series_equal(result, expected) - - result = s.rolling(2, min_periods=0).apply(len, raw=raw) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("klass", [Series, DataFrame]) - @pytest.mark.parametrize( - "method", [lambda x: x.rolling(window=2), lambda x: x.expanding()] - ) - def test_apply_future_warning(self, klass, method): - - # gh-5071 - s = klass(np.arange(3)) - - with tm.assert_produces_warning(FutureWarning): - method(s).apply(lambda x: len(x)) - - def test_rolling_apply_out_of_bounds(self, raw): - # gh-1850 - vals = pd.Series([1, 2, 3, 4]) - - result = vals.rolling(10).apply(np.sum, raw=raw) - assert result.isna().all() - - result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw) - expected = pd.Series([1, 3, 6, 10], dtype=float) - tm.assert_almost_equal(result, expected) - - @pytest.mark.parametrize("window", [2, "2s"]) - def test_rolling_apply_with_pandas_objects(self, window): - # 5071 - df = pd.DataFrame( - {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, - index=pd.date_range("20130101", periods=5, freq="s"), - ) - - # we have an equal spaced timeseries index - # so simulate removing the first period - def f(x): - if x.index[0] == df.index[0]: - return np.nan - return x.iloc[-1] - - result = df.rolling(window).apply(f, raw=False) - expected = df.iloc[2:].reindex_like(df) - tm.assert_frame_equal(result, expected) - - with pytest.raises(AttributeError): - df.rolling(window).apply(f, raw=True) - def test_rolling_std(self, raw): self._check_moment_func(lambda x: np.std(x, ddof=1), name="std", raw=raw) self._check_moment_func( @@ -769,7 +742,7 @@ def _check_moment_func( has_time_rule=True, fill_value=None, zero_min_periods_equal=True, - **kwargs + **kwargs, ): # inject raw @@ -942,398 +915,6 @@ def get_result(obj, window, min_periods=None, center=False): tm.assert_series_equal(series_xp, series_rs) tm.assert_frame_equal(frame_xp, frame_rs) - def test_ewma(self): - self._check_ew(name="mean") - - vals = pd.Series(np.zeros(1000)) - vals[5] = 1 - result = vals.ewm(span=100, adjust=False).mean().sum() - assert np.abs(result - 1) < 1e-2 - - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewma_cases(self, adjust, ignore_na): - # try adjust/ignore_na args matrix - - s = Series([1.0, 2.0, 4.0, 8.0]) - - 
if adjust: - expected = Series([1.0, 1.6, 2.736842, 4.923077]) - else: - expected = Series([1.0, 1.333333, 2.222222, 4.148148]) - - result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() - tm.assert_series_equal(result, expected) - - def test_ewma_nan_handling(self): - s = Series([1.0] + [np.nan] * 5 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([1.0] * len(s))) - - s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) - - # GH 7603 - s0 = Series([np.nan, 1.0, 101.0]) - s1 = Series([1.0, np.nan, 101.0]) - s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) - s3 = Series([1.0, np.nan, 101.0, 50.0]) - com = 2.0 - alpha = 1.0 / (1.0 + com) - - def simple_wma(s, w): - return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") - - for (s, adjust, ignore_na, w) in [ - (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), - (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), - (s0, False, False, [np.nan, (1.0 - alpha), alpha]), - (s0, False, True, [np.nan, (1.0 - alpha), alpha]), - (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), - (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), - (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), - (s1, False, True, [(1.0 - alpha), np.nan, alpha]), - ( - s2, - True, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], - ), - (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), - ( - s2, - False, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], - ), - (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), - (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), - (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), - ( - s3, - False, - False, - [ - (1.0 - alpha) ** 3, - np.nan, - (1.0 - alpha) * alpha, - alpha * ((1.0 - alpha) ** 2 + alpha), - ], - ), - ( - s3, - False, - True, - [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], - ), - ]: - expected = simple_wma(s, Series(w)) - result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() - - tm.assert_series_equal(result, expected) - if ignore_na is False: - # check that ignore_na defaults to False - result = s.ewm(com=com, adjust=adjust).mean() - tm.assert_series_equal(result, expected) - - def test_ewmvar(self): - self._check_ew(name="var") - - def test_ewmvol(self): - self._check_ew(name="vol") - - def test_ewma_span_com_args(self): - A = self.series.ewm(com=9.5).mean() - B = self.series.ewm(span=20).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20) - with pytest.raises(ValueError): - self.series.ewm().mean() - - def test_ewma_halflife_arg(self): - A = self.series.ewm(com=13.932726172912965).mean() - B = self.series.ewm(halflife=10.0).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm() - - def test_ewm_alpha(self): - # GH 10789 - s = Series(self.arr) - a = s.ewm(alpha=0.61722699889169674).mean() - b = s.ewm(com=0.62014947789973052).mean() - c = s.ewm(span=2.240298955799461).mean() - d = s.ewm(halflife=0.721792864318).mean() - tm.assert_series_equal(a, b) - tm.assert_series_equal(a, c) - 
tm.assert_series_equal(a, d) - - def test_ewm_alpha_arg(self): - # GH 10789 - s = self.series - with pytest.raises(ValueError): - s.ewm() - with pytest.raises(ValueError): - s.ewm(com=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(span=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(halflife=10.0, alpha=0.5) - - def test_ewm_domain_checks(self): - # GH 12492 - s = Series(self.arr) - msg = "comass must satisfy: comass >= 0" - with pytest.raises(ValueError, match=msg): - s.ewm(com=-0.1) - s.ewm(com=0.0) - s.ewm(com=0.1) - - msg = "span must satisfy: span >= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(span=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.0) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.9) - s.ewm(span=1.0) - s.ewm(span=1.1) - - msg = "halflife must satisfy: halflife > 0" - with pytest.raises(ValueError, match=msg): - s.ewm(halflife=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(halflife=0.0) - s.ewm(halflife=0.1) - - msg = "alpha must satisfy: 0 < alpha <= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=0.0) - s.ewm(alpha=0.1) - s.ewm(alpha=1.0) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=1.1) - - @pytest.mark.parametrize("method", ["mean", "vol", "var"]) - def test_ew_empty_series(self, method): - vals = pd.Series([], dtype=np.float64) - - ewm = vals.ewm(3) - result = getattr(ewm, method)() - tm.assert_almost_equal(result, vals) - - def _check_ew(self, name=None, preserve_nan=False): - series_result = getattr(self.series.ewm(com=10), name)() - assert isinstance(series_result, Series) - - frame_result = getattr(self.frame.ewm(com=10), name)() - assert type(frame_result) == DataFrame - - result = getattr(self.series.ewm(com=10), name)() - if preserve_nan: - assert result[self._nan_locs].isna().all() - - # excluding NaNs correctly - arr = randn(50) - arr[:10] = np.NaN - arr[-10:] = np.NaN - s = Series(arr) - - # check min_periods - # GH 7898 - result = getattr(s.ewm(com=50, min_periods=2), name)() - assert result[:11].isna().all() - assert not result[11:].isna().any() - - for min_periods in (0, 1): - result = getattr(s.ewm(com=50, min_periods=min_periods), name)() - if name == "mean": - assert result[:10].isna().all() - assert not result[10:].isna().any() - else: - # ewm.std, ewm.vol, ewm.var (with bias=False) require at least - # two values - assert result[:11].isna().all() - assert not result[11:].isna().any() - - # check series of length 0 - result = getattr(Series().ewm(com=50, min_periods=min_periods), name)() - tm.assert_series_equal(result, Series()) - - # check series of length 1 - result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() - if name == "mean": - tm.assert_series_equal(result, Series([1.0])) - else: - # ewm.std, ewm.vol, ewm.var with bias=False require at least - # two values - tm.assert_series_equal(result, Series([np.NaN])) - - # pass in ints - result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() - assert result2.dtype == np.float_ - - -# create the data only once as we are not setting it -def _create_consistency_data(): - def create_series(): - return [ - Series(), - Series([np.nan]), - Series([np.nan, np.nan]), - Series([3.0]), - Series([np.nan, 3.0]), - Series([3.0, np.nan]), - Series([1.0, 3.0]), - Series([2.0, 2.0]), - Series([3.0, 1.0]), - Series( - [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] - ), - Series( - [ - np.nan, - 5.0, 
- 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - np.nan, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series(range(10)), - Series(range(20, 0, -2)), - ] - - def create_dataframes(): - return [ - DataFrame(), - DataFrame(columns=["a"]), - DataFrame(columns=["a", "a"]), - DataFrame(columns=["a", "b"]), - DataFrame(np.arange(10).reshape((5, 2))), - DataFrame(np.arange(25).reshape((5, 5))), - DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), - ] + [DataFrame(s) for s in create_series()] - - def is_constant(x): - values = x.values.ravel() - return len(set(values[notna(values)])) == 1 - - def no_nans(x): - return x.notna().all().all() - - # data is a tuple(object, is_constant, no_nans) - data = create_series() + create_dataframes() - - return [(x, is_constant(x), no_nans(x)) for x in data] - - -_consistency_data = _create_consistency_data() - def _rolling_consistency_cases(): for window in [1, 2, 3, 10, 20]: @@ -1344,363 +925,10 @@ def _rolling_consistency_cases(): yield window, min_periods, center -class TestMomentsConsistency(Base): - base_functions = [ - (lambda v: Series(v).count(), None, "count"), - (lambda v: Series(v).max(), None, "max"), - (lambda v: Series(v).min(), None, "min"), - (lambda v: Series(v).sum(), None, "sum"), - (lambda v: Series(v).mean(), None, "mean"), - (lambda v: Series(v).std(), 1, "std"), - (lambda v: Series(v).cov(Series(v)), None, "cov"), - (lambda v: Series(v).corr(Series(v)), None, "corr"), - (lambda v: Series(v).var(), 1, "var"), - # restore once GH 8086 is fixed - # lambda v: Series(v).skew(), 3, 'skew'), - # (lambda v: Series(v).kurt(), 4, 'kurt'), - # restore once GH 8084 is fixed - # lambda v: Series(v).quantile(0.3), None, 'quantile'), - (lambda v: Series(v).median(), None, "median"), - (np.nanmax, 1, "max"), - (np.nanmin, 1, "min"), - (np.nansum, 1, "sum"), - (np.nanmean, 1, "mean"), - (lambda v: np.nanstd(v, ddof=1), 1, "std"), - (lambda v: np.nanvar(v, ddof=1), 1, "var"), - (np.nanmedian, 1, "median"), - ] - no_nan_functions = [ - (np.max, None, "max"), - (np.min, None, "min"), - (np.sum, None, "sum"), - (np.mean, None, "mean"), - (lambda v: np.std(v, ddof=1), 1, "std"), - (lambda v: np.var(v, ddof=1), 1, "var"), - (np.median, None, "median"), - ] - - def _create_data(self): - super()._create_data() - self.data = _consistency_data - +class TestRollingMomentsConsistency(ConsistencyBase): def setup_method(self, method): self._create_data() - def _test_moments_consistency( - self, - min_periods, - count, - mean, - mock_mean, - corr, - var_unbiased=None, - std_unbiased=None, - cov_unbiased=None, - var_biased=None, - std_biased=None, - cov_biased=None, - var_debiasing_factors=None, - ): - def _non_null_values(x): - values = x.values.ravel() - return set(values[notna(values)].tolist()) - - for (x, is_constant, no_nans) in self.data: - count_x = count(x) - mean_x = mean(x) - - if mock_mean: - # 
check that mean equals mock_mean - expected = mock_mean(x) - tm.assert_equal(mean_x, expected.astype("float64")) - - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = corr(x, x) - - # assert _non_null_values(corr_x_x).issubset(set([1.])) - # restore once rolling_cov(x, x) is identically equal to var(x) - - if is_constant: - exp = x.max() if isinstance(x, Series) else x.max().max() - - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) - - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) - - if var_unbiased and var_biased and var_debiasing_factors: - # check variance debiasing factors - var_unbiased_x = var_unbiased(x) - var_biased_x = var_biased(x) - var_debiasing_factors_x = var_debiasing_factors(x) - tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) - - for (std, var, cov) in [ - (std_biased, var_biased, cov_biased), - (std_unbiased, var_unbiased, cov_unbiased), - ]: - - # check that var(x), std(x), and cov(x) are all >= 0 - var_x = var(x) - std_x = std(x) - assert not (var_x < 0).any().any() - assert not (std_x < 0).any().any() - if cov: - cov_x_x = cov(x, x) - assert not (cov_x_x < 0).any().any() - - # check that var(x) == cov(x, x) - tm.assert_equal(var_x, cov_x_x) - - # check that var(x) == std(x)^2 - tm.assert_equal(var_x, std_x * std_x) - - if var is var_biased: - # check that biased var(x) == mean(x^2) - mean(x)^2 - mean_x2 = mean(x * x) - tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) - - if is_constant: - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if var is var_unbiased: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) - - if isinstance(x, Series): - for (y, is_constant, no_nans) in self.data: - if not x.isna().equals(y.isna()): - # can only easily test two Series with similar - # structure - continue - - # check that cor(x, y) is symmetric - corr_x_y = corr(x, y) - corr_y_x = corr(y, x) - tm.assert_equal(corr_x_y, corr_y_x) - - if cov: - # check that cov(x, y) is symmetric - cov_x_y = cov(x, y) - cov_y_x = cov(y, x) - tm.assert_equal(cov_x_y, cov_y_x) - - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - var_x_plus_y = var(x + y) - var_y = var(y) - tm.assert_equal( - cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) - ) - - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - std_y = std(y) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if cov is cov_biased: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_y = mean(y) - mean_x_times_y = mean(x * y) - tm.assert_equal( - cov_x_y, mean_x_times_y - (mean_x * mean_y) - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewm_consistency(self, min_periods, adjust, ignore_na): - def _weights(s, com, adjust, ignore_na): - if isinstance(s, DataFrame): - if not len(s.columns): - return DataFrame(index=s.index, columns=s.columns) - w = concat( - [ - _weights( - s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na - ) - for i, _ in enumerate(s.columns) - ], - axis=1, - ) - w.index = s.index - w.columns = s.columns - return w - - w = Series(np.nan, index=s.index) - alpha = 1.0 
/ (1.0 + com) - if ignore_na: - w[s.notna()] = _weights( - s[s.notna()], com=com, adjust=adjust, ignore_na=False - ) - elif adjust: - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - w.iat[i] = pow(1.0 / (1.0 - alpha), i) - else: - sum_wts = 0.0 - prev_i = -1 - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - if prev_i == -1: - w.iat[i] = 1.0 - else: - w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) - sum_wts += w.iat[i] - prev_i = i - return w - - def _variance_debiasing_factors(s, com, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - cum_sum = weights.cumsum().fillna(method="ffill") - cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") - numerator = cum_sum * cum_sum - denominator = numerator - cum_sum_sq - denominator[denominator <= 0.0] = np.nan - return numerator / denominator - - def _ewma(s, com, min_periods, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - result = ( - s.multiply(weights) - .cumsum() - .divide(weights.cumsum()) - .fillna(method="ffill") - ) - result[ - s.expanding().count() < (max(min_periods, 1) if min_periods else 1) - ] = np.nan - return result - - com = 3.0 - # test consistency between different ewm* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean(), - mock_mean=lambda x: _ewma( - x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ), - corr=lambda x, y: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).corr(y), - var_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=False) - ), - std_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=False) - ), - cov_unbiased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=False) - ), - var_biased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=True) - ), - std_biased=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=True), - cov_biased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=True) - ), - var_debiasing_factors=lambda x: ( - _variance_debiasing_factors( - x, com=com, adjust=adjust, ignore_na=ignore_na - ) - ), - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - def test_expanding_consistency(self, min_periods): - - # suppress warnings about empty slices, as we are deliberately testing - # with empty/0-length Series/DataFrames - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning, - ) - - # test consistency between different expanding_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.expanding(min_periods=min_periods).mean(), - mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() - / x.expanding().count(), - corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), - var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), - std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), - cov_unbiased=lambda 
x, y: x.expanding(min_periods=min_periods).cov(y), - var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), - std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), - cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( - y, ddof=0 - ), - var_debiasing_factors=lambda x: ( - x.expanding().count() - / (x.expanding().count() - 1.0).replace(0.0, np.nan) - ), - ) - - # test consistency between expanding_xyz() and either (a) - # expanding_apply of Series.xyz(), or (b) expanding_apply of - # np.nanxyz() - for (x, is_constant, no_nans) in self.data: - functions = self.base_functions - - # GH 8269 - if no_nans: - functions = self.base_functions + self.no_nan_functions - for (f, require_min_periods, name) in functions: - expanding_f = getattr(x.expanding(min_periods=min_periods), name) - - if ( - require_min_periods - and (min_periods is not None) - and (min_periods < require_min_periods) - ): - continue - - if name == "count": - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding(min_periods=0).apply( - func=f, raw=True - ) - else: - if name in ["cov", "corr"]: - expanding_f_result = expanding_f(pairwise=False) - else: - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding( - min_periods=min_periods - ).apply(func=f, raw=True) - - # GH 9422 - if name in ["sum", "prod"]: - tm.assert_equal(expanding_f_result, expanding_apply_f_result) - @pytest.mark.slow @pytest.mark.parametrize( "window,min_periods,center", list(_rolling_consistency_cases()) @@ -1717,9 +945,7 @@ def test_rolling_consistency(self, window, min_periods, center): ) # test consistency between different rolling_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: (x.rolling(window=window, center=center).count()), + self._test_moments_consistency_mock_mean( mean=lambda x: ( x.rolling( window=window, min_periods=min_periods, center=center @@ -1734,41 +960,34 @@ def test_rolling_consistency(self, window, min_periods, center): ).count() ) ), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: (x.rolling(window=window, center=center).count()), + mean=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), corr=lambda x, y: ( x.rolling( window=window, min_periods=min_periods, center=center ).corr(y) ), + ) + + self._test_moments_consistency_var_debiasing_factors( var_unbiased=lambda x: ( x.rolling( window=window, min_periods=min_periods, center=center ).var() ), - std_unbiased=lambda x: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).std() - ), - cov_unbiased=lambda x, y: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).cov(y) - ), var_biased=lambda x: ( x.rolling( window=window, min_periods=min_periods, center=center ).var(ddof=0) ), - std_biased=lambda x: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).std(ddof=0) - ), - cov_biased=lambda x, y: ( - x.rolling( - window=window, min_periods=min_periods, center=center - ).cov(y, ddof=0) - ), var_debiasing_factors=lambda x: ( x.rolling(window=window, center=center) .count() @@ -1780,19 +999,64 @@ def test_rolling_consistency(self, window, min_periods, center): ), ) - # test consistency between rolling_xyz() and either (a) - # rolling_apply of Series.xyz(), or (b) rolling_apply of - # np.nanxyz() - for (x, is_constant, no_nans) in self.data: - functions = self.base_functions - - # GH 8269 - if no_nans: - 
functions = self.base_functions + self.no_nan_functions - for (f, require_min_periods, name) in functions: - rolling_f = getattr( - x.rolling( - window=window, center=center, min_periods=min_periods + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: (x.rolling(window=window, center=center).count()), + mean=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), + corr=lambda x, y: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).corr(y) + ), + var_unbiased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var() + ), + std_unbiased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).std() + ), + cov_unbiased=lambda x, y: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).cov(y) + ), + var_biased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=0) + ), + std_biased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).std(ddof=0) + ), + cov_biased=lambda x, y: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).cov(y, ddof=0) + ), + ) + + # test consistency between rolling_xyz() and either (a) + # rolling_apply of Series.xyz(), or (b) rolling_apply of + # np.nanxyz() + for (x, is_constant, no_nans) in self.data: + functions = self.base_functions + + # GH 8269 + if no_nans: + functions = self.base_functions + self.no_nan_functions + for (f, require_min_periods, name) in functions: + rolling_f = getattr( + x.rolling( + window=window, center=center, min_periods=min_periods ), name, ) @@ -1860,22 +1124,12 @@ def test_rolling_corr_with_zero_variance(self, window): assert s.rolling(window=window).corr(other=other).isna().all() - def _check_pairwise_moment(self, dispatch, name, **kwargs): - def get_result(obj, obj2=None): - return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) - - result = get_result(self.frame) - result = result.loc[(slice(None), 1), 5] - result.index = result.index.droplevel(1) - expected = get_result(self.frame[1], self.frame[5]) - tm.assert_series_equal(result, expected, check_names=False) - def test_flex_binary_moment(self): # GH3155 # don't blow the stack msg = ( - "arguments to moment function must be of type" - " np.ndarray/Series/DataFrame" + "arguments to moment function must be of type " + "np.ndarray/Series/DataFrame" ) with pytest.raises(TypeError, match=msg): _flex_binary_moment(5, 6, None) @@ -1930,155 +1184,6 @@ def test_flex_binary_frame(self, method): ) tm.assert_frame_equal(res3, exp) - def test_ewmcov(self): - self._check_binary_ew("cov") - - def test_ewmcov_pairwise(self): - self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) - - def test_ewmcorr(self): - self._check_binary_ew("corr") - - def test_ewmcorr_pairwise(self): - self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) - - def _check_binary_ew(self, name): - def func(A, B, com, **kwargs): - return getattr(A.ewm(com, **kwargs), name)(B) - - A = Series(randn(50), index=np.arange(50)) - B = A[2:] + randn(48) - - A[:10] = np.NaN - B[-10:] = np.NaN - - result = func(A, B, 20, min_periods=5) - assert np.isnan(result.values[:14]).all() - assert not np.isnan(result.values[14:]).any() - - # GH 7898 - for min_periods in (0, 1, 2): - result = func(A, B, 20, min_periods=min_periods) - # binary functions (ewmcov, ewmcorr) with bias=False require at - # least two values - assert 
np.isnan(result.values[:11]).all() - assert not np.isnan(result.values[11:]).any() - - # check series of length 0 - result = func(Series([]), Series([]), 50, min_periods=min_periods) - tm.assert_series_equal(result, Series([])) - - # check series of length 1 - result = func(Series([1.0]), Series([1.0]), 50, min_periods=min_periods) - tm.assert_series_equal(result, Series([np.NaN])) - - msg = "Input arrays must be of the same type!" - # exception raised is Exception - with pytest.raises(Exception, match=msg): - func(A, randn(50), 20, min_periods=5) - - def test_expanding_apply_args_kwargs(self, raw): - def mean_w_arg(x, const): - return np.mean(x) + const - - df = DataFrame(np.random.rand(20, 3)) - - expected = df.expanding().apply(np.mean, raw=raw) + 20.0 - - result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) - tm.assert_frame_equal(result, expected) - - result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) - tm.assert_frame_equal(result, expected) - - def test_expanding_corr(self): - A = self.series.dropna() - B = (A + randn(len(A)))[:-5] - - result = A.expanding().corr(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_count(self): - result = self.series.expanding().count() - tm.assert_almost_equal( - result, self.series.rolling(window=len(self.series)).count() - ) - - def test_expanding_quantile(self): - result = self.series.expanding().quantile(0.5) - - rolling_result = self.series.rolling( - window=len(self.series), min_periods=1 - ).quantile(0.5) - - tm.assert_almost_equal(result, rolling_result) - - def test_expanding_cov(self): - A = self.series - B = (A + randn(len(A)))[:-5] - - result = A.expanding().cov(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_cov_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_corr_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_cov_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().cov(s2) - expected = Series([None, None, 2.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.expanding().cov(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().cov(s2) - expected = Series([None, None, None, 4.5]) - tm.assert_series_equal(result, expected) - - def test_expanding_corr_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().corr(s2) - expected = Series([None, None, 1.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.expanding().corr(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.0]) - tm.assert_series_equal(result, expected) - def 
test_rolling_cov_diff_length(self): # GH 7512 s1 = Series([1, 2, 3], index=[0, 1, 2]) @@ -2106,8 +1211,8 @@ def test_rolling_corr_diff_length(self): @pytest.mark.parametrize( "f", [ - lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=False)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=False)), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), lambda x: x.rolling(window=10, min_periods=5).max(), lambda x: x.rolling(window=10, min_periods=5).min(), lambda x: x.rolling(window=10, min_periods=5).sum(), @@ -2123,6 +1228,7 @@ def test_rolling_corr_diff_length(self): lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), ], ) + @td.skip_if_no_scipy def test_rolling_functions_window_non_shrinkage(self, f): # GH 7764 s = Series(range(4)) @@ -2130,16 +1236,11 @@ def test_rolling_functions_window_non_shrinkage(self, f): df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=["A", "B"]) df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) - try: - s_result = f(s) - tm.assert_series_equal(s_result, s_expected) - - df_result = f(df) - tm.assert_frame_equal(df_result, df_expected) - except (ImportError): + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) - # scipy needed for rolling_window - pytest.skip("scipy not available") + df_result = f(df) + tm.assert_frame_equal(df_result, df_expected) def test_rolling_functions_window_non_shrinkage_binary(self): @@ -2164,154 +1265,6 @@ def test_rolling_functions_window_non_shrinkage_binary(self): df_result = f(df) tm.assert_frame_equal(df_result, df_expected) - def test_moment_functions_zero_length(self): - # GH 8056 - s = Series() - s_expected = s - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=["a"]) - df2["a"] = df2["a"].astype("float64") - df2_expected = df2 - - functions = [ - lambda x: x.expanding().count(), - lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), - lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), - lambda x: x.expanding(min_periods=5).max(), - lambda x: x.expanding(min_periods=5).min(), - lambda x: x.expanding(min_periods=5).sum(), - lambda x: x.expanding(min_periods=5).mean(), - lambda x: x.expanding(min_periods=5).std(), - lambda x: x.expanding(min_periods=5).var(), - lambda x: x.expanding(min_periods=5).skew(), - lambda x: x.expanding(min_periods=5).kurt(), - lambda x: x.expanding(min_periods=5).quantile(0.5), - lambda x: x.expanding(min_periods=5).median(), - lambda x: x.expanding(min_periods=5).apply(sum, raw=False), - lambda x: x.expanding(min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(window=10).count(), - lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).max(), - lambda x: x.rolling(window=10, min_periods=5).min(), - lambda x: x.rolling(window=10, min_periods=5).sum(), - lambda x: x.rolling(window=10, min_periods=5).mean(), - lambda x: x.rolling(window=10, min_periods=5).std(), - lambda x: x.rolling(window=10, min_periods=5).var(), - lambda x: x.rolling(window=10, min_periods=5).skew(), - lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), - lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), - lambda x: x.rolling(window=10, 
min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - ] - for f in functions: - try: - s_result = f(s) - tm.assert_series_equal(s_result, s_expected) - - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - except (ImportError): - - # scipy needed for rolling_window - continue - - def test_moment_functions_zero_length_pairwise(self): - - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) - df2["a"] = df2["a"].astype("float64") - - df1_expected = DataFrame( - index=pd.MultiIndex.from_product([df1.index, df1.columns]), - columns=Index([]), - ) - df2_expected = DataFrame( - index=pd.MultiIndex.from_product( - [df2.index, df2.columns], names=["bar", "foo"] - ), - columns=Index(["a"], name="foo"), - dtype="float64", - ) - - functions = [ - lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), - lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), - ] - for f in functions: - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - - def test_expanding_cov_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) - df1a = DataFrame( - [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") - ) - # TODO: xref gh-15826 - # .loc is not preserving the names - result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] - result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-3.0, -6.0], [-5.0, -10.0]], - columns=Index(["A", "B"], name="foo"), - index=Index(["X", "Y"], name="foo"), - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - - def test_expanding_corr_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame( - [[1, 2], [3, 2], [3, 4]], - columns=["A", "B"], - index=Index(range(3), name="bar"), - ) - df1a = DataFrame( - [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], - columns=["X", "Y"], - index=Index(range(3), name="bar"), - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] - ) - result1 = df1.expanding().corr(df2, pairwise=True).loc[2] - result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] - result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - def test_rolling_skew_edge_cases(self): all_nan = Series([np.NaN] * 5) @@ 
-2362,83 +1315,6 @@ def test_rolling_kurt_eq_value_fperr(self): a = Series([1.1] * 15).rolling(window=10).kurt() assert np.isnan(a).all() - @pytest.mark.parametrize( - "func,static_comp", - [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], - ids=["sum", "mean", "max", "min"], - ) - def test_expanding_func(self, func, static_comp): - def expanding_func(x, min_periods=1, center=False, axis=0): - exp = x.expanding(min_periods=min_periods, center=center, axis=axis) - return getattr(exp, func)() - - self._check_expanding(expanding_func, static_comp, preserve_nan=False) - - def test_expanding_apply(self, raw): - def expanding_mean(x, min_periods=1): - - exp = x.expanding(min_periods=min_periods) - result = exp.apply(lambda x: x.mean(), raw=raw) - return result - - # TODO(jreback), needed to add preserve_nan=False - # here to make this pass - self._check_expanding(expanding_mean, np.mean, preserve_nan=False) - - ser = Series([]) - tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) - - # GH 8080 - s = Series([None, None, None]) - result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 3.0]) - tm.assert_series_equal(result, expected) - - def _check_expanding( - self, - func, - static_comp, - has_min_periods=True, - has_time_rule=True, - preserve_nan=True, - ): - - series_result = func(self.series) - assert isinstance(series_result, Series) - frame_result = func(self.frame) - assert isinstance(frame_result, DataFrame) - - result = func(self.series) - tm.assert_almost_equal(result[10], static_comp(self.series[:11])) - - if preserve_nan: - assert result.iloc[self._nan_locs].isna().all() - - ser = Series(randn(50)) - - if has_min_periods: - result = func(ser, min_periods=30) - assert result[:29].isna().all() - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - - # min_periods is working correctly - result = func(ser, min_periods=15) - assert isna(result.iloc[13]) - assert notna(result.iloc[14]) - - ser2 = Series(randn(20)) - result = func(ser2, min_periods=5) - assert isna(result[3]) - assert notna(result[4]) - - # min_periods=0 - result0 = func(ser, min_periods=0) - result1 = func(ser, min_periods=1) - tm.assert_almost_equal(result0, result1) - else: - result = func(ser) - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - def test_rolling_max_gh6297(self): """Replicate result expected in GH #6297""" @@ -2560,3 +1436,76 @@ def test_rolling_min_max_numeric_types(self): assert result.dtypes[0] == np.dtype("f8") result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min() assert result.dtypes[0] == np.dtype("f8") + + def test_moment_functions_zero_length(self): + # GH 8056 + s = Series(dtype=np.float64) + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") + df2_expected = df2 + + functions = [ + lambda x: x.rolling(window=10).count(), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).max(), + lambda x: x.rolling(window=10, min_periods=5).min(), + lambda x: x.rolling(window=10, min_periods=5).sum(), + lambda x: x.rolling(window=10, min_periods=5).mean(), + lambda x: x.rolling(window=10, min_periods=5).std(), + lambda x: x.rolling(window=10, min_periods=5).var(), + lambda x: x.rolling(window=10, min_periods=5).skew(), + lambda x: x.rolling(window=10, 
min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), + lambda x: x.rolling(window=10, min_periods=5).median(), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ] + for f in functions: + try: + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + except (ImportError): + + # scipy needed for rolling_window + continue + + def test_moment_functions_zero_length_pairwise(self): + + df1 = DataFrame() + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") + + df1_expected = DataFrame( + index=pd.MultiIndex.from_product([df1.index, df1.columns]), + columns=Index([]), + ) + df2_expected = DataFrame( + index=pd.MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + functions = [ + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] + + for f in functions: + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 11527efa4c39f..5e70e13209de5 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -1,6 +1,4 @@ from collections import OrderedDict -import warnings -from warnings import catch_warnings import numpy as np import pytest @@ -9,9 +7,9 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, concat +import pandas._testing as tm from pandas.core.base import SpecificationError from pandas.tests.window.common import Base -import pandas.util.testing as tm class TestApi(Base): @@ -82,7 +80,6 @@ def test_agg(self): a_sum = r["A"].sum() b_mean = r["B"].mean() b_std = r["B"].std() - b_sum = r["B"].sum() result = r.aggregate([np.mean, np.std]) expected = concat([a_mean, a_std, b_mean, b_std], axis=1) @@ -104,26 +101,18 @@ def test_agg(self): expected.columns = ["mean", "sum"] tm.assert_frame_equal(result, expected) - with catch_warnings(record=True): + msg = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): # using a dict with renaming - warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - expected = concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) - tm.assert_frame_equal(result, expected, check_like=True) + r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate( + with pytest.raises(SpecificationError, match=msg): + r.aggregate( { "A": {"mean": "mean", "sum": "sum"}, "B": {"mean2": "mean", "sum2": "sum"}, } ) - expected = concat([a_mean, a_sum, b_mean, b_sum], axis=1) - exp_cols = [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] - expected.columns = pd.MultiIndex.from_tuples(exp_cols) - tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) 
expected = concat([a_mean, a_std, b_mean, b_std], axis=1) @@ -168,7 +157,7 @@ def test_agg_nested_dicts(self): df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) - msg = r"cannot perform renaming for (r1|r2) with a nested dictionary" + msg = "nested renamer is not supported" with pytest.raises(SpecificationError, match=msg): r.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) @@ -178,25 +167,13 @@ def test_agg_nested_dicts(self): expected.columns = pd.MultiIndex.from_tuples( [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] ) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r[["A", "B"]].agg( + with pytest.raises(SpecificationError, match=msg): + r[["A", "B"]].agg( {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} ) - tm.assert_frame_equal(result, expected, check_like=True) - with catch_warnings(record=True): - warnings.simplefilter("ignore", FutureWarning) - result = r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) - expected.columns = pd.MultiIndex.from_tuples( - [ - ("A", "ra", "mean"), - ("A", "ra", "std"), - ("B", "rb", "mean"), - ("B", "rb", "std"), - ] - ) - tm.assert_frame_equal(result, expected, check_like=True) + with pytest.raises(SpecificationError, match=msg): + r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) def test_count_nonnumeric_types(self): # GH12541 diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py new file mode 100644 index 0000000000000..7132e64c1191c --- /dev/null +++ b/pandas/tests/window/test_apply.py @@ -0,0 +1,140 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series, Timestamp, date_range +import pandas._testing as tm + + +@pytest.mark.parametrize("bad_raw", [None, 1, 0]) +def test_rolling_apply_invalid_raw(bad_raw): + with pytest.raises(ValueError, match="raw parameter must be `True` or `False`"): + Series(range(3)).rolling(1).apply(len, raw=bad_raw) + + +def test_rolling_apply_out_of_bounds(engine_and_raw): + # gh-1850 + engine, raw = engine_and_raw + + vals = Series([1, 2, 3, 4]) + + result = vals.rolling(10).apply(np.sum, engine=engine, raw=raw) + assert result.isna().all() + + result = vals.rolling(10, min_periods=1).apply(np.sum, engine=engine, raw=raw) + expected = Series([1, 3, 6, 10], dtype=float) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize("window", [2, "2s"]) +def test_rolling_apply_with_pandas_objects(window): + # 5071 + df = DataFrame( + {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, + index=date_range("20130101", periods=5, freq="s"), + ) + + # we have an equal spaced timeseries index + # so simulate removing the first period + def f(x): + if x.index[0] == df.index[0]: + return np.nan + return x.iloc[-1] + + result = df.rolling(window).apply(f, raw=False) + expected = df.iloc[2:].reindex_like(df) + tm.assert_frame_equal(result, expected) + + with pytest.raises(AttributeError): + df.rolling(window).apply(f, raw=True) + + +def test_rolling_apply(engine_and_raw): + engine, raw = engine_and_raw + + expected = Series([], dtype="float64") + result = expected.rolling(10).apply(lambda x: x.mean(), engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + # gh-8080 + s = Series([None, None, None]) + result = s.rolling(2, min_periods=0).apply(lambda x: len(x), engine=engine, raw=raw) + expected = Series([1.0, 2.0, 2.0]) + 
tm.assert_series_equal(result, expected) + + result = s.rolling(2, min_periods=0).apply(len, engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + +def test_all_apply(engine_and_raw): + engine, raw = engine_and_raw + + df = ( + DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ).set_index("A") + * 2 + ) + er = df.rolling(window=1) + r = df.rolling(window="1s") + + result = r.apply(lambda x: 1, engine=engine, raw=raw) + expected = er.apply(lambda x: 1, engine=engine, raw=raw) + tm.assert_frame_equal(result, expected) + + +def test_ragged_apply(engine_and_raw): + engine, raw = engine_and_raw + + df = DataFrame({"B": range(5)}) + df.index = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + + f = lambda x: 1 + result = df.rolling(window="1s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + +def test_invalid_engine(): + with pytest.raises(ValueError, match="engine must be either 'numba' or 'cython'"): + Series(range(1)).rolling(1).apply(lambda x: x, engine="foo") + + +def test_invalid_engine_kwargs_cython(): + with pytest.raises(ValueError, match="cython engine does not accept engine_kwargs"): + Series(range(1)).rolling(1).apply( + lambda x: x, engine="cython", engine_kwargs={"nopython": False} + ) + + +def test_invalid_raw_numba(): + with pytest.raises( + ValueError, match="raw must be `True` when using the numba engine" + ): + Series(range(1)).rolling(1).apply(lambda x: x, raw=False, engine="numba") + + +@td.skip_if_no("numba") +def test_invalid_kwargs_nopython(): + with pytest.raises(ValueError, match="numba does not support kwargs with"): + Series(range(1)).rolling(1).apply( + lambda x: x, kwargs={"a": 1}, engine="numba", raw=True + ) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py new file mode 100644 index 0000000000000..606520c6d68ca --- /dev/null +++ b/pandas/tests/window/test_base_indexer.py @@ -0,0 +1,82 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm +from pandas.api.indexers import BaseIndexer +from pandas.core.window.indexers import ExpandingIndexer + + +def test_bad_get_window_bounds_signature(): + class BadIndexer(BaseIndexer): + def get_window_bounds(self): + return None + + indexer = BadIndexer() + with pytest.raises(ValueError, match="BadIndexer does not implement"): + Series(range(5)).rolling(indexer) + + +def test_expanding_indexer(): + s = Series(range(10)) + indexer = ExpandingIndexer() + result = s.rolling(indexer).mean() + expected = s.expanding().mean() + tm.assert_series_equal(result, expected) + + +def test_indexer_constructor_arg(): + # Example found in computation.rst + use_expanding = [True, False, True, False, True] + df = DataFrame({"values": range(5)}) + + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + for 
i in range(num_values): + if self.use_expanding[i]: + start[i] = 0 + end[i] = i + 1 + else: + start[i] = i + end[i] = i + self.window_size + return start, end + + indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + result = df.rolling(indexer).sum() + expected = DataFrame({"values": [0.0, 1.0, 3.0, 3.0, 10.0]}) + tm.assert_frame_equal(result, expected) + + +def test_indexer_accepts_rolling_args(): + df = DataFrame({"values": range(5)}) + + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + for i in range(num_values): + if center and min_periods == 1 and closed == "both" and i == 2: + start[i] = 0 + end[i] = num_values + else: + start[i] = i + end[i] = i + self.window_size + return start, end + + indexer = CustomIndexer(window_size=1) + result = df.rolling(indexer, center=True, min_periods=1, closed="both").sum() + expected = DataFrame({"values": [0.0, 1.0, 10.0, 3.0, 4.0]}) + tm.assert_frame_equal(result, expected) + + +def test_win_type_not_implemented(): + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + return np.array([0, 1]), np.array([1, 2]) + + df = DataFrame({"values": range(2)}) + indexer = CustomIndexer() + with pytest.raises(NotImplementedError, match="BaseIndexer subclasses not"): + df.rolling(indexer, win_type="boxcar") diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 9d023034c570a..b1c9b66ab09d3 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -4,8 +4,8 @@ import pytest from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.base import DataError -import pandas.util.testing as tm # gh-12373 : rolling functions error on float32 data # make sure rolling functions works for different dtypes diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 098acdff93ac6..fc4bd50f25c73 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -5,9 +5,9 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.window import Expanding from pandas.tests.window.common import Base -import pandas.util.testing as tm class TestExpanding(Base): diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index b726bd3e3c8a7..355ef3a90d424 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -3,7 +3,8 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.groupby.groupby import get_groupby class TestGrouperGrouping: @@ -13,18 +14,18 @@ def setup_method(self, method): def test_mutated(self): - msg = r"group\(\) got an unexpected keyword argument 'foo'" + msg = r"groupby\(\) got an unexpected keyword argument 'foo'" with pytest.raises(TypeError, match=msg): self.frame.groupby("A", foo=1) g = self.frame.groupby("A") assert not g.mutated - g = self.frame.groupby("A", mutated=True) + g = get_groupby(self.frame, by="A", mutated=True) assert g.mutated def test_getitem(self): g = self.frame.groupby("A") - g_mutated = self.frame.groupby("A", mutated=True) + g_mutated = get_groupby(self.frame, by="A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) @@ -45,7 +46,7 @@ def 
test_getitem_multiple(self): # GH 13174 g = self.frame.groupby("A") r = g.rolling(2) - g_mutated = self.frame.groupby("A", mutated=True) + g_mutated = get_groupby(self.frame, by="A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) result = r.B.count() @@ -59,7 +60,6 @@ def test_rolling(self): r = g.rolling(window=4) for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: - result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) tm.assert_frame_equal(result, expected) @@ -69,8 +69,16 @@ def test_rolling(self): expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) tm.assert_frame_equal(result, expected) - result = r.quantile(0.5) - expected = g.apply(lambda x: x.rolling(4).quantile(0.5)) + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] + ) + def test_rolling_quantile(self, interpolation): + g = self.frame.groupby("A") + r = g.rolling(window=4) + result = r.quantile(0.4, interpolation=interpolation) + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) tm.assert_frame_equal(result, expected) def test_rolling_corr_cov(self): @@ -141,8 +149,16 @@ def test_expanding(self): expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) tm.assert_frame_equal(result, expected) - result = r.quantile(0.5) - expected = g.apply(lambda x: x.expanding().quantile(0.5)) + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] + ) + def test_expanding_quantile(self, interpolation): + g = self.frame.groupby("A") + r = g.expanding() + result = r.quantile(0.4, interpolation=interpolation) + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) tm.assert_frame_equal(result, expected) def test_expanding_corr_cov(self): diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py new file mode 100644 index 0000000000000..cc8aef1779b46 --- /dev/null +++ b/pandas/tests/window/test_numba.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import Series +import pandas._testing as tm + + +@td.skip_if_no("numba", "0.46.0") +@pytest.mark.filterwarnings("ignore:\\nThe keyword argument") +# Filter warnings when parallel=True and the function can't be parallelized by Numba +class TestApply: + @pytest.mark.parametrize("jit", [True, False]) + def test_numba_vs_cython(self, jit, nogil, parallel, nopython): + def f(x, *args): + arg_sum = 0 + for arg in args: + arg_sum += arg + return np.mean(x) + arg_sum + + if jit: + import numba + + f = numba.jit(f) + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + args = (2,) + + s = Series(range(10)) + result = s.rolling(2).apply( + f, args=args, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = s.rolling(2).apply(f, engine="cython", args=args, raw=True) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("jit", [True, False]) + def test_cache(self, jit, nogil, parallel, nopython): + # Test that the functions are cached correctly if we switch functions + def func_1(x): + return np.mean(x) + 4 + + def func_2(x): + return np.std(x) * 5 + + if jit: + import numba + + func_1 = numba.jit(func_1) + func_2 = numba.jit(func_2) + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + + roll = Series(range(10)).rolling(2) + result = roll.apply( + func_1, engine="numba", 
engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, engine="cython", raw=True) + tm.assert_series_equal(result, expected) + + # func_1 should be in the cache now + assert func_1 in roll._numba_func_cache + + result = roll.apply( + func_2, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_2, engine="cython", raw=True) + tm.assert_series_equal(result, expected) + # This run should use the cached func_1 + result = roll.apply( + func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, engine="cython", raw=True) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 56d89e15c418c..717273cff64ea 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -3,8 +3,8 @@ import pytest from pandas import DataFrame, Series -from pandas.core.sorting import safe_sort -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.algorithms import safe_sort class TestPairwise: diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 70ba85120af3c..04fab93b71c4a 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1,4 +1,4 @@ -from datetime import timedelta +from datetime import datetime, timedelta import numpy as np import pytest @@ -7,10 +7,10 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Series +from pandas import DataFrame, Index, Series +import pandas._testing as tm from pandas.core.window import Rolling from pandas.tests.window.common import Base -import pandas.util.testing as tm class TestRolling(Base): @@ -361,3 +361,68 @@ def test_rolling_datetime(self, axis_frame, tz_naive_fixture): } ) tm.assert_frame_equal(result, expected) + + +def test_rolling_window_as_string(): + # see gh-22590 + date_today = datetime.now() + days = pd.date_range(date_today, date_today + timedelta(365), freq="D") + + npr = np.random.RandomState(seed=421) + + data = npr.randint(1, high=100, size=len(days)) + df = DataFrame({"DateCol": days, "metric": data}) + + df.set_index("DateCol", inplace=True) + result = df.rolling(window="21D", min_periods=2, closed="left")["metric"].agg("max") + + expData = ( + [np.nan] * 2 + + [88.0] * 16 + + [97.0] * 9 + + [98.0] + + [99.0] * 21 + + [95.0] * 16 + + [93.0] * 5 + + [89.0] * 5 + + [96.0] * 21 + + [94.0] * 14 + + [90.0] * 13 + + [88.0] * 2 + + [90.0] * 9 + + [96.0] * 21 + + [95.0] * 6 + + [91.0] + + [87.0] * 6 + + [92.0] * 21 + + [83.0] * 2 + + [86.0] * 10 + + [87.0] * 5 + + [98.0] * 21 + + [97.0] * 14 + + [93.0] * 7 + + [87.0] * 4 + + [86.0] * 4 + + [95.0] * 21 + + [85.0] * 14 + + [83.0] * 2 + + [76.0] * 5 + + [81.0] * 2 + + [98.0] * 21 + + [95.0] * 14 + + [91.0] * 7 + + [86.0] + + [93.0] * 3 + + [95.0] * 20 + ) + + expected = Series(expData, index=Index(days, name="DateCol"), name="metric") + tm.assert_series_equal(result, expected) + + +def test_min_periods1(): + # GH#6795 + df = pd.DataFrame([0, 1, 2, 1, 0], columns=["a"]) + result = df["a"].rolling(3, center=True, min_periods=1).max() + expected = pd.Series([1.0, 2.0, 2.0, 2.0, 1.0], name="a") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 7055e5b538bea..5f5e10b5dd497 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ 
b/pandas/tests/window/test_timeseries_window.py @@ -10,7 +10,7 @@ date_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.tseries.offsets as offsets @@ -535,25 +535,36 @@ def test_ragged_max(self): expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - def test_ragged_apply(self, raw): + @pytest.mark.parametrize( + "freq, op, result_data", + [ + ("ms", "min", [0.0] * 10), + ("ms", "mean", [0.0] * 9 + [2.0 / 9]), + ("ms", "max", [0.0] * 9 + [2.0]), + ("s", "min", [0.0] * 10), + ("s", "mean", [0.0] * 9 + [2.0 / 9]), + ("s", "max", [0.0] * 9 + [2.0]), + ("min", "min", [0.0] * 10), + ("min", "mean", [0.0] * 9 + [2.0 / 9]), + ("min", "max", [0.0] * 9 + [2.0]), + ("h", "min", [0.0] * 10), + ("h", "mean", [0.0] * 9 + [2.0 / 9]), + ("h", "max", [0.0] * 9 + [2.0]), + ("D", "min", [0.0] * 10), + ("D", "mean", [0.0] * 9 + [2.0 / 9]), + ("D", "max", [0.0] * 9 + [2.0]), + ], + ) + def test_freqs_ops(self, freq, op, result_data): + # GH 21096 + index = date_range(start="2018-1-1 01:00:00", freq=f"1{freq}", periods=10) + s = Series(data=0, index=index) + s.iloc[1] = np.nan + s.iloc[-1] = 2 + result = getattr(s.rolling(window=f"10{freq}"), op)() + expected = Series(data=result_data, index=index) - df = self.ragged - - f = lambda x: 1 - result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) + tm.assert_series_equal(result, expected) def test_all(self): @@ -583,16 +594,6 @@ def test_all(self): expected = er.quantile(0.5) tm.assert_frame_equal(result, expected) - def test_all_apply(self, raw): - - df = self.regular * 2 - er = df.rolling(window=1) - r = df.rolling(window="1s") - - result = r.apply(lambda x: 1, raw=raw) - expected = er.apply(lambda x: 1, raw=raw) - tm.assert_frame_equal(result, expected) - def test_all2(self): # more sophisticated comparison of integer vs. diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index f42c507e51511..cc29ab4f2cd62 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -60,12 +60,12 @@ def test_numpy_compat(self, method): getattr(w, method)(dtype=np.float64) @td.skip_if_no_scipy - @pytest.mark.parametrize("arg", ["median", "var", "std", "kurt", "skew"]) + @pytest.mark.parametrize("arg", ["median", "kurt", "skew"]) def test_agg_function_support(self, arg): df = pd.DataFrame({"A": np.arange(5)}) roll = df.rolling(2, win_type="triang") - msg = "'{arg}' is not a valid function for 'Window' object".format(arg=arg) + msg = f"'{arg}' is not a valid function for 'Window' object" with pytest.raises(AttributeError, match=msg): roll.agg(arg) diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py deleted file mode 100644 index c2b76188ad36b..0000000000000 --- a/pandas/tseries/converter.py +++ /dev/null @@ -1,32 +0,0 @@ -# flake8: noqa -import warnings - -# TODO `_matplotlib` module should be private, so the plotting backend -# can be change. 
Decide whether all these should be public and exponsed -# in `pandas.plotting`, or remove from here (I guess they are here for -# legacy reasons -from pandas.plotting._matplotlib.converter import ( - DatetimeConverter, - MilliSecondLocator, - PandasAutoDateFormatter, - PandasAutoDateLocator, - PeriodConverter, - TimeConverter, - TimeFormatter, - TimeSeries_DateFormatter, - TimeSeries_DateLocator, - get_datevalue, - get_finder, - time2num, -) - - -def register(): - from pandas.plotting import register_matplotlib_converters - - msg = ( - "'pandas.tseries.converter.register' has been moved and renamed to " - "'pandas.plotting.register_matplotlib_converters'. " - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - register_matplotlib_converters() diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 0dcd8aeb4df9b..e2d007cd2d7f8 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -1,6 +1,7 @@ from datetime import timedelta import re -from typing import Dict +from typing import Dict, Optional +import warnings import numpy as np from pytz import AmbiguousTimeError @@ -49,11 +50,13 @@ # Offset names ("time rules") and related functions #: cache of previously seen offsets -_offset_map = {} # type: Dict[str, DateOffset] +_offset_map: Dict[str, DateOffset] = {} -def get_period_alias(offset_str): - """ alias to closest period strings BQ->Q etc""" +def get_period_alias(offset_str: str) -> Optional[str]: + """ + Alias to closest period strings BQ->Q etc. + """ return _offset_to_period_map.get(offset_str, None) @@ -68,10 +71,10 @@ def get_period_alias(offset_str): } -def to_offset(freq): +def to_offset(freq) -> Optional[DateOffset]: """ Return DateOffset object from string or tuple representation - or datetime.timedelta object + or datetime.timedelta object. Parameters ---------- @@ -123,7 +126,7 @@ def to_offset(freq): if isinstance(stride, str): name, stride = stride, name name, _ = libfreqs._base_and_stride(name) - delta = get_offset(name) * stride + delta = _get_offset(name) * stride elif isinstance(freq, timedelta): delta = None @@ -164,7 +167,7 @@ def to_offset(freq): float(stride), prefix ) stride = int(stride) - offset = get_offset(name) + offset = _get_offset(name) offset = offset * int(np.fabs(stride) * stride_sign) if delta is None: delta = offset @@ -179,14 +182,33 @@ def to_offset(freq): return delta -def get_offset(name): +def get_offset(name: str) -> DateOffset: """ - Return DateOffset object associated with rule name + Return DateOffset object associated with rule name. + + .. deprecated:: 1.0.0 Examples -------- get_offset('EOM') --> BMonthEnd(1) """ + warnings.warn( + "get_offset is deprecated and will be removed in a future version, " + "use to_offset instead", + FutureWarning, + stacklevel=2, + ) + return _get_offset(name) + + +def _get_offset(name: str) -> DateOffset: + """ + Return DateOffset object associated with rule name. + + Examples + -------- + _get_offset('EOM') --> BMonthEnd(1) + """ if name not in libfreqs._dont_uppercase: name = name.upper() name = libfreqs._lite_rule_alias.get(name, name) @@ -214,7 +236,7 @@ def get_offset(name): # Period codes -def infer_freq(index, warn=True): +def infer_freq(index, warn: bool = True) -> Optional[str]: """ Infer the most likely frequency given the input index. If the frequency is uncertain, a warning will be printed. 
@@ -222,7 +244,7 @@ def infer_freq(index, warn=True): Parameters ---------- index : DatetimeIndex or TimedeltaIndex - if passed a Series will use the values of the series (NOT THE INDEX) + If passed a Series will use the values of the series (NOT THE INDEX). warn : bool, default True Returns @@ -243,10 +265,11 @@ def infer_freq(index, warn=True): ): raise TypeError( "cannot infer freq from a non-convertible dtype " - "on a Series of {dtype}".format(dtype=index.dtype) + f"on a Series of {index.dtype}" ) index = values + inferer: _FrequencyInferer if is_period_arraylike(index): raise TypeError( "PeriodIndex given. Check the `freq` attribute " @@ -260,8 +283,7 @@ def infer_freq(index, warn=True): if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): if isinstance(index, (pd.Int64Index, pd.Float64Index)): raise TypeError( - "cannot infer freq from a non-convertible index " - "type {type}".format(type=type(index)) + f"cannot infer freq from a non-convertible index type {type(index)}" ) index = index.values @@ -280,7 +302,7 @@ class _FrequencyInferer: Not sure if I can avoid the state machine here """ - def __init__(self, index, warn=True): + def __init__(self, index, warn: bool = True): self.index = index self.values = index.asi8 @@ -308,14 +330,14 @@ def deltas_asi8(self): return unique_deltas(self.index.asi8) @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: return len(self.deltas) == 1 @cache_readonly def is_unique_asi8(self): return len(self.deltas_asi8) == 1 - def get_freq(self): + def get_freq(self) -> Optional[str]: """ Find the appropriate frequency string to describe the inferred frequency of self.values @@ -388,12 +410,12 @@ def mdiffs(self): def ydiffs(self): return unique_deltas(self.fields["Y"].astype("i8")) - def _infer_daily_rule(self): + def _infer_daily_rule(self) -> Optional[str]: annual_rule = self._get_annual_rule() if annual_rule: nyears = self.ydiffs[0] month = MONTH_ALIASES[self.rep_stamp.month] - alias = "{prefix}-{month}".format(prefix=annual_rule, month=month) + alias = f"{annual_rule}-{month}" return _maybe_add_count(alias, nyears) quarterly_rule = self._get_quarterly_rule() @@ -401,7 +423,7 @@ def _infer_daily_rule(self): nquarters = self.mdiffs[0] / 3 mod_dict = {0: 12, 2: 11, 1: 10} month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]] - alias = "{prefix}-{month}".format(prefix=quarterly_rule, month=month) + alias = f"{quarterly_rule}-{month}" return _maybe_add_count(alias, nquarters) monthly_rule = self._get_monthly_rule() @@ -413,7 +435,7 @@ def _infer_daily_rule(self): if days % 7 == 0: # Weekly day = int_to_weekday[self.rep_stamp.weekday()] - return _maybe_add_count("W-{day}".format(day=day), days / 7) + return _maybe_add_count(f"W-{day}", days / 7) else: return _maybe_add_count("D", days) @@ -424,7 +446,9 @@ def _infer_daily_rule(self): if wom_rule: return wom_rule - def _get_annual_rule(self): + return None + + def _get_annual_rule(self) -> Optional[str]: if len(self.ydiffs) > 1: return None @@ -434,7 +458,7 @@ def _get_annual_rule(self): pos_check = self.month_position_check() return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check) - def _get_quarterly_rule(self): + def _get_quarterly_rule(self) -> Optional[str]: if len(self.mdiffs) > 1: return None @@ -444,13 +468,13 @@ def _get_quarterly_rule(self): pos_check = self.month_position_check() return {"cs": "QS", "bs": "BQS", "ce": "Q", "be": "BQ"}.get(pos_check) - def _get_monthly_rule(self): + def _get_monthly_rule(self) -> Optional[str]: if 
len(self.mdiffs) > 1: return None pos_check = self.month_position_check() return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check) - def _is_business_daily(self): + def _is_business_daily(self) -> bool: # quick check: cannot be business daily if self.day_deltas != [1, 3]: return False @@ -465,7 +489,7 @@ def _is_business_daily(self): | ((weekdays > 0) & (weekdays <= 4) & (shifts == 1)) ) - def _get_wom_rule(self): + def _get_wom_rule(self) -> Optional[str]: # wdiffs = unique(np.diff(self.index.week)) # We also need -47, -49, -48 to catch index spanning year boundary # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all(): @@ -485,7 +509,7 @@ def _get_wom_rule(self): week = week_of_months[0] + 1 wd = int_to_weekday[weekdays[0]] - return "WOM-{week}{weekday}".format(week=week, weekday=wd) + return f"WOM-{week}{wd}" class _TimedeltaFrequencyInferer(_FrequencyInferer): @@ -495,20 +519,20 @@ def _infer_daily_rule(self): if days % 7 == 0: # Weekly wd = int_to_weekday[self.rep_stamp.weekday()] - alias = "W-{weekday}".format(weekday=wd) + alias = f"W-{wd}" return _maybe_add_count(alias, days / 7) else: return _maybe_add_count("D", days) -def _is_multiple(us, mult): +def _is_multiple(us, mult: int) -> bool: return us % mult == 0 -def _maybe_add_count(base, count): +def _maybe_add_count(base: str, count: float) -> str: if count != 1: assert count == int(count) count = int(count) - return "{count}{base}".format(count=count, base=base) + return f"{count}{base}" else: return base diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index eb8600031439f..62d7c26b590cc 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -183,19 +183,19 @@ class from pandas.tseries.offsets assert days_of_week is None or type(days_of_week) == tuple self.days_of_week = days_of_week - def __repr__(self): + def __repr__(self) -> str: info = "" if self.year is not None: - info += "year={year}, ".format(year=self.year) - info += "month={mon}, day={day}, ".format(mon=self.month, day=self.day) + info += f"year={self.year}, " + info += f"month={self.month}, day={self.day}, " if self.offset is not None: - info += "offset={offset}".format(offset=self.offset) + info += f"offset={self.offset}" if self.observance is not None: - info += "observance={obs}".format(obs=self.observance) + info += f"observance={self.observance}" - repr = "Holiday: {name} ({info})".format(name=self.name, info=info) + repr = f"Holiday: {self.name} ({info})" return repr def dates(self, start_date, end_date, return_name=False): @@ -344,7 +344,7 @@ class AbstractHolidayCalendar(metaclass=HolidayCalendarMetaClass): Abstract interface to create holidays following certain rules. 
""" - rules = [] # type: List[Holiday] + rules: List[Holiday] = [] start_date = Timestamp(datetime(1970, 1, 1)) end_date = Timestamp(datetime(2200, 12, 31)) _cache = None @@ -363,7 +363,7 @@ def __init__(self, name=None, rules=None): """ super().__init__() if name is None: - name = self.__class__.__name__ + name = type(self).__name__ self.name = name if rules is not None: @@ -394,8 +394,7 @@ def holidays(self, start=None, end=None, return_name=False): """ if self.rules is None: raise Exception( - "Holiday Calendar {name} does not have any " - "rules specified".format(name=self.name) + f"Holiday Calendar {self.name} does not have any rules specified" ) if start is None: diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 1e3f5c1ed870e..8bb98a271bce8 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1,7 +1,8 @@ from datetime import date, datetime, timedelta import functools import operator -from typing import Optional +from typing import Any, Optional +import warnings from dateutil.easter import easter import numpy as np @@ -36,8 +37,6 @@ from pandas.core.dtypes.inference import is_list_like -from pandas.core.tools.datetimes import to_datetime - __all__ = [ "Day", "BusinessDay", @@ -167,7 +166,7 @@ class DateOffset(BaseOffset): that conform to the DateOffset. For example, Bday defines this set to be the set of dates that are weekdays (M-F). To test if a date is in the set of a DateOffset dateOffset we can use the - onOffset method: dateOffset.onOffset(date). + is_on_offset method: dateOffset.is_on_offset(date). If a date is not on a valid date, the rollback and rollforward methods can be used to roll the date to the nearest valid date @@ -253,6 +252,7 @@ def __add__(date): _use_relativedelta = False _adjust_dst = False _attributes = frozenset(["n", "normalize"] + list(liboffsets.relativedelta_kwds)) + _deprecations = frozenset(["isAnchored", "onOffset"]) # default for prior pickles normalize = False @@ -311,9 +311,8 @@ def apply_index(self, i): if type(self) is not DateOffset: raise NotImplementedError( - "DateOffset subclass {name} " - "does not have a vectorized " - "implementation".format(name=self.__class__.__name__) + f"DateOffset subclass {type(self).__name__} " + "does not have a vectorized implementation" ) kwds = self.kwds relativedelta_fast = { @@ -362,15 +361,31 @@ def apply_index(self, i): kwd = set(kwds) - relativedelta_fast raise NotImplementedError( "DateOffset with relativedelta " - "keyword(s) {kwd} not able to be " - "applied vectorized".format(kwd=kwd) + f"keyword(s) {kwd} not able to be " + "applied vectorized" ) - def isAnchored(self): + def is_anchored(self): # TODO: Does this make sense for the general case? It would help - # if there were a canonical docstring for what isAnchored means. + # if there were a canonical docstring for what is_anchored means. 
return self.n == 1 + def onOffset(self, dt): + warnings.warn( + "onOffset is deprecated, use is_on_offset instead", + FutureWarning, + stacklevel=2, + ) + return self.is_on_offset(dt) + + def isAnchored(self): + warnings.warn( + "isAnchored is deprecated, use is_anchored instead", + FutureWarning, + stacklevel=2, + ) + return self.is_anchored() + # TODO: Combine this with BusinessMixin version by defining a whitelisted # set of attributes on each object rather than the existing behavior of # iterating over internal ``__dict__`` @@ -382,7 +397,7 @@ def _repr_attrs(self): continue elif attr not in exclude: value = getattr(self, attr) - attrs.append("{attr}={value}".format(attr=attr, value=value)) + attrs.append(f"{attr}={value}") out = "" if attrs: @@ -403,8 +418,8 @@ def rollback(self, dt): Rolled timestamp if not on offset, otherwise unchanged timestamp. """ dt = as_timestamp(dt) - if not self.onOffset(dt): - dt = dt - self.__class__(1, normalize=self.normalize, **self.kwds) + if not self.is_on_offset(dt): + dt = dt - type(self)(1, normalize=self.normalize, **self.kwds) return dt def rollforward(self, dt): @@ -417,11 +432,11 @@ def rollforward(self, dt): Rolled timestamp if not on offset, otherwise unchanged timestamp. """ dt = as_timestamp(dt) - if not self.onOffset(dt): - dt = dt + self.__class__(1, normalize=self.normalize, **self.kwds) + if not self.is_on_offset(dt): + dt = dt + type(self)(1, normalize=self.normalize, **self.kwds) return dt - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False # XXX, see #1395 @@ -452,7 +467,7 @@ def freqstr(self): return repr(self) if self.n != 1: - fstr = "{n}{code}".format(n=self.n, code=code) + fstr = f"{self.n}{code}" else: fstr = code @@ -470,7 +485,7 @@ def _offset_str(self): @property def nanos(self): - raise ValueError("{name} is a non-fixed frequency".format(name=self)) + raise ValueError(f"{self} is a non-fixed frequency") class SingleConstructorOffset(DateOffset): @@ -478,7 +493,7 @@ class SingleConstructorOffset(DateOffset): def _from_name(cls, suffix=None): # default _from_name calls cls with no args if suffix: - raise ValueError("Bad freq suffix {suffix}".format(suffix=suffix)) + raise ValueError(f"Bad freq suffix {suffix}") return cls() @@ -516,7 +531,7 @@ def offset(self): def _repr_attrs(self): if self.offset: - attrs = ["offset={offset!r}".format(offset=self.offset)] + attrs = [f"offset={repr(self.offset)}"] else: attrs = None out = "" @@ -634,7 +649,7 @@ def apply_index(self, i): result = shifted.to_timestamp() + time return result - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False return dt.weekday() < 5 @@ -731,7 +746,7 @@ def _next_opening_time(self, other, sign=1): earliest_start = self.start[0] latest_start = self.start[-1] - if not self.next_bday.onOffset(other): + if not self.next_bday.is_on_offset(other): # today is not business day other = other + sign * self.next_bday if self.n * sign >= 0: @@ -798,7 +813,7 @@ def rollback(self, dt): """ Roll provided date backward to next offset only if not on offset. """ - if not self.onOffset(dt): + if not self.is_on_offset(dt): if self.n >= 0: dt = self._prev_opening_time(dt) else: @@ -811,7 +826,7 @@ def rollforward(self, dt): """ Roll provided date forward to next offset only if not on offset.
""" - if not self.onOffset(dt): + if not self.is_on_offset(dt): if self.n >= 0: return self._next_opening_time(dt) else: @@ -859,13 +874,13 @@ def apply(self, other): # adjust other to reduce number of cases to handle if n >= 0: - if other.time() in self.end or not self._onOffset(other): + if other.time() in self.end or not self._is_on_offset(other): other = self._next_opening_time(other) else: if other.time() in self.start: # adjustment to move to previous business day other = other - timedelta(seconds=1) - if not self._onOffset(other): + if not self._is_on_offset(other): other = self._next_opening_time(other) other = self._get_closing_time(other) @@ -881,9 +896,17 @@ def apply(self, other): # adjust by business days first if bd != 0: - skip_bd = BusinessDay(n=bd) + if isinstance(self, _CustomMixin): # GH 30593 + skip_bd = CustomBusinessDay( + n=bd, + weekmask=self.weekmask, + holidays=self.holidays, + calendar=self.calendar, + ) + else: + skip_bd = BusinessDay(n=bd) # midnight business hour may not on BusinessDay - if not self.next_bday.onOffset(other): + if not self.next_bday.is_on_offset(other): prev_open = self._prev_opening_time(other) remain = other - prev_open other = prev_open + skip_bd + remain @@ -932,7 +955,7 @@ def apply(self, other): else: raise ApplyTypeError("Only know how to combine business hour with datetime") - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False @@ -942,9 +965,9 @@ def onOffset(self, dt): ) # Valid BH can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time - return self._onOffset(dt) + return self._is_on_offset(dt) - def _onOffset(self, dt): + def _is_on_offset(self, dt): """ Slight speedups using calculated values.
""" @@ -969,10 +992,10 @@ def _onOffset(self, dt): def _repr_attrs(self): out = super()._repr_attrs() hours = ",".join( - "{}-{}".format(st.strftime("%H:%M"), en.strftime("%H:%M")) + f'{st.strftime("%H:%M")}-{en.strftime("%H:%M")}' for st, en in zip(self.start, self.end) ) - attrs = ["{prefix}={hours}".format(prefix=self._prefix, hours=hours)] + attrs = [f"{self._prefix}={hours}"] out += ": " + ", ".join(attrs) return out @@ -1064,7 +1087,7 @@ def apply(self, other): def apply_index(self, i): raise NotImplementedError - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False day64 = _to_dt64(dt, "datetime64[D]") @@ -1112,13 +1135,13 @@ class MonthOffset(SingleConstructorOffset): @property def name(self): - if self.isAnchored: + if self.is_anchored: return self.rule_code else: month = ccalendar.MONTH_ALIASES[self.n] - return "{code}-{month}".format(code=self.rule_code, month=month) + return f"{self.rule_code}-{month}" - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False return dt.day == self._get_offset_day(dt) @@ -1200,7 +1223,7 @@ class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset): ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] ) - onOffset = DateOffset.onOffset # override MonthOffset method + is_on_offset = DateOffset.is_on_offset # override MonthOffset method apply_index = DateOffset.apply_index # override MonthOffset method def __init__( @@ -1299,9 +1322,10 @@ def __init__(self, n=1, normalize=False, day_of_month=None): else: object.__setattr__(self, "day_of_month", int(day_of_month)) if not self._min_day_of_month <= self.day_of_month <= 27: - msg = "day_of_month must be {min}<=day_of_month<=27, got {day}" raise ValueError( - msg.format(min=self._min_day_of_month, day=self.day_of_month) + "day_of_month must be " + f"{self._min_day_of_month}<=day_of_month<=27, " + f"got {self.day_of_month}" ) @classmethod @@ -1310,7 +1334,7 @@ def _from_name(cls, suffix=None): @property def rule_code(self): - suffix = "-{day_of_month}".format(day_of_month=self.day_of_month) + suffix = f"-{self.day_of_month}" return self._prefix + suffix @apply_wraps @@ -1405,7 +1429,7 @@ class SemiMonthEnd(SemiMonthOffset): _prefix = "SM" _min_day_of_month = 1 - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False days_in_month = ccalendar.get_days_in_month(dt.year, dt.month) @@ -1463,7 +1487,7 @@ class SemiMonthBegin(SemiMonthOffset): _prefix = "SMS" - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False return dt.day in (1, self.day_of_month) @@ -1530,11 +1554,9 @@ def __init__(self, n=1, normalize=False, weekday=None): if self.weekday is not None: if self.weekday < 0 or self.weekday > 6: - raise ValueError( - "Day must be 0<=day<=6, got {day}".format(day=self.weekday) - ) + raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") - def isAnchored(self): + def is_anchored(self): return self.n == 1 and self.weekday is not None @apply_wraps @@ -1544,9 +1566,7 @@ def apply(self, other): if not isinstance(other, datetime): raise TypeError( - "Cannot add {typ} to {cls}".format( - typ=type(other).__name__, cls=type(self).__name__ - ) + f"Cannot add {type(other).__name__} to {type(self).__name__}" ) k = self.n @@ -1612,7 +1632,7 @@ def _end_apply_index(self, dtindex): return base + off + Timedelta(1, "ns") - Timedelta(1, "D") - def onOffset(self,
dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False elif self.weekday is None: @@ -1624,7 +1644,7 @@ def rule_code(self): suffix = "" if self.weekday is not None: weekday = ccalendar.int_to_weekday[self.weekday] - suffix = "-{weekday}".format(weekday=weekday) + suffix = f"-{weekday}" return self._prefix + suffix @classmethod @@ -1655,7 +1675,7 @@ def apply(self, other): to_day = self._get_offset_day(shifted) return liboffsets.shift_day(shifted, to_day - shifted.day) - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False return dt.day == self._get_offset_day(dt) @@ -1693,13 +1713,9 @@ def __init__(self, n=1, normalize=False, week=0, weekday=0): object.__setattr__(self, "week", week) if self.weekday < 0 or self.weekday > 6: - raise ValueError( - "Day must be 0<=day<=6, got {day}".format(day=self.weekday) - ) + raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") if self.week < 0 or self.week > 3: - raise ValueError( - "Week must be 0<=week<=3, got {week}".format(week=self.week) - ) + raise ValueError(f"Week must be 0<=week<=3, got {self.week}") def _get_offset_day(self, other): """ @@ -1722,16 +1738,12 @@ def _get_offset_day(self, other): @property def rule_code(self): weekday = ccalendar.int_to_weekday.get(self.weekday, "") - return "{prefix}-{week}{weekday}".format( - prefix=self._prefix, week=self.week + 1, weekday=weekday - ) + return f"{self._prefix}-{self.week + 1}{weekday}" @classmethod def _from_name(cls, suffix=None): if not suffix: - raise ValueError( - "Prefix {prefix!r} requires a suffix.".format(prefix=cls._prefix) - ) + raise ValueError(f"Prefix {repr(cls._prefix)} requires a suffix.") # TODO: handle n here... # only one digit weeks (1 --> week 0, 2 --> week 1, etc.) week = int(suffix[0]) - 1 @@ -1771,9 +1783,7 @@ def __init__(self, n=1, normalize=False, weekday=0): raise ValueError("N cannot be 0") if self.weekday < 0 or self.weekday > 6: - raise ValueError( - "Day must be 0<=day<=6, got {day}".format(day=self.weekday) - ) + raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") def _get_offset_day(self, other): """ @@ -1797,14 +1807,12 @@ def _get_offset_day(self, other): @property def rule_code(self): weekday = ccalendar.int_to_weekday.get(self.weekday, "") - return "{prefix}-{weekday}".format(prefix=self._prefix, weekday=weekday) + return f"{self._prefix}-{weekday}" @classmethod def _from_name(cls, suffix=None): if not suffix: - raise ValueError( - "Prefix {prefix!r} requires a suffix.".format(prefix=cls._prefix) - ) + raise ValueError(f"Prefix {repr(cls._prefix)} requires a suffix.") # TODO: handle n here... weekday = ccalendar.weekday_to_int[suffix] return cls(weekday=weekday) @@ -1819,12 +1827,12 @@ class QuarterOffset(DateOffset): Quarter representation - doesn't call super. """ - _default_startingMonth = None # type: Optional[int] - _from_name_startingMonth = None # type: Optional[int] + _default_startingMonth: Optional[int] = None + _from_name_startingMonth: Optional[int] = None _adjust_dst = True _attributes = frozenset(["n", "normalize", "startingMonth"]) # TODO: Consider combining QuarterOffset and YearOffset __init__ at some - # point. Also apply_index, onOffset, rule_code if + # point. 
Also apply_index, is_on_offset, rule_code if # startingMonth vs month attr names are resolved def __init__(self, n=1, normalize=False, startingMonth=None): @@ -1834,7 +1842,7 @@ def __init__(self, n=1, normalize=False, startingMonth=None): startingMonth = self._default_startingMonth object.__setattr__(self, "startingMonth", startingMonth) - def isAnchored(self): + def is_anchored(self): return self.n == 1 and self.startingMonth is not None @classmethod @@ -1850,13 +1858,13 @@ def _from_name(cls, suffix=None): @property def rule_code(self): month = ccalendar.MONTH_ALIASES[self.startingMonth] - return "{prefix}-{month}".format(prefix=self._prefix, month=month) + return f"{self._prefix}-{month}" @apply_wraps def apply(self, other): # months_since: find the calendar quarter containing other.month, # e.g. if other.month == 8, the calendar quarter is [Jul, Aug, Sep]. - # Then find the month in that quarter containing an onOffset date for + # Then find the month in that quarter containing an is_on_offset date for # self. `months_since` is the number of months to shift other.month # to get to this on-offset month. months_since = other.month % 3 - self.startingMonth % 3 @@ -1866,7 +1874,7 @@ def apply(self, other): months = qtrs * 3 - months_since return shift_month(other, months, self._day_opt) - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False mod_month = (dt.month - self.startingMonth) % 3 @@ -1969,7 +1977,7 @@ def apply_index(self, dtindex): shifted, freq=dtindex.freq, dtype=dtindex.dtype ) - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False return dt.month == self.month and dt.day == self._get_offset_day(dt) @@ -1993,7 +2001,7 @@ def _from_name(cls, suffix=None): @property def rule_code(self): month = ccalendar.MONTH_ALIASES[self.month] - return "{prefix}-{month}".format(prefix=self._prefix, month=month) + return f"{self._prefix}-{month}" class BYearEnd(YearOffset): @@ -2107,16 +2115,14 @@ def __init__( raise ValueError("N cannot be 0") if self.variation not in ["nearest", "last"]: - raise ValueError( - "{variation} is not a valid variation".format(variation=self.variation) - ) + raise ValueError(f"{self.variation} is not a valid variation") - def isAnchored(self): + def is_anchored(self): return ( self.n == 1 and self.startingMonth is not None and self.weekday is not None ) - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False dt = datetime(dt.year, dt.month, dt.day) @@ -2214,7 +2220,7 @@ def get_year_end(self, dt): def rule_code(self): prefix = self._prefix suffix = self.get_rule_code_suffix() - return "{prefix}-{suffix}".format(prefix=prefix, suffix=suffix) + return f"{prefix}-{suffix}" def _get_suffix_prefix(self): if self.variation == "nearest": @@ -2226,9 +2232,7 @@ def get_rule_code_suffix(self): prefix = self._get_suffix_prefix() month = ccalendar.MONTH_ALIASES[self.startingMonth] weekday = ccalendar.int_to_weekday[self.weekday] - return "{prefix}-{month}-{weekday}".format( - prefix=prefix, month=month, weekday=weekday - ) + return f"{prefix}-{month}-{weekday}" @classmethod def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): @@ -2237,9 +2241,7 @@ def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): elif varion_code == "L": variation = "last" else: - raise ValueError( - "Unable to parse varion_code: {code}".format(code=varion_code) - ) + raise ValueError(f"Unable to 
parse varion_code: {varion_code}") startingMonth = ccalendar.MONTH_TO_CAL_NUM[startingMonth_code] weekday = ccalendar.weekday_to_int[weekday_code] @@ -2344,8 +2346,8 @@ def _offset(self): variation=self.variation, ) - def isAnchored(self): - return self.n == 1 and self._offset.isAnchored() + def is_anchored(self): + return self.n == 1 and self._offset.is_anchored() def _rollback_to_year(self, other): """ @@ -2371,7 +2373,7 @@ def _rollback_to_year(self, other): norm = Timestamp(other).tz_localize(None) start = self._offset.rollback(norm) - # Note: start <= norm and self._offset.onOffset(start) + # Note: start <= norm and self._offset.is_on_offset(start) if start < norm: # roll adjustment @@ -2379,7 +2381,7 @@ def _rollback_to_year(self, other): # check thet qtr_lens is consistent with self._offset addition end = liboffsets.shift_day(start, days=7 * sum(qtr_lens)) - assert self._offset.onOffset(end), (start, end, qtr_lens) + assert self._offset.is_on_offset(end), (start, end, qtr_lens) tdelta = norm - start for qlen in qtr_lens: @@ -2443,10 +2445,10 @@ def year_has_extra_week(self, dt): assert weeks_in_year in [52, 53], weeks_in_year return weeks_in_year == 53 - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False - if self._offset.onOffset(dt): + if self._offset.is_on_offset(dt): return True next_year_end = dt - self._offset @@ -2464,9 +2466,7 @@ def onOffset(self, dt): def rule_code(self): suffix = self._offset.get_rule_code_suffix() qtr = self.qtr_with_extra_week - return "{prefix}-{suffix}-{qtr}".format( - prefix=self._prefix, suffix=suffix, qtr=qtr - ) + return f"{self._prefix}-{suffix}-{qtr}" @classmethod def _from_name(cls, *args): @@ -2516,7 +2516,7 @@ def apply(self, other): ) return new - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False return date(dt.year, dt.month, dt.day) == easter(dt.year) @@ -2535,12 +2535,11 @@ def f(self, other): except AttributeError: # comparing with a non-Tick object raise TypeError( - "Invalid comparison between {cls} and {typ}".format( - cls=type(self).__name__, typ=type(other).__name__ - ) + f"Invalid comparison between {type(self).__name__} " + f"and {type(other).__name__}" ) - f.__name__ = "__{opname}__".format(opname=op.__name__) + f.__name__ = f"__{op.__name__}__" return f @@ -2575,11 +2574,10 @@ def __add__(self, other): return NotImplemented except OverflowError: raise OverflowError( - "the add operation between {self} and {other} " - "will overflow".format(self=self, other=other) + f"the add operation between {self} and {other} will overflow" ) - def __eq__(self, other): + def __eq__(self, other: Any) -> bool: if isinstance(other, str): from pandas.tseries.frequencies import to_offset @@ -2648,11 +2646,9 @@ def apply(self, other): elif isinstance(other, type(self)): return type(self)(self.n + other.n) - raise ApplyTypeError( - "Unhandled type: {type_str}".format(type_str=type(other).__name__) - ) + raise ApplyTypeError(f"Unhandled type: {type(other).__name__}") - def isAnchored(self): + def is_anchored(self): return False @@ -2752,13 +2748,15 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): offset = to_offset(offset) - start = to_datetime(start) - end = to_datetime(end) + start = Timestamp(start) + start = start if start is not NaT else None + end = Timestamp(end) + end = end if end is not NaT else None - if start and not offset.onOffset(start): + if start and not offset.is_on_offset(start): start 
= offset.rollforward(start) - elif end and not offset.onOffset(end): + elif end and not offset.is_on_offset(end): end = offset.rollback(end) if periods is None and end < start and offset.n >= 0: @@ -2784,9 +2782,7 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): # faster than cur + offset next_date = offset.apply(cur) if next_date <= cur: - raise ValueError( - "Offset {offset} did not increment date".format(offset=offset) - ) + raise ValueError(f"Offset {offset} did not increment date") cur = next_date else: while cur >= end: @@ -2800,9 +2796,7 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): # faster than cur + offset next_date = offset.apply(cur) if next_date >= cur: - raise ValueError( - "Offset {offset} did not decrement date".format(offset=offset) - ) + raise ValueError(f"Offset {offset} did not decrement date") cur = next_date diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py deleted file mode 100644 index df41b4b5b40d9..0000000000000 --- a/pandas/tseries/plotting.py +++ /dev/null @@ -1,3 +0,0 @@ -# flake8: noqa - -from pandas.plotting._matplotlib.timeseries import tsplot diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index f8c08ed8c099f..d10d3a1f71fe6 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -58,7 +58,7 @@ def deprecate( alt_name = alt_name or alternative.__name__ klass = klass or FutureWarning - warning_msg = msg or "{} is deprecated, use {} instead".format(name, alt_name) + warning_msg = msg or f"{name} is deprecated, use {alt_name} instead" @wraps(alternative) def wrapper(*args, **kwargs) -> Callable[..., Any]: @@ -66,12 +66,12 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: return alternative(*args, **kwargs) # adding deprecated directive to the docstring - msg = msg or "Use `{alt_name}` instead.".format(alt_name=alt_name) + msg = msg or f"Use `{alt_name}` instead." doc_error_msg = ( "deprecate needs a correctly formatted docstring in " "the target function (should have a one liner short " "summary, and opening quotes should be in their own " - "line). Found:\n{}".format(alternative.__doc__) + f"line). Found:\n{alternative.__doc__}" ) # when python is running in optimized mode (i.e. `-OO`), docstrings are @@ -84,18 +84,13 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: if empty1 or empty2 and not summary: raise AssertionError(doc_error_msg) wrapper.__doc__ = dedent( - """ - {summary} - - .. deprecated:: {depr_version} - {depr_msg} - - {rest_of_docstring}""" - ).format( - summary=summary.strip(), - depr_version=version, - depr_msg=msg, - rest_of_docstring=dedent(doc), + f""" + {summary.strip()} + + .. deprecated:: {version} + {msg} + + {dedent(doc)}""" ) return wrapper @@ -182,10 +177,10 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: if old_arg_value is not None: if new_arg_name is None: msg = ( - "the '{old_name}' keyword is deprecated and will be " - "removed in a future version. " - "Please take steps to stop the use of '{old_name}'" - ).format(old_name=old_arg_name) + f"the {repr(old_arg_name)} keyword is deprecated and " + "will be removed in a future version. 
Please take " + f"steps to stop the use of {repr(old_arg_name)}" + ) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) kwargs[old_arg_name] = old_arg_value return func(*args, **kwargs) @@ -196,26 +191,23 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: else: new_arg_value = mapping.get(old_arg_value, old_arg_value) msg = ( - "the {old_name}={old_val!r} keyword is deprecated, " - "use {new_name}={new_val!r} instead" - ).format( - old_name=old_arg_name, - old_val=old_arg_value, - new_name=new_arg_name, - new_val=new_arg_value, + f"the {old_arg_name}={repr(old_arg_value)} keyword is " + "deprecated, use " + f"{new_arg_name}={repr(new_arg_value)} instead" ) else: new_arg_value = old_arg_value msg = ( - "the '{old_name}' keyword is deprecated, " - "use '{new_name}' instead" - ).format(old_name=old_arg_name, new_name=new_arg_name) + f"the {repr(old_arg_name)} keyword is deprecated, " + f"use {repr(new_arg_name)} instead" + ) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) if kwargs.get(new_arg_name) is not None: msg = ( - "Can only specify '{old_name}' or '{new_name}', not both" - ).format(old_name=old_arg_name, new_name=new_arg_name) + f"Can only specify {repr(old_arg_name)} " + f"or {repr(new_arg_name)}, not both" + ) raise TypeError(msg) else: kwargs[new_arg_name] = new_arg_value @@ -302,7 +294,6 @@ def update(self, *args, **kwargs) -> None: """ Update self.params with supplied args. """ - if isinstance(self.params, dict): self.params.update(*args, **kwargs) @@ -327,9 +318,11 @@ def my_dog(has='fleas'): pass """ + addendum: Optional[str] + def __init__(self, addendum: Optional[str], join: str = "", indents: int = 0): if indents > 0: - self.addendum = indent(addendum, indents=indents) # type: Optional[str] + self.addendum = indent(addendum, indents=indents) else: self.addendum = addendum self.join = join diff --git a/pandas/util/_depr_module.py b/pandas/util/_depr_module.py index 54f090ede3fc4..5694ca24aab57 100644 --- a/pandas/util/_depr_module.py +++ b/pandas/util/_depr_module.py @@ -4,11 +4,13 @@ """ import importlib +from typing import Iterable import warnings class _DeprecatedModule: - """ Class for mocking deprecated modules. + """ + Class for mocking deprecated modules. Parameters ---------- @@ -32,19 +34,19 @@ def __init__(self, deprmod, deprmodto=None, removals=None, moved=None): self.moved = moved # For introspection purposes.
- self.self_dir = frozenset(dir(self.__class__)) + self.self_dir = frozenset(dir(type(self))) - def __dir__(self): + def __dir__(self) -> Iterable[str]: deprmodule = self._import_deprmod() return dir(deprmodule) - def __repr__(self): + def __repr__(self) -> str: deprmodule = self._import_deprmod() return repr(deprmodule) __str__ = __repr__ - def __getattr__(self, name): + def __getattr__(self, name: str): if name in self.self_dir: return object.__getattribute__(self, name) @@ -61,17 +63,15 @@ def __getattr__(self, name): if self.removals is not None and name in self.removals: warnings.warn( - "{deprmod}.{name} is deprecated and will be removed in " - "a future version.".format(deprmod=self.deprmod, name=name), + f"{self.deprmod}.{name} is deprecated and will be removed in " + "a future version.", FutureWarning, stacklevel=2, ) elif self.moved is not None and name in self.moved: warnings.warn( - "{deprmod} is deprecated and will be removed in " - "a future version.\nYou can access {name} as {moved}".format( - deprmod=self.deprmod, name=name, moved=self.moved[name] - ), + f"{self.deprmod} is deprecated and will be removed in " + f"a future version.\nYou can access {name} as {self.moved[name]}", FutureWarning, stacklevel=2, ) @@ -79,8 +79,8 @@ def __getattr__(self, name): deprmodto = self.deprmodto if deprmodto is False: warnings.warn( - "{deprmod}.{name} is deprecated and will be removed in " - "a future version.".format(deprmod=self.deprmod, name=name), + f"{self.deprmod}.{name} is deprecated and will be removed in " + "a future version.", FutureWarning, stacklevel=2, ) @@ -89,10 +89,8 @@ def __getattr__(self, name): deprmodto = obj.__module__ # The object is actually located in another module. warnings.warn( - "{deprmod}.{name} is deprecated. Please use " - "{deprmodto}.{name} instead.".format( - deprmod=self.deprmod, name=name, deprmodto=deprmodto - ), + f"{self.deprmod}.{name} is deprecated. Please use " + f"{deprmodto}.{name} instead.", FutureWarning, stacklevel=2, ) diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index 11156bc972857..8fd4566d7763b 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -1,3 +1,5 @@ +from typing import Optional, Tuple + import numpy as np import pandas as pd @@ -9,24 +11,27 @@ class TablePlotter: Used in merging.rst """ - def __init__(self, cell_width=0.37, cell_height=0.25, font_size=7.5): + def __init__( + self, + cell_width: float = 0.37, + cell_height: float = 0.25, + font_size: float = 7.5, + ): self.cell_width = cell_width self.cell_height = cell_height self.font_size = font_size - def _shape(self, df): + def _shape(self, df: pd.DataFrame) -> Tuple[int, int]: """ Calculate table chape considering index levels. """ - row, col = df.shape return row + df.columns.nlevels, col + df.index.nlevels - def _get_cells(self, left, right, vertical): + def _get_cells(self, left, right, vertical) -> Tuple[int, int]: """ Calculate appropriate figure size based on left and right data. """ - if vertical: # calculate required number of cells vcells = max(sum(self._shape(l)[0] for l in left), self._shape(right)[0]) @@ -36,7 +41,7 @@ def _get_cells(self, left, right, vertical): hcells = sum([self._shape(l)[1] for l in left] + [self._shape(right)[1]]) return hcells, vcells - def plot(self, left, right, labels=None, vertical=True): + def plot(self, left, right, labels=None, vertical: bool = True): """ Plot left / right DataFrames in specified layout. 
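A hedged usage sketch for the TablePlotter helper annotated above; it is an internal documentation tool (used for the merging.rst figures) and needs matplotlib installed. The frames and labels below are illustrative only.

import pandas as pd
from pandas.util._doctools import TablePlotter

left = pd.DataFrame({"key": ["a", "b"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["a", "b"], "rval": [3, 4]})
merged = left.merge(right, on="key")

# Draw the input frames next to the result of the operation.
TablePlotter().plot([left, right], merged, labels=["left", "right"], vertical=False)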
@@ -45,7 +50,7 @@ def plot(self, left, right, labels=None, vertical=True): left : list of DataFrames before operation is applied right : DataFrame of operation result labels : list of str to be drawn as titles of left DataFrames - vertical : bool + vertical : bool, default True If True, use vertical layout. If False, use horizontal layout. """ import matplotlib.pyplot as plt @@ -96,7 +101,9 @@ def plot(self, left, right, labels=None, vertical=True): return fig def _conv(self, data): - """Convert each input to appropriate for table outplot""" + """ + Convert each input to appropriate for table outplot. + """ if isinstance(data, pd.Series): if data.name is None: data = data.to_frame(name="") @@ -113,7 +120,7 @@ def _insert_index(self, data): data.insert(0, "Index", data.index) else: for i in range(idx_nlevels): - data.insert(i, "Index{0}".format(i), data.index._get_level_values(i)) + data.insert(i, f"Index{i}", data.index._get_level_values(i)) col_nlevels = data.columns.nlevels if col_nlevels > 1: @@ -127,7 +134,7 @@ def _insert_index(self, data): data.columns = col return data - def _make_table(self, ax, df, title, height=None): + def _make_table(self, ax, df, title: str, height: Optional[float] = None): if df is None: ax.set_visible(False) return diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 4f2cbd4314b8e..0723a37b1ba82 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -1,15 +1,18 @@ import contextlib +from typing import Tuple @contextlib.contextmanager -def rewrite_exception(old_name, new_name): - """Rewrite the message of an exception.""" +def rewrite_exception(old_name: str, new_name: str): + """ + Rewrite the message of an exception. + """ try: yield except Exception as err: msg = err.args[0] msg = msg.replace(old_name, new_name) - args = (msg,) + args: Tuple[str, ...] 
= (msg,) if len(err.args) > 1: args = args + err.args[1:] err.args = args diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 289a32c51a916..2801a2bf9c371 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -6,14 +6,16 @@ import struct import subprocess import sys +from typing import List, Optional, Tuple, Union from pandas.compat._optional import VERSIONS, _get_version, import_optional_dependency -def get_sys_info(): - "Returns system information as a dict" - - blob = [] +def get_sys_info() -> List[Tuple[str, Optional[Union[str, int]]]]: + """ + Returns system information as a list + """ + blob: List[Tuple[str, Optional[Union[str, int]]]] = [] # get full commit hash commit = None @@ -29,12 +31,7 @@ def get_sys_info(): pass else: if pipe.returncode == 0: - commit = so - try: - commit = so.decode("utf-8") - except ValueError: - pass - commit = commit.strip().strip('"') + commit = so.decode("utf-8").strip().strip('"') blob.append(("commit", commit)) @@ -44,14 +41,14 @@ def get_sys_info(): [ ("python", ".".join(map(str, sys.version_info))), ("python-bits", struct.calcsize("P") * 8), - ("OS", "{sysname}".format(sysname=sysname)), - ("OS-release", "{release}".format(release=release)), + ("OS", f"{sysname}"), + ("OS-release", f"{release}"), # ("Version", "{version}".format(version=version)), - ("machine", "{machine}".format(machine=machine)), - ("processor", "{processor}".format(processor=processor)), - ("byteorder", "{byteorder}".format(byteorder=sys.byteorder)), - ("LC_ALL", "{lc}".format(lc=os.environ.get("LC_ALL", "None"))), - ("LANG", "{lang}".format(lang=os.environ.get("LANG", "None"))), + ("machine", f"{machine}"), + ("processor", f"{processor}"), + ("byteorder", f"{sys.byteorder}"), + ("LC_ALL", f"{os.environ.get('LC_ALL', 'None')}"), + ("LANG", f"{os.environ.get('LANG', 'None')}"), ("LOCALE", ".".join(map(str, locale.getlocale()))), ] ) @@ -99,6 +96,7 @@ def show_versions(as_json=False): mod = import_optional_dependency( modname, raise_on_missing=False, on_version="ignore" ) + ver: Optional[str] if mod: ver = _get_version(mod) else: @@ -126,7 +124,7 @@ def show_versions(as_json=False): print(tpl.format(k=k, stat=stat)) -def main(): +def main() -> int: from optparse import OptionParser parser = OptionParser() diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index b516c3d78a11e..d8804994af426 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -24,6 +24,7 @@ def test_foo(): For more information, refer to the ``pytest`` documentation on ``skipif``. """ from distutils.version import LooseVersion +from functools import wraps import locale from typing import Callable, Optional @@ -31,12 +32,13 @@ def test_foo(): import pytest from pandas.compat import is_platform_32bit, is_platform_windows +from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import _np_version from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR -def safe_import(mod_name, min_version=None): +def safe_import(mod_name: str, min_version: Optional[str] = None): """ Parameters: ----------- @@ -109,7 +111,7 @@ def _skip_if_not_us_locale(): return True -def _skip_if_no_scipy(): +def _skip_if_no_scipy() -> bool: return not ( safe_import("scipy.stats") and safe_import("scipy.sparse") @@ -128,7 +130,7 @@ def skip_if_installed(package: str) -> Callable: The name of the package. 
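For the rewrite_exception context manager retyped a little earlier: an exception raised inside the block is re-raised with old_name swapped for new_name in its first message argument. A minimal sketch; the names below are illustrative.

from pandas.util._exceptions import rewrite_exception

try:
    with rewrite_exception("OldIndex", "NewIndex"):
        raise KeyError("OldIndex does not support this operation")
except KeyError as err:
    print(err)  # 'NewIndex does not support this operation'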
""" return pytest.mark.skipif( - safe_import(package), reason="Skipping because {} is installed.".format(package) + safe_import(package), reason=f"Skipping because {package} is installed." ) @@ -162,9 +164,9 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: a pytest.mark.skipif to use as either a test decorator or a parametrization mark. """ - msg = "Could not import '{}'".format(package) + msg = f"Could not import '{package}'" if min_version: - msg += " satisfying a min_version of {}".format(min_version) + msg += f" satisfying a min_version of {min_version}" return pytest.mark.skipif( not safe_import(package, min_version=min_version), reason=msg ) @@ -180,26 +182,25 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: is_platform_windows(), reason="not used on win32" ) skip_if_has_locale = pytest.mark.skipif( - _skip_if_has_locale(), - reason="Specific locale is set {lang}".format(lang=locale.getlocale()[0]), + _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", ) skip_if_not_us_locale = pytest.mark.skipif( - _skip_if_not_us_locale(), - reason="Specific locale is set {lang}".format(lang=locale.getlocale()[0]), + _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", ) skip_if_no_scipy = pytest.mark.skipif( _skip_if_no_scipy(), reason="Missing SciPy requirement" ) skip_if_no_ne = pytest.mark.skipif( not _USE_NUMEXPR, - reason="numexpr enabled->{enabled}, " - "installed->{installed}".format(enabled=_USE_NUMEXPR, installed=_NUMEXPR_INSTALLED), + reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{_NUMEXPR_INSTALLED}", ) -def skip_if_np_lt(ver_str, reason=None, *args, **kwds): +def skip_if_np_lt( + ver_str: str, reason: Optional[str] = None, *args, **kwds +) -> Callable: if reason is None: - reason = "NumPy %s or greater required" % ver_str + reason = f"NumPy {ver_str} or greater required" return pytest.mark.skipif( _np_version < LooseVersion(ver_str), reason=reason, *args, **kwds ) @@ -213,14 +214,14 @@ def parametrize_fixture_doc(*args): initial fixture docstring by replacing placeholders {0}, {1} etc with parameters passed as arguments. - Parameters: + Parameters ---------- - args: iterable - Positional arguments for docstring. + args: iterable + Positional arguments for docstring. - Returns: + Returns ------- - documented_fixture: function + function The decorated function wrapped within a pytest ``parametrize_fixture_doc`` mark """ @@ -230,3 +231,34 @@ def documented_fixture(fixture): return fixture return documented_fixture + + +def check_file_leaks(func) -> Callable: + """ + Decorate a test function to check that we are not leaking file descriptors. + """ + psutil = safe_import("psutil") + if not psutil: + return func + + @wraps(func) + def new_func(*args, **kwargs): + proc = psutil.Process() + flist = proc.open_files() + + func(*args, **kwargs) + + flist2 = proc.open_files() + assert flist2 == flist + + return new_func + + +def async_mark(): + try: + import_optional_dependency("pytest_asyncio") + async_mark = pytest.mark.asyncio + except ImportError: + async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") + + return async_mark diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 0f5324c8d02ba..b299f3790ab22 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -1,5 +1,5 @@ """ -Entrypoint for testing from the top-level namespace +Entrypoint for testing from the top-level namespace.
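A hedged sketch of how the decorators changed and added above are meant to be stacked on a test function; the package name, version pin, and test body are illustrative only.

import pandas.util._test_decorators as td


@td.skip_if_no("scipy", min_version="1.3")
@td.check_file_leaks
def test_roundtrip_smoke(tmp_path):
    # check_file_leaks snapshots psutil's open-file list before and after the
    # body, so anything opened here must be closed before the test returns.
    with open(tmp_path / "data.csv", "w") as fh:
        fh.write("a,b\n1,2\n")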
""" import os import sys @@ -11,7 +11,7 @@ def test(extra_args=None): try: import pytest except ImportError: - raise ImportError("Need pytest>=4.0.2 to run tests") + raise ImportError("Need pytest>=5.0.1 to run tests") try: import hypothesis # noqa except ImportError: @@ -22,7 +22,8 @@ def test(extra_args=None): extra_args = [extra_args] cmd = extra_args cmd += [PKG] - print("running: pytest {}".format(" ".join(cmd))) + joined = " ".join(cmd) + print(f"running: pytest {joined}") sys.exit(pytest.main(cmd)) diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 0eaf46d563163..b69c974661f89 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -15,7 +15,6 @@ def _check_arg_length(fname, args, max_fname_arg_count, compat_args): Checks whether 'args' has length of at most 'compat_args'. Raises a TypeError if that is not the case, similar to in Python when a function is called with too many arguments. - """ if max_fname_arg_count < 0: raise ValueError("'max_fname_arg_count' must be non-negative") @@ -26,13 +25,8 @@ def _check_arg_length(fname, args, max_fname_arg_count, compat_args): argument = "argument" if max_arg_count == 1 else "arguments" raise TypeError( - "{fname}() takes at most {max_arg} {argument} " - "({given_arg} given)".format( - fname=fname, - max_arg=max_arg_count, - argument=argument, - given_arg=actual_arg_count, - ) + f"{fname}() takes at most {max_arg_count} {argument} " + f"({actual_arg_count} given)" ) @@ -43,7 +37,6 @@ def _check_for_default_values(fname, arg_val_dict, compat_args): Note that this function is to be called only when it has been checked that arg_val_dict.keys() is a subset of compat_args - """ for key in arg_val_dict: # try checking equality directly with '=' operator, @@ -70,11 +63,8 @@ def _check_for_default_values(fname, arg_val_dict, compat_args): if not match: raise ValueError( - ( - "the '{arg}' parameter is not " - "supported in the pandas " - "implementation of {fname}()".format(fname=fname, arg=key) - ) + f"the '{key}' parameter is not supported in " + f"the pandas implementation of {fname}()" ) @@ -84,32 +74,30 @@ def validate_args(fname, args, max_fname_arg_count, compat_args): has at most `len(compat_args)` arguments and whether or not all of these elements in `args` are set to their default values. - fname: str + Parameters + ---------- + fname : str The name of the function being passed the `*args` parameter - - args: tuple + args : tuple The `*args` parameter passed into a function - - max_fname_arg_count: int + max_fname_arg_count : int The maximum number of arguments that the function `fname` can accept, excluding those in `args`. Used for displaying appropriate error messages. Must be non-negative. - - compat_args: OrderedDict - A ordered dictionary of keys and their associated default values. + compat_args : dict + A dictionary of keys and their associated default values. In order to accommodate buggy behaviour in some versions of `numpy`, where a signature displayed keyword arguments but then passed those arguments **positionally** internally when calling downstream - implementations, an ordered dictionary ensures that the original - order of the keyword arguments is enforced. Note that if there is - only one key, a generic dict can be passed in as well. - + implementations, a dict ensures that the original + order of the keyword arguments is enforced. 
Raises ------ - TypeError if `args` contains more values than there are `compat_args` - ValueError if `args` contains values that do not correspond to those - of the default values specified in `compat_args` - + TypeError + If `args` contains more values than there are `compat_args` + ValueError + If `args` contains values that do not correspond to those + of the default values specified in `compat_args` """ _check_arg_length(fname, args, max_fname_arg_count, compat_args) @@ -124,19 +112,13 @@ def _check_for_invalid_keys(fname, kwargs, compat_args): """ Checks whether 'kwargs' contains any keys that are not in 'compat_args' and raises a TypeError if there is one. - """ # set(dict) --> set of the dictionary's keys diff = set(kwargs) - set(compat_args) if diff: bad_arg = list(diff)[0] - raise TypeError( - ( - "{fname}() got an unexpected " - "keyword argument '{arg}'".format(fname=fname, arg=bad_arg) - ) - ) + raise TypeError(f"{fname}() got an unexpected keyword argument '{bad_arg}'") def validate_kwargs(fname, kwargs, compat_args): @@ -147,12 +129,10 @@ def validate_kwargs(fname, kwargs, compat_args): Parameters ---------- - fname: str + fname : str The name of the function being passed the `**kwargs` parameter - - kwargs: dict + kwargs : dict The `**kwargs` parameter passed into `fname` - compat_args: dict A dictionary of keys that `kwargs` is allowed to have and their associated default values @@ -162,7 +142,6 @@ def validate_kwargs(fname, kwargs, compat_args): TypeError if `kwargs` contains keys not in `compat_args` ValueError if `kwargs` contains keys in `compat_args` that do not map to the default values specified in `compat_args` - """ kwds = kwargs.copy() _check_for_invalid_keys(fname, kwargs, compat_args) @@ -179,22 +158,17 @@ def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_ar ---------- fname: str The name of the function being passed the `**kwargs` parameter - args: tuple The `*args` parameter passed into a function - kwargs: dict The `**kwargs` parameter passed into `fname` - max_fname_arg_count: int The minimum number of arguments that the function `fname` requires, excluding those in `args`. Used for displaying appropriate error messages. Must be non-negative. - - compat_args: OrderedDict - A ordered dictionary of keys that `kwargs` is allowed to - have and their associated default values. Note that if there - is only one key, a generic dict can be passed in as well. + compat_args: dict + A dictionary of keys that `kwargs` is allowed to + have and their associated default values. Raises ------ @@ -223,8 +197,7 @@ def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_ar for key in args_dict: if key in kwargs: raise TypeError( - "{fname}() got multiple values for keyword " - "argument '{arg}'".format(fname=fname, arg=key) + f"{fname}() got multiple values for keyword argument '{key}'" ) kwargs.update(args_dict) @@ -235,8 +208,8 @@ def validate_bool_kwarg(value, arg_name): """ Ensures that argument passed in arg_name is of type bool. """ if not (is_bool(value) or value is None): raise ValueError( - 'For argument "{arg}" expected type bool, received ' - "type {typ}.".format(arg=arg_name, typ=type(value).__name__) + f'For argument "{arg_name}" expected type bool, received ' + f"type {type(value).__name__}." ) return value @@ -289,9 +262,7 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): # First fill with explicit values provided by the user... 
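A minimal sketch of the validate_kwargs contract documented a little earlier; the function name and compat_args mapping below are illustrative only.

from pandas.util._validators import validate_kwargs

compat_args = {"axis": 0, "out": None}
validate_kwargs("sum", {"out": None}, compat_args)   # passes: known key, default value
# validate_kwargs("sum", {"dtype": None}, compat_args) would raise
# TypeError: sum() got an unexpected keyword argument 'dtype'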
if arg_name in kwargs: if args: - msg = "{} got multiple values for argument '{}'".format( - method_name, arg_name - ) + msg = f"{method_name} got multiple values for argument '{arg_name}'" raise TypeError(msg) axis = data._get_axis_name(kwargs.get("axis", 0)) @@ -332,8 +303,8 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): out[data._AXIS_NAMES[0]] = args[0] out[data._AXIS_NAMES[1]] = args[1] else: - msg = "Cannot specify all of '{}', 'index', 'columns'." - raise TypeError(msg.format(arg_name)) + msg = f"Cannot specify all of '{arg_name}', 'index', 'columns'." + raise TypeError(msg) return out @@ -366,7 +337,7 @@ def validate_fillna_kwargs(value, method, validate_scalar_dict_value=True): if validate_scalar_dict_value and isinstance(value, (list, tuple)): raise TypeError( '"value" parameter must be a scalar or dict, but ' - 'you passed a "{0}"'.format(type(value).__name__) + f'you passed a "{type(value).__name__}"' ) elif value is not None and method is not None: diff --git a/pandas/util/move.c b/pandas/util/move.c deleted file mode 100644 index 1c29a4c214909..0000000000000 --- a/pandas/util/move.c +++ /dev/null @@ -1,212 +0,0 @@ -/* -Copyright (c) 2019, PyData Development Team -All rights reserved. - -Distributed under the terms of the BSD Simplified License. - -The full license is in the LICENSE file, distributed with this software. -*/ - -#define PY_SSIZE_T_CLEAN -#include - -#ifndef Py_TPFLAGS_HAVE_GETCHARBUFFER -#define Py_TPFLAGS_HAVE_GETCHARBUFFER 0 -#endif // Py_TPFLAGS_HAVE_GETCHARBUFFER - -#ifndef Py_TPFLAGS_HAVE_NEWBUFFER -#define Py_TPFLAGS_HAVE_NEWBUFFER 0 -#endif // Py_TPFLAGS_HAVE_NEWBUFFER - -static PyObject *badmove; /* bad move exception class */ - -typedef struct { - PyObject_HEAD - /* the bytes that own the buffer we are mutating */ - PyObject *invalid_bytes; -} stolenbufobject; - -static PyTypeObject stolenbuf_type; /* forward declare type */ - -static void -stolenbuf_dealloc(stolenbufobject *self) { - Py_DECREF(self->invalid_bytes); - PyObject_Del(self); -} - -static int -stolenbuf_getbuffer(stolenbufobject *self, Py_buffer *view, int flags) { - return PyBuffer_FillInfo(view, - (PyObject*) self, - (void*) PyBytes_AS_STRING(self->invalid_bytes), - PyBytes_GET_SIZE(self->invalid_bytes), - 0, /* not readonly */ - flags); -} - -static PyBufferProcs stolenbuf_as_buffer = { - (getbufferproc) stolenbuf_getbuffer, - NULL, -}; - -PyDoc_STRVAR(stolenbuf_doc, - "A buffer that is wrapping a stolen bytes object's buffer."); - -static PyTypeObject stolenbuf_type = { - PyVarObject_HEAD_INIT(NULL, 0) - "pandas.util._move.stolenbuf", /* tp_name */ - sizeof(stolenbufobject), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor) stolenbuf_dealloc, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - &stolenbuf_as_buffer, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT | - Py_TPFLAGS_HAVE_NEWBUFFER | - Py_TPFLAGS_HAVE_GETCHARBUFFER, /* tp_flags */ - stolenbuf_doc, /* tp_doc */ -}; - -PyDoc_STRVAR( - move_into_mutable_buffer_doc, - "Moves a bytes object that is about to be destroyed into a mutable buffer\n" - "without copying the data.\n" - "\n" - "Parameters\n" - "----------\n" - "bytes_rvalue : bytes with 1 refcount.\n" - " The bytes object that you want to move into a mutable buffer. 
This\n" - " cannot be a named object. It must only have a single reference.\n" - "\n" - "Returns\n" - "-------\n" - "buf : stolenbuf\n" - " An object that supports the buffer protocol which can give a mutable\n" - " view of the data that was previously owned by ``bytes_rvalue``.\n" - "\n" - "Raises\n" - "------\n" - "BadMove\n" - " Raised when a move is attempted on an object with more than one\n" - " reference.\n" - "\n" - "Notes\n" - "-----\n" - "If you want to use this function you are probably wrong.\n" - "\n" - "Warning: Do not call this function through *unpacking. This can\n" - "potentially trick the reference checks which may allow you to get a\n" - "mutable reference to a shared string!\n" - "\n"); - -/* This is implemented as a standalone function instead of the ``tp_new`` of - ``stolenbuf`` because we need to create a function using the METH_O flag - to support Python 3.6. In python 3.6, PyCFunction calls from python code now - count the reference owned by the argument tuple. This would cause the object - to have 2 references if used with a direct call like: ``stolenbuf(a)``; - however, if called through *unpacking like ``stolenbuf(*(a,))`` it would - only have the one reference (the tuple). */ -static PyObject* -move_into_mutable_buffer(PyObject *self, PyObject *bytes_rvalue) { - stolenbufobject *ret; - - if (!PyBytes_CheckExact(bytes_rvalue)) { - PyErr_SetString(PyExc_TypeError, - "stolenbuf can only steal from bytes objects"); - return NULL; - } - - if (Py_REFCNT(bytes_rvalue) != 1) { - // there is a reference other than the caller's stack - PyErr_SetObject(badmove, bytes_rvalue); - return NULL; - } - - if (!(ret = PyObject_New(stolenbufobject, &stolenbuf_type))) { - return NULL; - } - - /* store the original bytes object in a field that is not - exposed to python */ - Py_INCREF(bytes_rvalue); - ret->invalid_bytes = bytes_rvalue; - return (PyObject*) ret; -} - -static PyMethodDef methods[] = { - {"move_into_mutable_buffer", - (PyCFunction) move_into_mutable_buffer, - METH_O, - move_into_mutable_buffer_doc}, - {NULL}, -}; - -#define MODULE_NAME "pandas.util._move" - -static PyModuleDef move_module = { - PyModuleDef_HEAD_INIT, - MODULE_NAME, - NULL, - -1, - methods, -}; - -PyDoc_STRVAR( - badmove_doc, - "Exception used to indicate that a move was attempted on a value with\n" - "more than a single reference.\n" - "\n" - "Parameters\n" - "----------\n" - "data : any\n" - " The data which was passed to ``move_into_mutable_buffer``.\n" - "\n" - "See Also\n" - "--------\n" - "pandas.util._move.move_into_mutable_buffer\n"); - -PyMODINIT_FUNC -#define ERROR_RETURN NULL -PyInit__move(void) { - PyObject *m; - - if (!(badmove = PyErr_NewExceptionWithDoc("pandas.util._move.BadMove", - badmove_doc, - NULL, - NULL))) { - return ERROR_RETURN; - } - - if (PyType_Ready(&stolenbuf_type)) { - return ERROR_RETURN; - } - - if (!(m = PyModule_Create(&move_module))) { - return ERROR_RETURN; - } - - if (PyModule_AddObject(m, - "stolenbuf", - (PyObject*) &stolenbuf_type)) { - Py_DECREF(m); - return ERROR_RETURN; - } - - if (PyModule_AddObject(m, "BadMove", badmove)) { - Py_DECREF(m); - return ERROR_RETURN; - } - - return m; -} diff --git a/pandas/util/testing.py b/pandas/util/testing.py index f3b0226547c78..af9fe4846b27d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1,2787 +1,12 @@ -import bz2 -from collections import Counter -from contextlib import contextmanager -from datetime import datetime -from functools import wraps -import gzip -import os -from shutil import rmtree 
-import string -import tempfile -from typing import Union, cast import warnings -import zipfile -import numpy as np -from numpy.random import rand, randn +from pandas._testing import * # noqa -from pandas._config.localization import ( # noqa:F401 - can_set_locale, - get_locales, - set_locale, +warnings.warn( + ( + "pandas.util.testing is deprecated. Use the functions in the " + "public API at pandas.testing instead." + ), + FutureWarning, + stacklevel=2, ) - -import pandas._libs.testing as _testing -from pandas.compat import _get_lzma_file, _import_lzma - -from pandas.core.dtypes.common import ( - is_bool, - is_categorical_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_extension_array_dtype, - is_interval_dtype, - is_list_like, - is_number, - is_period_dtype, - is_sequence, - is_timedelta64_dtype, - needs_i8_conversion, -) -from pandas.core.dtypes.missing import array_equivalent - -import pandas as pd -from pandas import ( - Categorical, - CategoricalIndex, - DataFrame, - DatetimeIndex, - Index, - IntervalIndex, - MultiIndex, - RangeIndex, - Series, - bdate_range, -) -from pandas.core.algorithms import take_1d -from pandas.core.arrays import ( - DatetimeArray, - ExtensionArray, - IntervalArray, - PeriodArray, - TimedeltaArray, - period_array, -) - -from pandas.io.common import urlopen -from pandas.io.formats.printing import pprint_thing - -lzma = _import_lzma() - -N = 30 -K = 4 -_RAISE_NETWORK_ERROR_DEFAULT = False - -# set testing_mode -_testing_mode_warnings = (DeprecationWarning, ResourceWarning) - - -def set_testing_mode(): - # set the testing mode filters - testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") - if "deprecate" in testing_mode: - warnings.simplefilter("always", _testing_mode_warnings) - - -def reset_testing_mode(): - # reset the testing mode filters - testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") - if "deprecate" in testing_mode: - warnings.simplefilter("ignore", _testing_mode_warnings) - - -set_testing_mode() - - -def reset_display_options(): - """ - Reset the display options for printing and representing objects. - """ - - pd.reset_option("^display.", silent=True) - - -def round_trip_pickle(obj, path=None): - """ - Pickle an object and then read it again. - - Parameters - ---------- - obj : pandas object - The object to pickle and then re-read. - path : str, default None - The path where the pickled object is written and then read. - - Returns - ------- - round_trip_pickled_object : pandas object - The original object that was pickled and then re-read. - """ - - if path is None: - path = "__{random_bytes}__.pickle".format(random_bytes=rands(10)) - with ensure_clean(path) as path: - pd.to_pickle(obj, path) - return pd.read_pickle(path) - - -def round_trip_pathlib(writer, reader, path=None): - """ - Write an object to file specified by a pathlib.Path and read it back - - Parameters - ---------- - writer : callable bound to pandas object - IO writing function (e.g. DataFrame.to_csv ) - reader : callable - IO reading function (e.g. pd.read_csv ) - path : str, default None - The path where the object is written and then read. - - Returns - ------- - round_trip_object : pandas object - The original object that was serialized and then re-read. 
- """ - - import pytest - - Path = pytest.importorskip("pathlib").Path - if path is None: - path = "___pathlib___" - with ensure_clean(path) as path: - writer(Path(path)) - obj = reader(Path(path)) - return obj - - -def round_trip_localpath(writer, reader, path=None): - """ - Write an object to file specified by a py.path LocalPath and read it back - - Parameters - ---------- - writer : callable bound to pandas object - IO writing function (e.g. DataFrame.to_csv ) - reader : callable - IO reading function (e.g. pd.read_csv ) - path : str, default None - The path where the object is written and then read. - - Returns - ------- - round_trip_object : pandas object - The original object that was serialized and then re-read. - """ - import pytest - - LocalPath = pytest.importorskip("py.path").local - if path is None: - path = "___localpath___" - with ensure_clean(path) as path: - writer(LocalPath(path)) - obj = reader(LocalPath(path)) - return obj - - -@contextmanager -def decompress_file(path, compression): - """ - Open a compressed file and return a file object - - Parameters - ---------- - path : str - The path where the file is read from - - compression : {'gzip', 'bz2', 'zip', 'xz', None} - Name of the decompression to use - - Returns - ------- - f : file object - """ - - if compression is None: - f = open(path, "rb") - elif compression == "gzip": - f = gzip.open(path, "rb") - elif compression == "bz2": - f = bz2.BZ2File(path, "rb") - elif compression == "xz": - f = _get_lzma_file(lzma)(path, "rb") - elif compression == "zip": - zip_file = zipfile.ZipFile(path) - zip_names = zip_file.namelist() - if len(zip_names) == 1: - f = zip_file.open(zip_names.pop()) - else: - raise ValueError("ZIP file {} error. Only one file per ZIP.".format(path)) - else: - msg = "Unrecognized compression type: {}".format(compression) - raise ValueError(msg) - - try: - yield f - finally: - f.close() - if compression == "zip": - zip_file.close() - - -def write_to_compressed(compression, path, data, dest="test"): - """ - Write data to a compressed file. - - Parameters - ---------- - compression : {'gzip', 'bz2', 'zip', 'xz'} - The compression type to use. - path : str - The file path to write the data. - data : str - The data to write. - dest : str, default "test" - The destination file (for ZIP only) - - Raises - ------ - ValueError : An invalid compression value was passed in. - """ - - if compression == "zip": - import zipfile - - compress_method = zipfile.ZipFile - elif compression == "gzip": - import gzip - - compress_method = gzip.GzipFile - elif compression == "bz2": - import bz2 - - compress_method = bz2.BZ2File - elif compression == "xz": - compress_method = _get_lzma_file(lzma) - else: - msg = "Unrecognized compression type: {}".format(compression) - raise ValueError(msg) - - if compression == "zip": - mode = "w" - args = (dest, data) - method = "writestr" - else: - mode = "wb" - args = (data,) - method = "write" - - with compress_method(path, mode=mode) as f: - getattr(f, method)(*args) - - -def assert_almost_equal( - left, right, check_dtype="equiv", check_less_precise=False, **kwargs -): - """ - Check that the left and right objects are approximately equal. - - By approximately equal, we refer to objects that are numbers or that - contain numbers which may be equivalent to specific levels of precision. - - Parameters - ---------- - left : object - right : object - check_dtype : bool or {'equiv'}, default 'equiv' - Check dtype if both a and b are the same type. 
If 'equiv' is passed in, - then `RangeIndex` and `Int64Index` are also considered equivalent - when doing type checking. - check_less_precise : bool or int, default False - Specify comparison precision. 5 digits (False) or 3 digits (True) - after decimal points are compared. If int, then specify the number - of digits to compare. - - When comparing two numbers, if the first number has magnitude less - than 1e-5, we compare the two numbers directly and check whether - they are equivalent within the specified precision. Otherwise, we - compare the **ratio** of the second number to the first number and - check whether it is equivalent to 1 within the specified precision. - """ - - if isinstance(left, pd.Index): - assert_index_equal( - left, - right, - check_exact=False, - exact=check_dtype, - check_less_precise=check_less_precise, - **kwargs - ) - - elif isinstance(left, pd.Series): - assert_series_equal( - left, - right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs - ) - - elif isinstance(left, pd.DataFrame): - assert_frame_equal( - left, - right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs - ) - - else: - # Other sequences. - if check_dtype: - if is_number(left) and is_number(right): - # Do not compare numeric classes, like np.float64 and float. - pass - elif is_bool(left) and is_bool(right): - # Do not compare bool classes, like np.bool_ and bool. - pass - else: - if isinstance(left, np.ndarray) or isinstance(right, np.ndarray): - obj = "numpy array" - else: - obj = "Input" - assert_class_equal(left, right, obj=obj) - _testing.assert_almost_equal( - left, - right, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs - ) - - -def _check_isinstance(left, right, cls): - """ - Helper method for our assert_* methods that ensures that - the two objects being compared have the right type before - proceeding with the comparison. - - Parameters - ---------- - left : The first object being compared. - right : The second object being compared. - cls : The class type to check against. - - Raises - ------ - AssertionError : Either `left` or `right` is not an instance of `cls`. 
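# Sketch of assert_almost_equal as described above: values are compared to roughly
# five decimal places by default, and numpy arrays are checked element-wise.
# A sketch only; assumes pandas.util.testing from this changeset.
import numpy as np
import pandas.util.testing as tm

tm.assert_almost_equal(0.123456789, 0.123457)              # scalars within default precision
tm.assert_almost_equal(np.array([1.0, 2.0]),
                       np.array([1.0, 2.0000001]))          # ndarray, element-wise ratio check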
- """ - - err_msg = "{name} Expected type {exp_type}, found {act_type} instead" - cls_name = cls.__name__ - - if not isinstance(left, cls): - raise AssertionError( - err_msg.format(name=cls_name, exp_type=cls, act_type=type(left)) - ) - if not isinstance(right, cls): - raise AssertionError( - err_msg.format(name=cls_name, exp_type=cls, act_type=type(right)) - ) - - -def assert_dict_equal(left, right, compare_keys=True): - - _check_isinstance(left, right, dict) - _testing.assert_dict_equal(left, right, compare_keys=compare_keys) - - -def randbool(size=(), p=0.5): - return rand(*size) <= p - - -RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) -RANDU_CHARS = np.array( - list("".join(map(chr, range(1488, 1488 + 26))) + string.digits), - dtype=(np.unicode_, 1), -) - - -def rands_array(nchars, size, dtype="O"): - """Generate an array of byte strings.""" - retval = ( - np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) - .view((np.str_, nchars)) - .reshape(size) - ) - if dtype is None: - return retval - else: - return retval.astype(dtype) - - -def randu_array(nchars, size, dtype="O"): - """Generate an array of unicode strings.""" - retval = ( - np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) - .view((np.unicode_, nchars)) - .reshape(size) - ) - if dtype is None: - return retval - else: - return retval.astype(dtype) - - -def rands(nchars): - """ - Generate one random byte string. - - See `rands_array` if you want to create an array of random strings. - - """ - return "".join(np.random.choice(RANDS_CHARS, nchars)) - - -def randu(nchars): - """ - Generate one random unicode string. - - See `randu_array` if you want to create an array of random unicode strings. - - """ - return "".join(np.random.choice(RANDU_CHARS, nchars)) - - -def close(fignum=None): - from matplotlib.pyplot import get_fignums, close as _close - - if fignum is None: - for fignum in get_fignums(): - _close(fignum) - else: - _close(fignum) - - -# ----------------------------------------------------------------------------- -# contextmanager to ensure the file cleanup - - -@contextmanager -def ensure_clean(filename=None, return_filelike=False): - """Gets a temporary path and agrees to remove on close. - - Parameters - ---------- - filename : str (optional) - if None, creates a temporary file which is then removed when out of - scope. if passed, creates temporary file with filename as ending. - return_filelike : bool (default False) - if True, returns a file-like which is *always* cleaned. Necessary for - savefig and other functions which want to append extensions. 
- """ - filename = filename or "" - fd = None - - if return_filelike: - f = tempfile.TemporaryFile(suffix=filename) - try: - yield f - finally: - f.close() - else: - # don't generate tempfile if using a path with directory specified - if len(os.path.dirname(filename)): - raise ValueError("Can't pass a qualified name to ensure_clean()") - - try: - fd, filename = tempfile.mkstemp(suffix=filename) - except UnicodeEncodeError: - import pytest - - pytest.skip("no unicode file names on this system") - - try: - yield filename - finally: - try: - os.close(fd) - except OSError: - print( - "Couldn't close file descriptor: {fdesc} (file: {fname})".format( - fdesc=fd, fname=filename - ) - ) - try: - if os.path.exists(filename): - os.remove(filename) - except OSError as e: - print("Exception on removing file: {error}".format(error=e)) - - -@contextmanager -def ensure_clean_dir(): - """ - Get a temporary directory path and agrees to remove on close. - - Yields - ------ - Temporary directory path - """ - directory_name = tempfile.mkdtemp(suffix="") - try: - yield directory_name - finally: - try: - rmtree(directory_name) - except OSError: - pass - - -@contextmanager -def ensure_safe_environment_variables(): - """ - Get a context manager to safely set environment variables - - All changes will be undone on close, hence environment variables set - within this contextmanager will neither persist nor change global state. - """ - saved_environ = dict(os.environ) - try: - yield - finally: - os.environ.clear() - os.environ.update(saved_environ) - - -# ----------------------------------------------------------------------------- -# Comparators - - -def equalContents(arr1, arr2): - """Checks if the set of unique elements of arr1 and arr2 are equivalent. - """ - return frozenset(arr1) == frozenset(arr2) - - -def assert_index_equal( - left: Index, - right: Index, - exact: Union[bool, str] = "equiv", - check_names: bool = True, - check_less_precise: Union[bool, int] = False, - check_exact: bool = True, - check_categorical: bool = True, - obj: str = "Index", -) -> None: - """ - Check that left and right Index are equal. - - Parameters - ---------- - left : Index - right : Index - exact : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. If 'equiv', then RangeIndex can be substituted for - Int64Index as well. - check_names : bool, default True - Whether to check the names attribute. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare - check_exact : bool, default True - Whether to compare number exactly. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. 
- obj : str, default 'Index' - Specify object name being compared, internally used to show appropriate - assertion message - """ - __tracebackhide__ = True - - def _check_types(l, r, obj="Index"): - if exact: - assert_class_equal(l, r, exact=exact, obj=obj) - - # Skip exact dtype checking when `check_categorical` is False - if check_categorical: - assert_attr_equal("dtype", l, r, obj=obj) - - # allow string-like to have different inferred_types - if l.inferred_type in ("string", "unicode"): - assert r.inferred_type in ("string", "unicode") - else: - assert_attr_equal("inferred_type", l, r, obj=obj) - - def _get_ilevel_values(index, level): - # accept level number only - unique = index.levels[level] - labels = index.codes[level] - filled = take_1d(unique.values, labels, fill_value=unique._na_value) - values = unique._shallow_copy(filled, name=index.names[level]) - return values - - # instance validation - _check_isinstance(left, right, Index) - - # class / dtype comparison - _check_types(left, right, obj=obj) - - # level comparison - if left.nlevels != right.nlevels: - msg1 = "{obj} levels are different".format(obj=obj) - msg2 = "{nlevels}, {left}".format(nlevels=left.nlevels, left=left) - msg3 = "{nlevels}, {right}".format(nlevels=right.nlevels, right=right) - raise_assert_detail(obj, msg1, msg2, msg3) - - # length comparison - if len(left) != len(right): - msg1 = "{obj} length are different".format(obj=obj) - msg2 = "{length}, {left}".format(length=len(left), left=left) - msg3 = "{length}, {right}".format(length=len(right), right=right) - raise_assert_detail(obj, msg1, msg2, msg3) - - # MultiIndex special comparison for little-friendly error messages - if left.nlevels > 1: - left = cast(MultiIndex, left) - right = cast(MultiIndex, right) - - for level in range(left.nlevels): - # cannot use get_level_values here because it can change dtype - llevel = _get_ilevel_values(left, level) - rlevel = _get_ilevel_values(right, level) - - lobj = "MultiIndex level [{level}]".format(level=level) - assert_index_equal( - llevel, - rlevel, - exact=exact, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - obj=lobj, - ) - # get_level_values may change dtype - _check_types(left.levels[level], right.levels[level], obj=obj) - - # skip exact index checking when `check_categorical` is False - if check_exact and check_categorical: - if not left.equals(right): - diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) - msg = "{obj} values are different ({pct} %)".format( - obj=obj, pct=np.round(diff, 5) - ) - raise_assert_detail(obj, msg, left, right) - else: - _testing.assert_almost_equal( - left.values, - right.values, - check_less_precise=check_less_precise, - check_dtype=exact, - obj=obj, - lobj=left, - robj=right, - ) - - # metadata comparison - if check_names: - assert_attr_equal("names", left, right, obj=obj) - if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): - assert_attr_equal("freq", left, right, obj=obj) - if isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex): - assert_interval_array_equal(left.values, right.values) - - if check_categorical: - if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal( - left.values, right.values, obj="{obj} category".format(obj=obj) - ) - - -def assert_class_equal(left, right, exact=True, obj="Input"): - """checks classes are equal.""" - __tracebackhide__ = True - - def repr_class(x): - if isinstance(x, Index): - # return 
Index as it is to include values in the error message - return x - - try: - return x.__class__.__name__ - except AttributeError: - return repr(type(x)) - - if exact == "equiv": - if type(left) != type(right): - # allow equivalence of Int64Index/RangeIndex - types = {type(left).__name__, type(right).__name__} - if len(types - {"Int64Index", "RangeIndex"}): - msg = "{obj} classes are not equivalent".format(obj=obj) - raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) - elif exact: - if type(left) != type(right): - msg = "{obj} classes are different".format(obj=obj) - raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) - - -def assert_attr_equal(attr, left, right, obj="Attributes"): - """checks attributes are equal. Both objects must have attribute. - - Parameters - ---------- - attr : str - Attribute name being compared. - left : object - right : object - obj : str, default 'Attributes' - Specify object name being compared, internally used to show appropriate - assertion message - """ - __tracebackhide__ = True - - left_attr = getattr(left, attr) - right_attr = getattr(right, attr) - - if left_attr is right_attr: - return True - elif ( - is_number(left_attr) - and np.isnan(left_attr) - and is_number(right_attr) - and np.isnan(right_attr) - ): - # np.nan - return True - - try: - result = left_attr == right_attr - except TypeError: - # datetimetz on rhs may raise TypeError - result = False - if not isinstance(result, bool): - result = result.all() - - if result: - return True - else: - msg = 'Attribute "{attr}" are different'.format(attr=attr) - raise_assert_detail(obj, msg, left_attr, right_attr) - - -def assert_is_valid_plot_return_object(objs): - import matplotlib.pyplot as plt - - if isinstance(objs, (pd.Series, np.ndarray)): - for el in objs.ravel(): - msg = ( - "one of 'objs' is not a matplotlib Axes instance, type " - "encountered {name!r}" - ).format(name=el.__class__.__name__) - assert isinstance(el, (plt.Axes, dict)), msg - else: - assert isinstance(objs, (plt.Artist, tuple, dict)), ( - "objs is neither an ndarray of Artist instances nor a " - 'single Artist instance, tuple, or dict, "objs" is a {name!r}'.format( - name=objs.__class__.__name__ - ) - ) - - -def isiterable(obj): - return hasattr(obj, "__iter__") - - -def assert_is_sorted(seq): - """Assert that the sequence is sorted.""" - if isinstance(seq, (Index, Series)): - seq = seq.values - # sorting does not change precisions - assert_numpy_array_equal(seq, np.sort(np.array(seq))) - - -def assert_categorical_equal( - left, right, check_dtype=True, check_category_order=True, obj="Categorical" -): - """Test that Categoricals are equivalent. - - Parameters - ---------- - left : Categorical - right : Categorical - check_dtype : bool, default True - Check that integer dtype of the codes are the same - check_category_order : bool, default True - Whether the order of the categories should be compared, which - implies identical integer codes. If False, only the resulting - values are compared. The ordered attribute is - checked regardless. 
- obj : str, default 'Categorical' - Specify object name being compared, internally used to show appropriate - assertion message - """ - _check_isinstance(left, right, Categorical) - - if check_category_order: - assert_index_equal( - left.categories, right.categories, obj="{obj}.categories".format(obj=obj) - ) - assert_numpy_array_equal( - left.codes, - right.codes, - check_dtype=check_dtype, - obj="{obj}.codes".format(obj=obj), - ) - else: - assert_index_equal( - left.categories.sort_values(), - right.categories.sort_values(), - obj="{obj}.categories".format(obj=obj), - ) - assert_index_equal( - left.categories.take(left.codes), - right.categories.take(right.codes), - obj="{obj}.values".format(obj=obj), - ) - - assert_attr_equal("ordered", left, right, obj=obj) - - -def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray"): - """Test that two IntervalArrays are equivalent. - - Parameters - ---------- - left, right : IntervalArray - The IntervalArrays to compare. - exact : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. If 'equiv', then RangeIndex can be substituted for - Int64Index as well. - obj : str, default 'IntervalArray' - Specify object name being compared, internally used to show appropriate - assertion message - """ - _check_isinstance(left, right, IntervalArray) - - assert_index_equal( - left.left, right.left, exact=exact, obj="{obj}.left".format(obj=obj) - ) - assert_index_equal( - left.right, right.right, exact=exact, obj="{obj}.left".format(obj=obj) - ) - assert_attr_equal("closed", left, right, obj=obj) - - -def assert_period_array_equal(left, right, obj="PeriodArray"): - _check_isinstance(left, right, PeriodArray) - - assert_numpy_array_equal( - left._data, right._data, obj="{obj}.values".format(obj=obj) - ) - assert_attr_equal("freq", left, right, obj=obj) - - -def assert_datetime_array_equal(left, right, obj="DatetimeArray"): - __tracebackhide__ = True - _check_isinstance(left, right, DatetimeArray) - - assert_numpy_array_equal(left._data, right._data, obj="{obj}._data".format(obj=obj)) - assert_attr_equal("freq", left, right, obj=obj) - assert_attr_equal("tz", left, right, obj=obj) - - -def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): - __tracebackhide__ = True - _check_isinstance(left, right, TimedeltaArray) - assert_numpy_array_equal(left._data, right._data, obj="{obj}._data".format(obj=obj)) - assert_attr_equal("freq", left, right, obj=obj) - - -def raise_assert_detail(obj, message, left, right, diff=None): - __tracebackhide__ = True - - if isinstance(left, np.ndarray): - left = pprint_thing(left) - elif is_categorical_dtype(left): - left = repr(left) - - if isinstance(right, np.ndarray): - right = pprint_thing(right) - elif is_categorical_dtype(right): - right = repr(right) - - msg = """{obj} are different - -{message} -[left]: {left} -[right]: {right}""".format( - obj=obj, message=message, left=left, right=right - ) - - if diff is not None: - msg += "\n[diff]: {diff}".format(diff=diff) - - raise AssertionError(msg) - - -def assert_numpy_array_equal( - left, - right, - strict_nan=False, - check_dtype=True, - err_msg=None, - check_same=None, - obj="numpy array", -): - """ Checks that 'np.ndarray' is equivalent - - Parameters - ---------- - left : np.ndarray or iterable - right : np.ndarray or iterable - strict_nan : bool, default False - If True, consider NaN and None to be different. 
- check_dtype: bool, default True - check dtype if both a and b are np.ndarray - err_msg : str, default None - If provided, used as assertion message - check_same : None|'copy'|'same', default None - Ensure left and right refer/do not refer to the same memory area - obj : str, default 'numpy array' - Specify object name being compared, internally used to show appropriate - assertion message - """ - __tracebackhide__ = True - - # instance validation - # Show a detailed error message when classes are different - assert_class_equal(left, right, obj=obj) - # both classes must be an np.ndarray - _check_isinstance(left, right, np.ndarray) - - def _get_base(obj): - return obj.base if getattr(obj, "base", None) is not None else obj - - left_base = _get_base(left) - right_base = _get_base(right) - - if check_same == "same": - if left_base is not right_base: - msg = "{left!r} is not {right!r}".format(left=left_base, right=right_base) - raise AssertionError(msg) - elif check_same == "copy": - if left_base is right_base: - msg = "{left!r} is {right!r}".format(left=left_base, right=right_base) - raise AssertionError(msg) - - def _raise(left, right, err_msg): - if err_msg is None: - if left.shape != right.shape: - raise_assert_detail( - obj, - "{obj} shapes are different".format(obj=obj), - left.shape, - right.shape, - ) - - diff = 0 - for l, r in zip(left, right): - # count up differences - if not array_equivalent(l, r, strict_nan=strict_nan): - diff += 1 - - diff = diff * 100.0 / left.size - msg = "{obj} values are different ({pct} %)".format( - obj=obj, pct=np.round(diff, 5) - ) - raise_assert_detail(obj, msg, left, right) - - raise AssertionError(err_msg) - - # compare shape and values - if not array_equivalent(left, right, strict_nan=strict_nan): - _raise(left, right, err_msg) - - if check_dtype: - if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - assert_attr_equal("dtype", left, right, obj=obj) - - -def assert_extension_array_equal( - left, right, check_dtype=True, check_less_precise=False, check_exact=False -): - """Check that left and right ExtensionArrays are equal. - - Parameters - ---------- - left, right : ExtensionArray - The two arrays to compare - check_dtype : bool, default True - Whether to check if the ExtensionArray dtypes are identical. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - check_exact : bool, default False - Whether to compare number exactly. - - Notes - ----- - Missing values are checked separately from valid values. - A mask of missing values is computed for each and checked to match. - The remaining all-valid values are cast to object dtype and checked. 
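# Sketch of assert_extension_array_equal as described above: the NA masks are
# compared first, then the remaining valid values. Assumes the nullable "Int64"
# extension dtype is available; a sketch, not part of the diff.
import pandas as pd
import pandas.util.testing as tm

left = pd.array([1, 2, None], dtype="Int64")
right = pd.array([1, 2, None], dtype="Int64")
tm.assert_extension_array_equal(left, right)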
- """ - assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" - assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" - if check_dtype: - assert_attr_equal("dtype", left, right, obj="ExtensionArray") - - if hasattr(left, "asi8") and type(right) == type(left): - # Avoid slow object-dtype comparisons - assert_numpy_array_equal(left.asi8, right.asi8) - return - - left_na = np.asarray(left.isna()) - right_na = np.asarray(right.isna()) - assert_numpy_array_equal(left_na, right_na, obj="ExtensionArray NA mask") - - left_valid = np.asarray(left[~left_na].astype(object)) - right_valid = np.asarray(right[~right_na].astype(object)) - if check_exact: - assert_numpy_array_equal(left_valid, right_valid, obj="ExtensionArray") - else: - _testing.assert_almost_equal( - left_valid, - right_valid, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - obj="ExtensionArray", - ) - - -# This could be refactored to use the NDFrame.equals method -def assert_series_equal( - left, - right, - check_dtype=True, - check_index_type="equiv", - check_series_type=True, - check_less_precise=False, - check_names=True, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - obj="Series", -): - """ - Check that left and right Series are equal. - - Parameters - ---------- - left : Series - right : Series - check_dtype : bool, default True - Whether to check the Series dtype is identical. - check_index_type : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. - check_series_type : bool, default True - Whether to check the Series class is identical. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - - When comparing two numbers, if the first number has magnitude less - than 1e-5, we compare the two numbers directly and check whether - they are equivalent within the specified precision. Otherwise, we - compare the **ratio** of the second number to the first number and - check whether it is equivalent to 1 within the specified precision. - check_names : bool, default True - Whether to check the Series and Index names attribute. - check_exact : bool, default False - Whether to compare number exactly. - check_datetimelike_compat : bool, default False - Compare datetime-like which is comparable ignoring dtype. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - obj : str, default 'Series' - Specify object name being compared, internally used to show appropriate - assertion message. - """ - __tracebackhide__ = True - - # instance validation - _check_isinstance(left, right, Series) - - if check_series_type: - # ToDo: There are some tests using rhs is sparse - # lhs is dense. 
Should use assert_class_equal in future - assert isinstance(left, type(right)) - # assert_class_equal(left, right, obj=obj) - - # length comparison - if len(left) != len(right): - msg1 = "{len}, {left}".format(len=len(left), left=left.index) - msg2 = "{len}, {right}".format(len=len(right), right=right.index) - raise_assert_detail(obj, "Series length are different", msg1, msg2) - - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj="{obj}.index".format(obj=obj), - ) - - if check_dtype: - # We want to skip exact dtype checking when `check_categorical` - # is False. We'll still raise if only one is a `Categorical`, - # regardless of `check_categorical` - if ( - is_categorical_dtype(left) - and is_categorical_dtype(right) - and not check_categorical - ): - pass - else: - assert_attr_equal( - "dtype", left, right, obj="Attributes of {obj}".format(obj=obj) - ) - - if check_exact: - assert_numpy_array_equal( - left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype, - obj="{obj}".format(obj=obj), - ) - elif check_datetimelike_compat: - # we want to check only if we have compat dtypes - # e.g. integer and M|m are NOT compat, but we can simply check - # the values in that case - if needs_i8_conversion(left) or needs_i8_conversion(right): - - # datetimelike may have different objects (e.g. datetime.datetime - # vs Timestamp) but will compare equal - if not Index(left.values).equals(Index(right.values)): - msg = ( - "[datetimelike_compat=True] {left} is not equal to {right}." - ).format(left=left.values, right=right.values) - raise AssertionError(msg) - else: - assert_numpy_array_equal( - left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype, - ) - elif is_interval_dtype(left) or is_interval_dtype(right): - assert_interval_array_equal(left.array, right.array) - elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype): - # .values is an ndarray, but ._values is the ExtensionArray. - # TODO: Use .array - assert is_extension_array_dtype(right.dtype) - assert_extension_array_equal(left._values, right._values) - elif ( - is_extension_array_dtype(left) - and not is_categorical_dtype(left) - and is_extension_array_dtype(right) - and not is_categorical_dtype(right) - ): - assert_extension_array_equal(left.array, right.array) - else: - _testing.assert_almost_equal( - left._internal_get_values(), - right._internal_get_values(), - check_less_precise=check_less_precise, - check_dtype=check_dtype, - obj="{obj}".format(obj=obj), - ) - - # metadata comparison - if check_names: - assert_attr_equal("name", left, right, obj=obj) - - if check_categorical: - if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal( - left.values, right.values, obj="{obj} category".format(obj=obj) - ) - - -# This could be refactored to use the NDFrame.equals method -def assert_frame_equal( - left, - right, - check_dtype=True, - check_index_type="equiv", - check_column_type="equiv", - check_frame_type=True, - check_less_precise=False, - check_names=True, - by_blocks=False, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - check_like=False, - obj="DataFrame", -): - """ - Check that left and right DataFrame are equal. - - This function is intended to compare two DataFrames and output any - differences. 
It is mostly intended for use in unit tests. - Additional parameters allow varying the strictness of the - equality checks performed. - - Parameters - ---------- - left : DataFrame - First DataFrame to compare. - right : DataFrame - Second DataFrame to compare. - check_dtype : bool, default True - Whether to check the DataFrame dtype is identical. - check_index_type : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. - check_column_type : bool or {'equiv'}, default 'equiv' - Whether to check the columns class, dtype and inferred_type - are identical. Is passed as the ``exact`` argument of - :func:`assert_index_equal`. - check_frame_type : bool, default True - Whether to check the DataFrame class is identical. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - - When comparing two numbers, if the first number has magnitude less - than 1e-5, we compare the two numbers directly and check whether - they are equivalent within the specified precision. Otherwise, we - compare the **ratio** of the second number to the first number and - check whether it is equivalent to 1 within the specified precision. - check_names : bool, default True - Whether to check that the `names` attribute for both the `index` - and `column` attributes of the DataFrame is identical, i.e. - - * left.index.names == right.index.names - * left.columns.names == right.columns.names - by_blocks : bool, default False - Specify how to compare internal data. If False, compare by columns. - If True, compare by blocks. - check_exact : bool, default False - Whether to compare number exactly. - check_datetimelike_compat : bool, default False - Compare datetime-like which is comparable ignoring dtype. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - check_like : bool, default False - If True, ignore the order of index & columns. - Note: index labels must match their respective rows - (same as in columns) - same labels must be with the same data. - obj : str, default 'DataFrame' - Specify object name being compared, internally used to show appropriate - assertion message. - - See Also - -------- - assert_series_equal : Equivalent method for asserting Series equality. - DataFrame.equals : Check DataFrame equality. - - Examples - -------- - This example shows comparing two DataFrames that are equal - but with columns of differing dtypes. - - >>> from pandas.util.testing import assert_frame_equal - >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) - >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) - - df1 equals itself. - - >>> assert_frame_equal(df1, df1) - - df1 differs from df2 as column 'b' is of a different type. - - >>> assert_frame_equal(df1, df2) - Traceback (most recent call last): - ... - AssertionError: Attributes of DataFrame.iloc[:, 1] are different - - Attribute "dtype" are different - [left]: int64 - [right]: float64 - - Ignore differing dtypes in columns with check_dtype.
- - >>> assert_frame_equal(df1, df2, check_dtype=False) - """ - __tracebackhide__ = True - - # instance validation - _check_isinstance(left, right, DataFrame) - - if check_frame_type: - assert isinstance(left, type(right)) - # assert_class_equal(left, right, obj=obj) - - # shape comparison - if left.shape != right.shape: - raise_assert_detail( - obj, - "{obj} shape mismatch".format(obj=obj), - "{shape!r}".format(shape=left.shape), - "{shape!r}".format(shape=right.shape), - ) - - if check_like: - left, right = left.reindex_like(right), right - - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj="{obj}.index".format(obj=obj), - ) - - # column comparison - assert_index_equal( - left.columns, - right.columns, - exact=check_column_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj="{obj}.columns".format(obj=obj), - ) - - # compare by blocks - if by_blocks: - rblocks = right._to_dict_of_blocks() - lblocks = left._to_dict_of_blocks() - for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): - assert dtype in lblocks - assert dtype in rblocks - assert_frame_equal( - lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj - ) - - # compare by columns - else: - for i, col in enumerate(left.columns): - assert col in right - lcol = left.iloc[:, i] - rcol = right.iloc[:, i] - assert_series_equal( - lcol, - rcol, - check_dtype=check_dtype, - check_index_type=check_index_type, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_names=check_names, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - obj="{obj}.iloc[:, {idx}]".format(obj=obj, idx=i), - ) - - -def assert_equal(left, right, **kwargs): - """ - Wrapper for tm.assert_*_equal to dispatch to the appropriate test function. - - Parameters - ---------- - left : Index, Series, DataFrame, ExtensionArray, or np.ndarray - right : Index, Series, DataFrame, ExtensionArray, or np.ndarray - **kwargs - """ - __tracebackhide__ = True - - if isinstance(left, pd.Index): - assert_index_equal(left, right, **kwargs) - elif isinstance(left, pd.Series): - assert_series_equal(left, right, **kwargs) - elif isinstance(left, pd.DataFrame): - assert_frame_equal(left, right, **kwargs) - elif isinstance(left, IntervalArray): - assert_interval_array_equal(left, right, **kwargs) - elif isinstance(left, PeriodArray): - assert_period_array_equal(left, right, **kwargs) - elif isinstance(left, DatetimeArray): - assert_datetime_array_equal(left, right, **kwargs) - elif isinstance(left, TimedeltaArray): - assert_timedelta_array_equal(left, right, **kwargs) - elif isinstance(left, ExtensionArray): - assert_extension_array_equal(left, right, **kwargs) - elif isinstance(left, np.ndarray): - assert_numpy_array_equal(left, right, **kwargs) - elif isinstance(left, str): - assert kwargs == {} - return left == right - else: - raise NotImplementedError(type(left)) - - -def box_expected(expected, box_cls, transpose=True): - """ - Helper function to wrap the expected output of a test in a given box_class. 
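# Sketch of the assert_equal dispatcher defined above: one entry point that routes
# each input type to the matching assert_*_equal helper. A sketch only; assumes
# pandas.util.testing from this changeset.
import numpy as np
import pandas as pd
import pandas.util.testing as tm

tm.assert_equal(pd.Index([1, 2]), pd.Index([1, 2]))            # -> assert_index_equal
tm.assert_equal(pd.Series([1.0, 2.0]), pd.Series([1.0, 2.0]))  # -> assert_series_equal
tm.assert_equal(np.array([1, 2]), np.array([1, 2]))            # -> assert_numpy_array_equal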
- - Parameters - ---------- - expected : np.ndarray, Index, Series - box_cls : {Index, Series, DataFrame} - - Returns - ------- - subclass of box_cls - """ - if box_cls is pd.Index: - expected = pd.Index(expected) - elif box_cls is pd.Series: - expected = pd.Series(expected) - elif box_cls is pd.DataFrame: - expected = pd.Series(expected).to_frame() - if transpose: - # for vector operations, we need a DataFrame to be a single-row, - # not a single-column, in order to operate against non-DataFrame - # vectors of the same length. - expected = expected.T - elif box_cls is PeriodArray: - # the PeriodArray constructor is not as flexible as period_array - expected = period_array(expected) - elif box_cls is DatetimeArray: - expected = DatetimeArray(expected) - elif box_cls is TimedeltaArray: - expected = TimedeltaArray(expected) - elif box_cls is np.ndarray: - expected = np.array(expected) - elif box_cls is to_array: - expected = to_array(expected) - else: - raise NotImplementedError(box_cls) - return expected - - -def to_array(obj): - # temporary implementation until we get pd.array in place - if is_period_dtype(obj): - return period_array(obj) - elif is_datetime64_dtype(obj) or is_datetime64tz_dtype(obj): - return DatetimeArray._from_sequence(obj) - elif is_timedelta64_dtype(obj): - return TimedeltaArray._from_sequence(obj) - else: - return np.array(obj) - - -# ----------------------------------------------------------------------------- -# Sparse - - -def assert_sp_array_equal( - left, - right, - check_dtype=True, - check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, -): - """Check that the left and right SparseArray are equal. - - Parameters - ---------- - left : SparseArray - right : SparseArray - check_dtype : bool, default True - Whether to check the data dtype is identical. - check_kind : bool, default True - Whether to check just the kind of the sparse index for each column. - check_fill_value : bool, default True - Whether to check that left.fill_value matches right.fill_value - consolidate_block_indices : bool, default False - Whether to consolidate contiguous blocks for sparse arrays with - a BlockIndex. Some operations, e.g. concat, will end up with - block indices that could be consolidated. Setting this to true will - create a new BlockIndex for that array, with consolidated - block indices. - """ - - _check_isinstance(left, right, pd.SparseArray) - - assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype) - - # SparseIndex comparison - assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex) - assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex) - - if not check_kind: - left_index = left.sp_index.to_block_index() - right_index = right.sp_index.to_block_index() - else: - left_index = left.sp_index - right_index = right.sp_index - - if consolidate_block_indices and left.kind == "block": - # we'll probably remove this hack...
- left_index = left_index.to_int_index().to_block_index() - right_index = right_index.to_int_index().to_block_index() - - if not left_index.equals(right_index): - raise_assert_detail( - "SparseArray.index", "index are not equal", left_index, right_index - ) - else: - # Just ensure a - pass - - if check_fill_value: - assert_attr_equal("fill_value", left, right) - if check_dtype: - assert_attr_equal("dtype", left, right) - assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) - - -# ----------------------------------------------------------------------------- -# Others - - -def assert_contains_all(iterable, dic): - for k in iterable: - assert k in dic, "Did not contain item: '{key!r}'".format(key=k) - - -def assert_copy(iter1, iter2, **eql_kwargs): - """ - iter1, iter2: iterables that produce elements - comparable with assert_almost_equal - - Checks that the elements are equal, but not - the same object. (Does not check that items - in sequences are also not the same object) - """ - for elem1, elem2 in zip(iter1, iter2): - assert_almost_equal(elem1, elem2, **eql_kwargs) - msg = ( - "Expected object {obj1!r} and object {obj2!r} to be " - "different objects, but they were the same object." - ).format(obj1=type(elem1), obj2=type(elem2)) - assert elem1 is not elem2, msg - - -def getCols(k): - return string.ascii_uppercase[:k] - - -# make index -def makeStringIndex(k=10, name=None): - return Index(rands_array(nchars=10, size=k), name=name) - - -def makeUnicodeIndex(k=10, name=None): - return Index(randu_array(nchars=10, size=k), name=name) - - -def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): - """ make a length k index or n categories """ - x = rands_array(nchars=4, size=n) - return CategoricalIndex( - Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs - ) - - -def makeIntervalIndex(k=10, name=None, **kwargs): - """ make a length k IntervalIndex """ - x = np.linspace(0, 100, num=(k + 1)) - return IntervalIndex.from_breaks(x, name=name, **kwargs) - - -def makeBoolIndex(k=10, name=None): - if k == 1: - return Index([True], name=name) - elif k == 2: - return Index([False, True], name=name) - return Index([False, True] + [False] * (k - 2), name=name) - - -def makeIntIndex(k=10, name=None): - return Index(list(range(k)), name=name) - - -def makeUIntIndex(k=10, name=None): - return Index([2 ** 63 + i for i in range(k)], name=name) - - -def makeRangeIndex(k=10, name=None, **kwargs): - return RangeIndex(0, k, 1, name=name, **kwargs) - - -def makeFloatIndex(k=10, name=None): - values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) - return Index(values * (10 ** np.random.randint(0, 9)), name=name) - - -def makeDateIndex(k=10, freq="B", name=None, **kwargs): - dt = datetime(2000, 1, 1) - dr = bdate_range(dt, periods=k, freq=freq, name=name) - return DatetimeIndex(dr, name=name, **kwargs) - - -def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): - return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs) - - -def makePeriodIndex(k=10, name=None, **kwargs): - dt = datetime(2000, 1, 1) - dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) - return dr - - -def makeMultiIndex(k=10, names=None, **kwargs): - return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs) - - -_names = [ - "Alice", - "Bob", - "Charlie", - "Dan", - "Edith", - "Frank", - "George", - "Hannah", - "Ingrid", - "Jerry", - "Kevin", - "Laura", - "Michael", - "Norbert", - "Oliver", - 
"Patricia", - "Quinn", - "Ray", - "Sarah", - "Tim", - "Ursula", - "Victor", - "Wendy", - "Xavier", - "Yvonne", - "Zelda", -] - - -def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None): - """ - Make a DataFrame with a DatetimeIndex - - Parameters - ---------- - start : str or Timestamp, default "2000-01-01" - The start of the index. Passed to date_range with `freq`. - end : str or Timestamp, default "2000-12-31" - The end of the index. Passed to date_range with `freq`. - freq : str or Freq - The frequency to use for the DatetimeIndex - seed : int, optional - The random state seed. - - * name : object dtype with string names - * id : int dtype with - * x, y : float dtype - - Examples - -------- - >>> _make_timeseries() - id name x y - timestamp - 2000-01-01 982 Frank 0.031261 0.986727 - 2000-01-02 1025 Edith -0.086358 -0.032920 - 2000-01-03 982 Edith 0.473177 0.298654 - 2000-01-04 1009 Sarah 0.534344 -0.750377 - 2000-01-05 963 Zelda -0.271573 0.054424 - ... ... ... ... ... - 2000-12-27 980 Ingrid -0.132333 -0.422195 - 2000-12-28 972 Frank -0.376007 -0.298687 - 2000-12-29 1009 Ursula -0.865047 -0.503133 - 2000-12-30 1000 Hannah -0.063757 -0.507336 - 2000-12-31 972 Tim -0.869120 0.531685 - """ - index = pd.date_range(start=start, end=end, freq=freq, name="timestamp") - n = len(index) - state = np.random.RandomState(seed) - columns = { - "name": state.choice(_names, size=n), - "id": state.poisson(1000, size=n), - "x": state.rand(n) * 2 - 1, - "y": state.rand(n) * 2 - 1, - } - df = pd.DataFrame(columns, index=index, columns=sorted(columns)) - if df.index[-1] == end: - df = df.iloc[:-1] - return df - - -def all_index_generator(k=10): - """Generator which can be iterated over to get instances of all the various - index classes. - - Parameters - ---------- - k: length of each of the index instances - """ - all_make_index_funcs = [ - makeIntIndex, - makeFloatIndex, - makeStringIndex, - makeUnicodeIndex, - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeBoolIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - ] - for make_index_func in all_make_index_funcs: - yield make_index_func(k=k) - - -def index_subclass_makers_generator(): - make_index_funcs = [ - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - makeMultiIndex, - ] - for make_index_func in make_index_funcs: - yield make_index_func - - -def all_timeseries_index_generator(k=10): - """Generator which can be iterated over to get instances of all the classes - which represent time-series. 
- - Parameters - ---------- - k: length of each of the index instances - """ - make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex] - for make_index_func in make_index_funcs: - yield make_index_func(k=k) - - -# make series -def makeFloatSeries(name=None): - index = makeStringIndex(N) - return Series(randn(N), index=index, name=name) - - -def makeStringSeries(name=None): - index = makeStringIndex(N) - return Series(randn(N), index=index, name=name) - - -def makeObjectSeries(name=None): - data = makeStringIndex(N) - data = Index(data, dtype=object) - index = makeStringIndex(N) - return Series(data, index=index, name=name) - - -def getSeriesData(): - index = makeStringIndex(N) - return {c: Series(randn(N), index=index) for c in getCols(K)} - - -def makeTimeSeries(nper=None, freq="B", name=None): - if nper is None: - nper = N - return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name) - - -def makePeriodSeries(nper=None, name=None): - if nper is None: - nper = N - return Series(randn(nper), index=makePeriodIndex(nper), name=name) - - -def getTimeSeriesData(nper=None, freq="B"): - return {c: makeTimeSeries(nper, freq) for c in getCols(K)} - - -def getPeriodData(nper=None): - return {c: makePeriodSeries(nper) for c in getCols(K)} - - -# make frame -def makeTimeDataFrame(nper=None, freq="B"): - data = getTimeSeriesData(nper, freq) - return DataFrame(data) - - -def makeDataFrame(): - data = getSeriesData() - return DataFrame(data) - - -def getMixedTypeDict(): - index = Index(["a", "b", "c", "d", "e"]) - - data = { - "A": [0.0, 1.0, 2.0, 3.0, 4.0], - "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], - "D": bdate_range("1/1/2009", periods=5), - } - - return index, data - - -def makeMixedDataFrame(): - return DataFrame(getMixedTypeDict()[1]) - - -def makePeriodFrame(nper=None): - data = getPeriodData(nper) - return DataFrame(data) - - -def makeCustomIndex( - nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None -): - """Create an index/multiindex with given dimensions, levels, names, etc. - - nentries - number of entries in index - nlevels - number of levels (> 1 produces multiindex) - prefix - a string prefix for labels - names - (Optional), bool or list of strings. if True will use default - names, if false will use no names, if a list is given, the name of - each level in the index will be taken from the list. - ndupe_l - (Optional), list of ints, the number of rows for which the - label will be repeated at the corresponding level, you can specify just - the first few, the rest will use the default ndupe_l of 1. - len(ndupe_l) <= nlevels. - idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td". - If idx_type is not None, `idx_nlevels` must be 1. - "i"/"f" creates an integer/float index, - "s"/"u" creates a string/unicode index - "dt" creates a datetime index. - "td" creates a timedelta index. - - if unspecified, string labels will be generated.
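# Sketch of makeCustomIndex as documented above: a two-level MultiIndex with six
# entries, default "#0"/"#1" level names, and the first level repeating each label
# twice. A sketch only; assumes pandas.util.testing from this changeset.
import pandas.util.testing as tm

mi = tm.makeCustomIndex(nentries=6, nlevels=2, names=True, ndupe_l=[2])
assert mi.nlevels == 2
assert list(mi.names) == ["#0", "#1"]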
- """ - - if ndupe_l is None: - ndupe_l = [1] * nlevels - assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels - assert names is None or names is False or names is True or len(names) is nlevels - assert idx_type is None or ( - idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 - ) - - if names is True: - # build default names - names = [prefix + str(i) for i in range(nlevels)] - if names is False: - # pass None to index constructor for no name - names = None - - # make singleton case uniform - if isinstance(names, str) and nlevels == 1: - names = [names] - - # specific 1D index type requested? - idx_func = dict( - i=makeIntIndex, - f=makeFloatIndex, - s=makeStringIndex, - u=makeUnicodeIndex, - dt=makeDateIndex, - td=makeTimedeltaIndex, - p=makePeriodIndex, - ).get(idx_type) - if idx_func: - idx = idx_func(nentries) - # but we need to fill in the name - if names: - idx.name = names[0] - return idx - elif idx_type is not None: - raise ValueError( - '"{idx_type}" is not a legal value for `idx_type`, ' - 'use "i"/"f"/"s"/"u"/"dt/"p"/"td".'.format(idx_type=idx_type) - ) - - if len(ndupe_l) < nlevels: - ndupe_l.extend([1] * (nlevels - len(ndupe_l))) - assert len(ndupe_l) == nlevels - - assert all(x > 0 for x in ndupe_l) - - tuples = [] - for i in range(nlevels): - - def keyfunc(x): - import re - - numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") - return [int(num) for num in numeric_tuple] - - # build a list of lists to create the index from - div_factor = nentries // ndupe_l[i] + 1 - cnt = Counter() - for j in range(div_factor): - label = "{prefix}_l{i}_g{j}".format(prefix=prefix, i=i, j=j) - cnt[label] = ndupe_l[i] - # cute Counter trick - result = list(sorted(cnt.elements(), key=keyfunc))[:nentries] - tuples.append(result) - - tuples = list(zip(*tuples)) - - # convert tuples to index - if nentries == 1: - # we have a single level of tuples, i.e. a regular Index - index = Index(tuples[0], name=names[0]) - elif nlevels == 1: - name = None if names is None else names[0] - index = Index((x[0] for x in tuples), name=name) - else: - index = MultiIndex.from_tuples(tuples, names=names) - return index - - -def makeCustomDataframe( - nrows, - ncols, - c_idx_names=True, - r_idx_names=True, - c_idx_nlevels=1, - r_idx_nlevels=1, - data_gen_f=None, - c_ndupe_l=None, - r_ndupe_l=None, - dtype=None, - c_idx_type=None, - r_idx_type=None, -): - """ - nrows, ncols - number of data rows/cols - c_idx_names, idx_names - False/True/list of strings, yields No names , - default names or uses the provided names for the levels of the - corresponding index. You can provide a single string when - c_idx_nlevels ==1. - c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex - r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex - data_gen_f - a function f(row,col) which return the data value - at that position, the default generator used yields values of the form - "RxCy" based on position. - c_ndupe_l, r_ndupe_l - list of integers, determines the number - of duplicates for each label at a given level of the corresponding - index. The default `None` value produces a multiplicity of 1 across - all levels, i.e. a unique index. Will accept a partial list of length - N < idx_nlevels, for just the first N levels. If ndupe doesn't divide - nrows/ncol, the last label might have lower multiplicity. 
- dtype - passed to the DataFrame constructor as is, in case you wish to - have more control in conjunction with a custom `data_gen_f` - r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td". - If idx_type is not None, `idx_nlevels` must be 1. - "i"/"f" creates an integer/float index, - "s"/"u" creates a string/unicode index - "dt" create a datetime index. - "td" create a timedelta index. - - if unspecified, string labels will be generated. - - Examples: - - # 5 row, 3 columns, default names on both, single index on both axis - >> makeCustomDataframe(5,3) - - # make the data a random int between 1 and 100 - >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100)) - - # 2-level multiindex on rows with each label duplicated - # twice on first level, default names on both axis, single - # index on both axis - >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2]) - - # DatetimeIndex on row, index with unicode labels on columns - # no names on either axis - >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False, - r_idx_type="dt",c_idx_type="u") - - # 4-level multindex on rows with names provided, 2-level multindex - # on columns with default labels and default names. - >> a=makeCustomDataframe(5,3,r_idx_nlevels=4, - r_idx_names=["FEE","FI","FO","FAM"], - c_idx_nlevels=2) - - >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) - """ - - assert c_idx_nlevels > 0 - assert r_idx_nlevels > 0 - assert r_idx_type is None or ( - r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1 - ) - assert c_idx_type is None or ( - c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1 - ) - - columns = makeCustomIndex( - ncols, - nlevels=c_idx_nlevels, - prefix="C", - names=c_idx_names, - ndupe_l=c_ndupe_l, - idx_type=c_idx_type, - ) - index = makeCustomIndex( - nrows, - nlevels=r_idx_nlevels, - prefix="R", - names=r_idx_names, - ndupe_l=r_ndupe_l, - idx_type=r_idx_type, - ) - - # by default, generate data based on location - if data_gen_f is None: - data_gen_f = lambda r, c: "R{rows}C{cols}".format(rows=r, cols=c) - - data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)] - - return DataFrame(data, index, columns, dtype=dtype) - - -def _create_missing_idx(nrows, ncols, density, random_state=None): - if random_state is None: - random_state = np.random - else: - random_state = np.random.RandomState(random_state) - - # below is cribbed from scipy.sparse - size = int(np.round((1 - density) * nrows * ncols)) - # generate a few more to ensure unique values - min_rows = 5 - fac = 1.02 - extra_size = min(size + min_rows, fac * size) - - def _gen_unique_rand(rng, _extra_size): - ind = rng.rand(int(_extra_size)) - return np.unique(np.floor(ind * nrows * ncols))[:size] - - ind = _gen_unique_rand(random_state, extra_size) - while ind.size < size: - extra_size *= 1.05 - ind = _gen_unique_rand(random_state, extra_size) - - j = np.floor(ind * 1.0 / nrows).astype(int) - i = (ind - j * nrows).astype(int) - return i.tolist(), j.tolist() - - -def makeMissingCustomDataframe( - nrows, - ncols, - density=0.9, - random_state=None, - c_idx_names=True, - r_idx_names=True, - c_idx_nlevels=1, - r_idx_nlevels=1, - data_gen_f=None, - c_ndupe_l=None, - r_ndupe_l=None, - dtype=None, - c_idx_type=None, - r_idx_type=None, -): - """ - Parameters - ---------- - Density : float, optional - Float in (0, 1) that gives the percentage of non-missing numbers in - the DataFrame. - random_state : {np.random.RandomState, int}, optional - Random number generator or random seed. 
- - See makeCustomDataframe for descriptions of the rest of the parameters. - """ - df = makeCustomDataframe( - nrows, - ncols, - c_idx_names=c_idx_names, - r_idx_names=r_idx_names, - c_idx_nlevels=c_idx_nlevels, - r_idx_nlevels=r_idx_nlevels, - data_gen_f=data_gen_f, - c_ndupe_l=c_ndupe_l, - r_ndupe_l=r_ndupe_l, - dtype=dtype, - c_idx_type=c_idx_type, - r_idx_type=r_idx_type, - ) - - i, j = _create_missing_idx(nrows, ncols, density, random_state) - df.values[i, j] = np.nan - return df - - -def makeMissingDataframe(density=0.9, random_state=None): - df = makeDataFrame() - i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) - df.values[i, j] = np.nan - return df - - -class TestSubDict(dict): - def __init__(self, *args, **kwargs): - dict.__init__(self, *args, **kwargs) - - -def optional_args(decorator): - """allows a decorator to take optional positional and keyword arguments. - Assumes that taking a single, callable, positional argument means that - it is decorating a function, i.e. something like this:: - - @my_decorator - def function(): pass - - Calls decorator with decorator(f, *args, **kwargs)""" - - @wraps(decorator) - def wrapper(*args, **kwargs): - def dec(f): - return decorator(f, *args, **kwargs) - - is_decorating = not kwargs and len(args) == 1 and callable(args[0]) - if is_decorating: - f = args[0] - args = [] - return dec(f) - else: - return dec - - return wrapper - - -# skip tests on exceptions with this message -_network_error_messages = ( - # 'urlopen error timed out', - # 'timeout: timed out', - # 'socket.timeout: timed out', - "timed out", - "Server Hangup", - "HTTP Error 503: Service Unavailable", - "502: Proxy Error", - "HTTP Error 502: internal error", - "HTTP Error 502", - "HTTP Error 503", - "HTTP Error 403", - "HTTP Error 400", - "Temporary failure in name resolution", - "Name or service not known", - "Connection refused", - "certificate verify", -) - -# or this e.errno/e.reason.errno -_network_errno_vals = ( - 101, # Network is unreachable - 111, # Connection refused - 110, # Connection timed out - 104, # Connection reset Error - 54, # Connection reset by peer - 60, # urllib.error.URLError: [Errno 60] Connection timed out -) - -# Both of the above shouldn't mask real issues such as 404's -# or refused connections (changed DNS). -# But some tests (test_data yahoo) contact incredibly flakey -# servers. - -# and conditionally raise on exception types in _get_default_network_errors - - -def _get_default_network_errors(): - # Lazy import for http.client because it imports many things from the stdlib - import http.client - - return (IOError, http.client.HTTPException, TimeoutError) - - -def can_connect(url, error_classes=None): - """Try to connect to the given url. 
True if succeeds, False if IOError - raised - - Parameters - ---------- - url : basestring - The URL to try to connect to - - Returns - ------- - connectable : bool - Return True if no IOError (unable to connect) or URLError (bad url) was - raised - """ - - if error_classes is None: - error_classes = _get_default_network_errors() - - try: - with urlopen(url): - pass - except error_classes: - return False - else: - return True - - -@optional_args -def network( - t, - url="http://www.google.com", - raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, - check_before_test=False, - error_classes=None, - skip_errnos=_network_errno_vals, - _skip_on_messages=_network_error_messages, -): - """ - Label a test as requiring network connection and, if an error is - encountered, only raise if it does not find a network connection. - - In comparison to ``network``, this assumes an added contract to your test: - you must assert that, under normal conditions, your test will ONLY fail if - it does not have network connectivity. - - You can call this in 3 ways: as a standard decorator, with keyword - arguments, or with a positional argument that is the url to check. - - Parameters - ---------- - t : callable - The test requiring network connectivity. - url : path - The url to test via ``pandas.io.common.urlopen`` to check - for connectivity. Defaults to 'http://www.google.com'. - raise_on_error : bool - If True, never catches errors. - check_before_test : bool - If True, checks connectivity before running the test case. - error_classes : tuple or Exception - error classes to ignore. If not in ``error_classes``, raises the error. - defaults to IOError. Be careful about changing the error classes here. - skip_errnos : iterable of int - Any exception that has .errno or .reason.erno set to one - of these values will be skipped with an appropriate - message. - _skip_on_messages: iterable of string - any exception e for which one of the strings is - a substring of str(e) will be skipped with an appropriate - message. Intended to suppress errors where an errno isn't available. - - Notes - ----- - * ``raise_on_error`` supercedes ``check_before_test`` - - Returns - ------- - t : callable - The decorated test ``t``, with checks for connectivity errors. - - Example - ------- - - Tests decorated with @network will fail if it's possible to make a network - connection to another URL (defaults to google.com):: - - >>> from pandas.util.testing import network - >>> from pandas.io.common import urlopen - >>> @network - ... def test_network(): - ... with urlopen("rabbit://bonanza.com"): - ... pass - Traceback - ... - URLError: - - You can specify alternative URLs:: - - >>> @network("http://www.yahoo.com") - ... def test_something_with_yahoo(): - ... raise IOError("Failure Message") - >>> test_something_with_yahoo() - Traceback (most recent call last): - ... - IOError: Failure Message - - If you set check_before_test, it will check the url first and not run the - test on failure:: - - >>> @network("failing://url.blaher", check_before_test=True) - ... def test_something(): - ... print("I ran!") - ... raise ValueError("Failure") - >>> test_something() - Traceback (most recent call last): - ... - - Errors not related to networking will always be raised. 
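# Sketch of the @network decorator described above: a decorated test is skipped,
# rather than failed, when the error matches one of the recognised connectivity
# errnos or messages. The URL is purely illustrative; assumes pytest plus
# pandas.util.testing from this changeset.
import pandas as pd
import pandas.util.testing as tm

@tm.network
def test_read_remote_csv():
    pd.read_csv("https://example.com/data.csv")  # hypothetical endpoint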
- """ - from pytest import skip - - if error_classes is None: - error_classes = _get_default_network_errors() - - t.network = True - - @wraps(t) - def wrapper(*args, **kwargs): - if check_before_test and not raise_on_error: - if not can_connect(url, error_classes): - skip() - try: - return t(*args, **kwargs) - except Exception as err: - errno = getattr(err, "errno", None) - if not errno and hasattr(errno, "reason"): - errno = getattr(err.reason, "errno", None) - - if errno in skip_errnos: - skip( - "Skipping test due to known errno" - " and error {error}".format(error=err) - ) - - e_str = str(err) - - if any(m.lower() in e_str.lower() for m in _skip_on_messages): - skip( - "Skipping test because exception " - "message is known and error {error}".format(error=err) - ) - - if not isinstance(err, error_classes): - raise - - if raise_on_error or can_connect(url, error_classes): - raise - else: - skip( - "Skipping test due to lack of connectivity" - " and error {error}".format(error=err) - ) - - return wrapper - - -with_connectivity_check = network - - -@contextmanager -def assert_produces_warning( - expected_warning=Warning, - filter_level="always", - clear=None, - check_stacklevel=True, - raise_on_extra_warnings=True, -): - """ - Context manager for running code expected to either raise a specific - warning, or not raise any warnings. Verifies that the code raises the - expected warning, and that it does not raise any other unexpected - warnings. It is basically a wrapper around ``warnings.catch_warnings``. - - Parameters - ---------- - expected_warning : {Warning, False, None}, default Warning - The type of Exception raised. ``exception.Warning`` is the base - class for all warnings. To check that no warning is returned, - specify ``False`` or ``None``. - filter_level : str or None, default "always" - Specifies whether warnings are ignored, displayed, or turned - into errors. - Valid values are: - - * "error" - turns matching warnings into exceptions - * "ignore" - discard the warning - * "always" - always emit a warning - * "default" - print the warning the first time it is generated - from each location - * "module" - print the warning the first time it is generated - from each module - * "once" - print the warning the first time it is generated - - clear : str, default None - If not ``None`` then remove any previously raised warnings from - the ``__warningsregistry__`` to ensure that no warning messages are - suppressed by this context manager. If ``None`` is specified, - the ``__warningsregistry__`` keeps track of which warnings have been - shown, and does not show them again. - check_stacklevel : bool, default True - If True, displays the line that called the function containing - the warning to show were the function is called. Otherwise, the - line that implements the function is displayed. - raise_on_extra_warnings : bool, default True - Whether extra warnings not of the type `expected_warning` should - cause the test to fail. - - Examples - -------- - >>> import warnings - >>> with assert_produces_warning(): - ... warnings.warn(UserWarning()) - ... - >>> with assert_produces_warning(False): - ... warnings.warn(RuntimeWarning()) - ... - Traceback (most recent call last): - ... - AssertionError: Caused unexpected warning(s): ['RuntimeWarning']. - >>> with assert_produces_warning(UserWarning): - ... warnings.warn(RuntimeWarning()) - Traceback (most recent call last): - ... - AssertionError: Did not see expected warning of class 'UserWarning'. - - ..warn:: This is *not* thread-safe. 
- """ - __tracebackhide__ = True - - with warnings.catch_warnings(record=True) as w: - - if clear is not None: - # make sure that we are clearing these warnings - # if they have happened before - # to guarantee that we will catch them - if not is_list_like(clear): - clear = [clear] - for m in clear: - try: - m.__warningregistry__.clear() - except AttributeError: - # module may not have __warningregistry__ - pass - - saw_warning = False - warnings.simplefilter(filter_level) - yield w - extra_warnings = [] - - for actual_warning in w: - if expected_warning and issubclass( - actual_warning.category, expected_warning - ): - saw_warning = True - - if check_stacklevel and issubclass( - actual_warning.category, (FutureWarning, DeprecationWarning) - ): - from inspect import getframeinfo, stack - - caller = getframeinfo(stack()[2][0]) - msg = ( - "Warning not set with correct stacklevel. " - "File where warning is raised: {actual} != " - "{caller}. Warning message: {message}" - ).format( - actual=actual_warning.filename, - caller=caller.filename, - message=actual_warning.message, - ) - assert actual_warning.filename == caller.filename, msg - else: - extra_warnings.append( - ( - actual_warning.category.__name__, - actual_warning.message, - actual_warning.filename, - actual_warning.lineno, - ) - ) - if expected_warning: - msg = "Did not see expected warning of class {name!r}.".format( - name=expected_warning.__name__ - ) - assert saw_warning, msg - if raise_on_extra_warnings and extra_warnings: - raise AssertionError( - "Caused unexpected warning(s): {!r}.".format(extra_warnings) - ) - - -class RNGContext: - """ - Context manager to set the numpy random number generator speed. Returns - to the original value upon exiting the context manager. - - Parameters - ---------- - seed : int - Seed for numpy.random.seed - - Examples - -------- - - with RNGContext(42): - np.random.randn() - """ - - def __init__(self, seed): - self.seed = seed - - def __enter__(self): - - self.start_state = np.random.get_state() - np.random.seed(self.seed) - - def __exit__(self, exc_type, exc_value, traceback): - - np.random.set_state(self.start_state) - - -@contextmanager -def with_csv_dialect(name, **kwargs): - """ - Context manager to temporarily register a CSV dialect for parsing CSV. - - Parameters - ---------- - name : str - The name of the dialect. - kwargs : mapping - The parameters for the dialect. - - Raises - ------ - ValueError : the name of the dialect conflicts with a builtin one. - - See Also - -------- - csv : Python's CSV library. - """ - import csv - - _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"} - - if name in _BUILTIN_DIALECTS: - raise ValueError("Cannot override builtin dialect.") - - csv.register_dialect(name, **kwargs) - yield - csv.unregister_dialect(name) - - -@contextmanager -def use_numexpr(use, min_elements=None): - from pandas.core.computation import expressions as expr - - if min_elements is None: - min_elements = expr._MIN_ELEMENTS - - olduse = expr._USE_NUMEXPR - oldmin = expr._MIN_ELEMENTS - expr.set_use_numexpr(use) - expr._MIN_ELEMENTS = min_elements - yield - expr._MIN_ELEMENTS = oldmin - expr.set_use_numexpr(olduse) - - -def test_parallel(num_threads=2, kwargs_list=None): - """Decorator to run the same function multiple times in parallel. - - Parameters - ---------- - num_threads : int, optional - The number of times the function is run in parallel. - kwargs_list : list of dicts, optional - The list of kwargs to update original - function kwargs on different threads. 
- Notes - ----- - This decorator does not pass the return value of the decorated function. - - Original from scikit-image: - - https://github.com/scikit-image/scikit-image/pull/1519 - - """ - - assert num_threads > 0 - has_kwargs_list = kwargs_list is not None - if has_kwargs_list: - assert len(kwargs_list) == num_threads - import threading - - def wrapper(func): - @wraps(func) - def inner(*args, **kwargs): - if has_kwargs_list: - update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) - else: - update_kwargs = lambda i: kwargs - threads = [] - for i in range(num_threads): - updated_kwargs = update_kwargs(i) - thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) - threads.append(thread) - for thread in threads: - thread.start() - for thread in threads: - thread.join() - - return inner - - return wrapper - - -class SubclassedSeries(Series): - _metadata = ["testattr", "name"] - - @property - def _constructor(self): - return SubclassedSeries - - @property - def _constructor_expanddim(self): - return SubclassedDataFrame - - -class SubclassedDataFrame(DataFrame): - _metadata = ["testattr"] - - @property - def _constructor(self): - return SubclassedDataFrame - - @property - def _constructor_sliced(self): - return SubclassedSeries - - -class SubclassedCategorical(Categorical): - @property - def _constructor(self): - return SubclassedCategorical - - -@contextmanager -def set_timezone(tz): - """Context manager for temporarily setting a timezone. - - Parameters - ---------- - tz : str - A string representing a valid timezone. - - Examples - -------- - - >>> from datetime import datetime - >>> from dateutil.tz import tzlocal - >>> tzlocal().tzname(datetime.now()) - 'IST' - - >>> with set_timezone('US/Eastern'): - ... tzlocal().tzname(datetime.now()) - ... - 'EDT' - """ - - import os - import time - - def setTZ(tz): - if tz is None: - try: - del os.environ["TZ"] - except KeyError: - pass - else: - os.environ["TZ"] = tz - time.tzset() - - orig_tz = os.environ.get("TZ") - setTZ(tz) - try: - yield - finally: - setTZ(orig_tz) - - -def _make_skipna_wrapper(alternative, skipna_alternative=None): - """Create a function for calling on an array. - - Parameters - ---------- - alternative : function - The function to be called on the array with no NaNs. - Only used when 'skipna_alternative' is None. - skipna_alternative : function - The function to be called on the original array - - Returns - ------- - skipna_wrapper : function - """ - if skipna_alternative: - - def skipna_wrapper(x): - return skipna_alternative(x.values) - - else: - - def skipna_wrapper(x): - nona = x.dropna() - if len(nona) == 0: - return np.nan - return alternative(nona) - - return skipna_wrapper - - -def convert_rows_list_to_csv_str(rows_list): - """ - Convert list of CSV rows to single CSV-formatted string for current OS. - - This method is used for creating expected value of to_csv() method. - - Parameters - ---------- - rows_list : list - The list of string. Each element represents the row of csv. 
- - Returns - ------- - expected : string - Expected output of to_csv() in current OS - """ - sep = os.linesep - expected = sep.join(rows_list) + sep - return expected diff --git a/pyproject.toml b/pyproject.toml index 2ec4739c2f7f8..28d7c3d55c919 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,10 +5,28 @@ requires = [ "setuptools", "wheel", "Cython>=0.29.13", # Note: sync with setup.py - "numpy==1.13.3; python_version=='3.5' and platform_system!='AIX'", "numpy==1.13.3; python_version=='3.6' and platform_system!='AIX'", "numpy==1.14.5; python_version>='3.7' and platform_system!='AIX'", - "numpy==1.16.0; python_version=='3.5' and platform_system=='AIX'", "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'", "numpy==1.16.0; python_version>='3.7' and platform_system=='AIX'", ] + +[tool.black] +target-version = ['py36', 'py37', 'py38'] +exclude = ''' +( + asv_bench/env + | \.egg + | \.git + | \.hg + | \.mypy_cache + | \.nox + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + | setup.py +) +''' diff --git a/requirements-dev.txt b/requirements-dev.txt index 8a9974d393297..017e6258d9941 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,15 +1,18 @@ +# This file is auto-generated from environment.yml, do not modify. +# See that file for comments about the need/usage of each dependency. + numpy>=1.15 python-dateutil>=2.6.1 pytz asv cython>=0.29.13 -black +black==19.10b0 cpplint flake8 -flake8-comprehensions +flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 isort -mypy +mypy==0.730 pycodestyle gitpython sphinx @@ -30,10 +33,10 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto -pytest>=4.0.2 +pytest>=5.0.1 pytest-cov -pytest-mock -pytest-xdist +pytest-xdist>=1.21 +pytest-asyncio seaborn statsmodels ipywidgets @@ -43,26 +46,28 @@ pip blosc bottleneck>=1.2.1 ipykernel -ipython>=5.6.0 +ipython>=7.11.1 jinja2 matplotlib>=2.2.2 numexpr>=2.6.8 scipy>=1.1 +numba>=0.46.0 beautifulsoup4>=4.6.0 -fastparquet>=0.2.1 html5lib lxml -openpyxl +openpyxl<=3.0.1 +xlrd +xlsxwriter +xlwt +odfpy +fastparquet>=0.3.2 pyarrow>=0.13.1 +python-snappy pyqt5>=5.9.2 tables>=3.4.2 -python-snappy s3fs sqlalchemy xarray -xlrd -xlsxwriter -xlwt -odfpy pyreadstat +tabulate>=0.8.3 git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master \ No newline at end of file diff --git a/scripts/download_wheels.py b/scripts/download_wheels.py index 4ca1354321134..3d36eed2d888a 100644 --- a/scripts/download_wheels.py +++ b/scripts/download_wheels.py @@ -26,7 +26,7 @@ def fetch(version): files = [ x for x in root.xpath("//a/text()") - if x.startswith("pandas-{}".format(version)) and not dest.joinpath(x).exists() + if x.startswith(f"pandas-{version}") and not dest.joinpath(x).exists() ] N = len(files) @@ -35,9 +35,7 @@ def fetch(version): out = str(dest.joinpath(filename)) link = urllib.request.urljoin(base, filename) urllib.request.urlretrieve(link, out) - print( - "Downloaded {link} to {out} [{i}/{N}]".format(link=link, out=out, i=i, N=N) - ) + print(f"Downloaded {link} to {out} [{i}/{N}]") def main(args=None): diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index f1c7c3298fb26..53a27e8782ad7 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -19,7 +19,7 @@ import yaml -EXCLUDE = {"python=3"} +EXCLUDE = {"python"} RENAME = {"pytables": "tables", "pyqt": "pyqt5", "dask-core": "dask"} @@ -33,15 +33,15 @@ def conda_package_to_pip(package): - A package requiring a specific version, in conda is 
defined with a single equal (e.g. ``pandas=1.0``) and in pip with two (e.g. ``pandas==1.0``) """ - if package in EXCLUDE: - return - package = re.sub("(?<=[^<>])=", "==", package).strip() + for compare in ("<=", ">=", "=="): if compare not in package: continue pkg, version = package.split(compare) + if pkg in EXCLUDE: + return if pkg in RENAME: return "".join((RENAME[pkg], compare, version)) @@ -87,9 +87,14 @@ def main(conda_fname, pip_fname, compare=False): elif isinstance(dep, dict) and len(dep) == 1 and "pip" in dep: pip_deps += dep["pip"] else: - raise ValueError("Unexpected dependency {}".format(dep)) + raise ValueError(f"Unexpected dependency {dep}") - pip_content = "\n".join(pip_deps) + fname = os.path.split(conda_fname)[1] + header = ( + f"# This file is auto-generated from {fname}, do not modify.\n" + "# See that file for comments about the need/usage of each dependency.\n\n" + ) + pip_content = header + "\n".join(pip_deps) if compare: with open(pip_fname) as pip_fd: @@ -122,13 +127,13 @@ def main(conda_fname, pip_fname, compare=False): ) if res: msg = ( - "`requirements-dev.txt` has to be generated with `{}` after " - "`environment.yml` is modified.\n".format(sys.argv[0]) + f"`requirements-dev.txt` has to be generated with `{sys.argv[0]}` after " + "`environment.yml` is modified.\n" ) if args.azure: msg = ( "##vso[task.logissue type=error;" - "sourcepath=requirements-dev.txt]{}".format(msg) + f"sourcepath=requirements-dev.txt]{msg}" ) sys.stderr.write(msg) sys.exit(res) diff --git a/scripts/list_future_warnings.sh b/scripts/list_future_warnings.sh index 0c4046bbb5f49..121f4f5a92abb 100755 --- a/scripts/list_future_warnings.sh +++ b/scripts/list_future_warnings.sh @@ -25,7 +25,7 @@ EXCLUDE="^pandas/tests/|" # tests validate that FutureWarnings are raised EXCLUDE+="^pandas/util/_decorators.py$|" # generic deprecate function that raises warning EXCLUDE+="^pandas/util/_depr_module.py$|" # generic deprecate module that raises warnings -EXCLUDE+="^pandas/util/testing.py$|" # contains function to evaluate if warning is raised +EXCLUDE+="^pandas._testing.py$|" # contains function to evaluate if warning is raised EXCLUDE+="^pandas/io/parsers.py$" # implements generic deprecation system in io reading BASE_DIR="$(dirname $0)/.." diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 1506acc95edf9..a1bccb1dd1629 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -719,7 +719,7 @@ def no_type(self): def no_description(self): """ - Provides type but no descrption. + Provides type but no description. 
Returns ------- @@ -1300,7 +1300,7 @@ def test_resolves_class_name(self, name, expected_obj): @pytest.mark.parametrize("invalid_name", ["panda", "panda.DataFrame"]) def test_raises_for_invalid_module_name(self, invalid_name): - msg = 'No module can be imported from "{}"'.format(invalid_name) + msg = f'No module can be imported from "{invalid_name}"' with pytest.raises(ImportError, match=msg): validate_docstrings.Docstring(invalid_name) @@ -1310,7 +1310,7 @@ def test_raises_for_invalid_module_name(self, invalid_name): def test_raises_for_invalid_attribute_name(self, invalid_name): name_components = invalid_name.split(".") obj_name, invalid_attr_name = name_components[-2], name_components[-1] - msg = "'{}' has no attribute '{}'".format(obj_name, invalid_attr_name) + msg = f"'{obj_name}' has no attribute '{invalid_attr_name}'" with pytest.raises(AttributeError, match=msg): validate_docstrings.Docstring(invalid_name) diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 1d0f4b583bd0c..bcf3fd5d276f5 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -15,7 +15,6 @@ """ import argparse import ast -import collections import doctest import functools import glob @@ -250,7 +249,7 @@ def __init__(self, name): self.clean_doc = pydoc.getdoc(obj) self.doc = NumpyDocString(self.clean_doc) - def __len__(self): + def __len__(self) -> int: return len(self.raw_doc) @staticmethod @@ -286,7 +285,7 @@ def _load_obj(name): continue if "obj" not in locals(): - raise ImportError("No module can be imported " 'from "{}"'.format(name)) + raise ImportError(f'No module can be imported from "{name}"') for part in func_parts: obj = getattr(obj, part) @@ -358,7 +357,7 @@ def source_file_def_line(self): @property def github_url(self): url = "https://github.com/pandas-dev/pandas/blob/master/" - url += "{}#L{}".format(self.source_file_name, self.source_file_def_line) + url += f"{self.source_file_name}#L{self.source_file_def_line}" return url @property @@ -422,7 +421,7 @@ def needs_summary(self): @property def doc_parameters(self): - parameters = collections.OrderedDict() + parameters = {} for names, type_, desc in self.doc["Parameters"]: for name in names.split(", "): parameters[name] = (type_, "".join(desc)) @@ -502,7 +501,7 @@ def parameter_desc(self, param): desc = self.doc_parameters[param][1] # Find and strip out any sphinx directives for directive in DIRECTIVES: - full_directive = ".. {}".format(directive) + full_directive = f".. 
{directive}" if full_directive in desc: # Only retain any description before the directive desc = desc[: desc.index(full_directive)] @@ -510,7 +509,7 @@ def parameter_desc(self, param): @property def see_also(self): - result = collections.OrderedDict() + result = {} for funcs, desc in self.doc["See Also"]: for func, _ in funcs: result[func] = "".join(desc) @@ -826,14 +825,12 @@ def get_validation_data(doc): "EX03", error_code=err.error_code, error_message=err.message, - times_happening=" ({} times)".format(err.count) - if err.count > 1 - else "", + times_happening=f" ({err.count} times)" if err.count > 1 else "", ) ) examples_source_code = "".join(doc.examples_source_code) for wrong_import in ("numpy", "pandas"): - if "import {}".format(wrong_import) in examples_source_code: + if f"import {wrong_import}" in examples_source_code: errs.append(error("EX04", imported_library=wrong_import)) return errs, wrns, examples_errs @@ -921,7 +918,7 @@ def validate_all(prefix, ignore_deprecated=False): api_item_names = set(list(zip(*api_items))[0]) for class_ in (pandas.Series, pandas.DataFrame): for member in inspect.getmembers(class_): - func_name = "pandas.{}.{}".format(class_.__name__, member[0]) + func_name = f"pandas.{class_.__name__}.{member[0]}" if not member[0].startswith("_") and func_name not in api_item_names: if prefix and not func_name.startswith(prefix): continue @@ -939,13 +936,9 @@ def header(title, width=80, char="#"): full_line = char * width side_len = (width - len(title) - 2) // 2 adj = "" if len(title) % 2 == 0 else " " - title_line = "{side} {title}{adj} {side}".format( - side=char * side_len, title=title, adj=adj - ) + title_line = f"{char * side_len} {title}{adj} {char * side_len}" - return "\n{full_line}\n{title_line}\n{full_line}\n\n".format( - full_line=full_line, title_line=title_line - ) + return f"\n{full_line}\n{title_line}\n{full_line}\n\n" exit_status = 0 if func_name is None: @@ -965,7 +958,7 @@ def header(title, width=80, char="#"): "]{text}\n" ) else: - raise ValueError('Unknown output_format "{}"'.format(output_format)) + raise ValueError(f'Unknown output_format "{output_format}"') output = "" for name, res in result.items(): @@ -977,35 +970,34 @@ def header(title, width=80, char="#"): continue exit_status += 1 output += output_format.format( - name=name, path=res["file"], row=res["file_line"], code=err_code, - text="{}: {}".format(name, err_desc), + text=f"{name}: {err_desc}", ) sys.stdout.write(output) else: result = validate_one(func_name) - sys.stderr.write(header("Docstring ({})".format(func_name))) - sys.stderr.write("{}\n".format(result["docstring"])) + sys.stderr.write(header(f"Docstring ({func_name})")) + sys.stderr.write(f"{result['docstring']}\n") sys.stderr.write(header("Validation")) if result["errors"]: - sys.stderr.write("{} Errors found:\n".format(len(result["errors"]))) + sys.stderr.write(f"{len(result['errors'])} Errors found:\n") for err_code, err_desc in result["errors"]: # Failing examples are printed at the end if err_code == "EX02": sys.stderr.write("\tExamples do not pass tests\n") continue - sys.stderr.write("\t{}\n".format(err_desc)) + sys.stderr.write(f"\t{err_desc}\n") if result["warnings"]: - sys.stderr.write("{} Warnings found:\n".format(len(result["warnings"]))) + sys.stderr.write(f"{len(result['warnings'])} Warnings found:\n") for wrn_code, wrn_desc in result["warnings"]: - sys.stderr.write("\t{}\n".format(wrn_desc)) + sys.stderr.write(f"\t{wrn_desc}\n") if not result["errors"]: - sys.stderr.write('Docstring for "{}" correct. 
:)\n'.format(func_name)) + sys.stderr.write(f'Docstring for "{func_name}" correct. :)\n') if result["examples_errors"]: sys.stderr.write(header("Doctests")) @@ -1029,7 +1021,7 @@ def header(title, width=80, char="#"): choices=format_opts, help="format of the output when validating " "multiple docstrings (ignored when validating one)." - "It can be {}".format(str(format_opts)[1:-1]), + f"It can be {str(format_opts)[1:-1]}", ) argparser.add_argument( "--prefix", diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py new file mode 100755 index 0000000000000..3feeddaabe8d2 --- /dev/null +++ b/scripts/validate_string_concatenation.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +""" +GH #30454 + +Check where there is a string that needs to be concatenated. + +This is necessary after black formating, +where for example black transforms this: + +>>> foo = ( +... "bar " +... "baz" +... ) + +into this: + +>>> foo = ("bar " "baz") + +Black is not considering this as an +issue (see issue https://github.com/psf/black/issues/1051), +so we are checking it here. +""" + +import argparse +import os +import sys +import token +import tokenize +from typing import Generator, List, Tuple + +FILE_EXTENSIONS_TO_CHECK = (".py", ".pyx", ".pyx.ini", ".pxd") + + +def main(source_path: str, output_format: str) -> bool: + """ + Main entry point of the script. + + Parameters + ---------- + source_path : str + Source path representing path to a file/directory. + output_format : str + Output format of the script. + + Returns + ------- + bool + True if found any strings that needs to be concatenated. + + Raises + ------ + ValueError + If the `source_path` is not pointing to existing file/directory. + """ + if not os.path.exists(source_path): + raise ValueError( + "Please enter a valid path, pointing to a valid file/directory." + ) + + is_failed: bool = False + + msg = "String unnecessarily split in two by black. Please merge them manually." + + if os.path.isfile(source_path): + for source_path, line_number in strings_to_concatenate(source_path): + is_failed = True + print( + output_format.format( + source_path=source_path, line_number=line_number, msg=msg + ) + ) + + for subdir, _, files in os.walk(source_path): + for file_name in files: + if any( + file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK + ): + for source_path, line_number in strings_to_concatenate( + os.path.join(subdir, file_name) + ): + is_failed = True + print( + output_format.format( + source_path=source_path, line_number=line_number, msg=msg + ) + ) + return is_failed + + +def strings_to_concatenate(source_path: str) -> Generator[Tuple[str, int], None, None]: + """ + Yielding the strings that needs to be concatenated in a given file. + + Parameters + ---------- + source_path : str + File path pointing to a single file. + + Yields + ------ + source_path : str + Source file path. + line_number : int + Line number of unconcatenated string. + """ + with open(source_path, "r") as file_name: + tokens: List = list(tokenize.generate_tokens(file_name.readline)) + + for current_token, next_token in zip(tokens, tokens[1:]): + if current_token[0] == next_token[0] == token.STRING: + yield source_path, current_token[2][0] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Validate concatenated strings") + + parser.add_argument( + "path", nargs="?", default=".", help="Source path of file/directory to check." 
+ ) + parser.add_argument( + "--format", + "-f", + default="{source_path}:{line_number}:{msg}", + help="Output format of the unconcatenated strings.", + ) + + args = parser.parse_args() + + sys.exit(main(source_path=args.path, output_format=args.format)) diff --git a/setup.cfg b/setup.cfg index d4657100c1291..d0570cee6fe10 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,16 +47,8 @@ ignore = E402, # module level import not at top of file E711, # comparison to none should be 'if cond is none:' exclude = - doc/source/getting_started/basics.rst doc/source/development/contributing_docstring.rst - -[yapf] -based_on_style = pep8 -split_before_named_assigns = false -split_penalty_after_opening_bracket = 1000000 -split_penalty_logical_operator = 30 - [tool:pytest] # sync minversion with setup.cfg & install.rst minversion = 4.0.2 @@ -74,12 +66,14 @@ xfail_strict = True filterwarnings = error:Sparse:FutureWarning error:The SparseArray:FutureWarning +junit_family=xunit2 [coverage:run] branch = False omit = */tests/* pandas/_typing.py + pandas/_version.py plugins = Cython.Coverage [coverage:report] @@ -111,12 +105,12 @@ directory = coverage_html_report # To be kept consistent with "Import Formatting" section in contributing.rst [isort] known_pre_libs = pandas._config -known_pre_core = pandas._libs,pandas.util._*,pandas.compat,pandas.errors +known_pre_core = pandas._libs,pandas._typing,pandas.util._*,pandas.compat,pandas.errors known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER known_first_party = pandas -known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml +known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml,odf multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 @@ -129,6 +123,11 @@ skip = pandas/__init__.py,pandas/core/api.py [mypy] ignore_missing_imports=True no_implicit_optional=True +check_untyped_defs=True +strict_equality=True + +[mypy-pandas.tests.*] +check_untyped_defs=False [mypy-pandas.conftest] ignore_errors=True @@ -136,9 +135,6 @@ ignore_errors=True [mypy-pandas.tests.arithmetic.test_datetime64] ignore_errors=True -[mypy-pandas.tests.dtypes.test_common] -ignore_errors=True - [mypy-pandas.tests.extension.decimal.test_decimal] ignore_errors=True @@ -148,17 +144,197 @@ ignore_errors=True [mypy-pandas.tests.extension.json.test_json] ignore_errors=True -[mypy-pandas.tests.indexes.interval.test_interval_tree] +[mypy-pandas.tests.indexes.datetimes.test_tools] ignore_errors=True -[mypy-pandas.tests.indexes.test_base] +[mypy-pandas.tests.scalar.period.test_period] ignore_errors=True -[mypy-pandas.tests.indexing.test_loc] -ignore_errors=True +[mypy-pandas._version] +check_untyped_defs=False -[mypy-pandas.tests.series.test_operators] -ignore_errors=True +[mypy-pandas.core.arrays.categorical] +check_untyped_defs=False -[mypy-pandas.tests.tseries.offsets.test_offsets] -ignore_errors=True +[mypy-pandas.core.arrays.interval] +check_untyped_defs=False + +[mypy-pandas.core.arrays.sparse.array] +check_untyped_defs=False + +[mypy-pandas.core.base] +check_untyped_defs=False + +[mypy-pandas.core.computation.expr] 
+check_untyped_defs=False + +[mypy-pandas.core.computation.expressions] +check_untyped_defs=False + +[mypy-pandas.core.computation.ops] +check_untyped_defs=False + +[mypy-pandas.core.computation.pytables] +check_untyped_defs=False + +[mypy-pandas.core.computation.scope] +check_untyped_defs=False + +[mypy-pandas.core.dtypes.cast] +check_untyped_defs=False + +[mypy-pandas.core.frame] +check_untyped_defs=False + +[mypy-pandas.core.generic] +check_untyped_defs=False + +[mypy-pandas.core.groupby.generic] +check_untyped_defs=False + +[mypy-pandas.core.groupby.grouper] +check_untyped_defs=False + +[mypy-pandas.core.groupby.ops] +check_untyped_defs=False + +[mypy-pandas.core.indexes.base] +check_untyped_defs=False + +[mypy-pandas.core.indexes.datetimelike] +check_untyped_defs=False + +[mypy-pandas.core.indexes.datetimes] +check_untyped_defs=False + +[mypy-pandas.core.indexes.interval] +check_untyped_defs=False + +[mypy-pandas.core.indexes.multi] +check_untyped_defs=False + +[mypy-pandas.core.indexing] +check_untyped_defs=False + +[mypy-pandas.core.internals.blocks] +check_untyped_defs=False + +[mypy-pandas.core.internals.concat] +check_untyped_defs=False + +[mypy-pandas.core.internals.construction] +check_untyped_defs=False + +[mypy-pandas.core.internals.managers] +check_untyped_defs=False + +[mypy-pandas.core.missing] +check_untyped_defs=False + +[mypy-pandas.core.nanops] +check_untyped_defs=False + +[mypy-pandas.core.ops.docstrings] +check_untyped_defs=False + +[mypy-pandas.core.resample] +check_untyped_defs=False + +[mypy-pandas.core.reshape.merge] +check_untyped_defs=False + +[mypy-pandas.core.reshape.reshape] +check_untyped_defs=False + +[mypy-pandas.core.strings] +check_untyped_defs=False + +[mypy-pandas.core.tools.datetimes] +check_untyped_defs=False + +[mypy-pandas.core.window.common] +check_untyped_defs=False + +[mypy-pandas.core.window.ewm] +check_untyped_defs=False + +[mypy-pandas.core.window.expanding] +check_untyped_defs=False + +[mypy-pandas.core.window.rolling] +check_untyped_defs=False + +[mypy-pandas.io.clipboard] +check_untyped_defs=False + +[mypy-pandas.io.excel._base] +check_untyped_defs=False + +[mypy-pandas.io.excel._openpyxl] +check_untyped_defs=False + +[mypy-pandas.io.excel._util] +check_untyped_defs=False + +[mypy-pandas.io.excel._xlwt] +check_untyped_defs=False + +[mypy-pandas.io.formats.console] +check_untyped_defs=False + +[mypy-pandas.io.formats.css] +check_untyped_defs=False + +[mypy-pandas.io.formats.excel] +check_untyped_defs=False + +[mypy-pandas.io.formats.format] +check_untyped_defs=False + +[mypy-pandas.io.formats.style] +check_untyped_defs=False + +[mypy-pandas.io.html] +check_untyped_defs=False + +[mypy-pandas.io.json._json] +check_untyped_defs=False + +[mypy-pandas.io.json._table_schema] +check_untyped_defs=False + +[mypy-pandas.io.parsers] +check_untyped_defs=False + +[mypy-pandas.io.pytables] +check_untyped_defs=False + +[mypy-pandas.io.sas.sas_xport] +check_untyped_defs=False + +[mypy-pandas.io.sas.sas7bdat] +check_untyped_defs=False + +[mypy-pandas.io.sas.sasreader] +check_untyped_defs=False + +[mypy-pandas.io.stata] +check_untyped_defs=False + +[mypy-pandas.plotting._matplotlib.converter] +check_untyped_defs=False + +[mypy-pandas.plotting._matplotlib.core] +check_untyped_defs=False + +[mypy-pandas.plotting._matplotlib.misc] +check_untyped_defs=False + +[mypy-pandas.tseries.holiday] +check_untyped_defs=False + +[mypy-pandas.tseries.offsets] +check_untyped_defs=False + +[mypy-pandas._testing] +check_untyped_defs=False diff --git a/setup.py b/setup.py 
index 0dd1980088db8..c33ce063cb4d9 100755 --- a/setup.py +++ b/setup.py @@ -6,6 +6,7 @@ BSD license. Parts are from lxml (https://github.com/lxml/lxml) """ +import argparse from distutils.sysconfig import get_config_vars from distutils.version import LooseVersion import os @@ -38,9 +39,9 @@ def is_platform_mac(): "install_requires": [ "python-dateutil >= 2.6.1", "pytz >= 2017.2", - "numpy >= {numpy_ver}".format(numpy_ver=min_numpy_ver), + f"numpy >= {min_numpy_ver}", ], - "setup_requires": ["numpy >= {numpy_ver}".format(numpy_ver=min_numpy_ver)], + "setup_requires": [f"numpy >= {min_numpy_ver}"], "zip_safe": False, } @@ -48,11 +49,12 @@ def is_platform_mac(): try: import Cython - ver = Cython.__version__ + _CYTHON_VERSION = Cython.__version__ from Cython.Build import cythonize - _CYTHON_INSTALLED = ver >= LooseVersion(min_cython_ver) + _CYTHON_INSTALLED = _CYTHON_VERSION >= LooseVersion(min_cython_ver) except ImportError: + _CYTHON_VERSION = None _CYTHON_INSTALLED = False cythonize = lambda x, *args, **kwargs: x # dummy func @@ -62,31 +64,19 @@ def is_platform_mac(): from distutils.extension import Extension # noqa: E402 isort:skip from distutils.command.build import build # noqa: E402 isort:skip -try: - if not _CYTHON_INSTALLED: - raise ImportError("No supported version of Cython installed.") +if _CYTHON_INSTALLED: from Cython.Distutils.old_build_ext import old_build_ext as _build_ext cython = True -except ImportError: + from Cython import Tempita as tempita +else: from distutils.command.build_ext import build_ext as _build_ext cython = False -else: - try: - try: - from Cython import Tempita as tempita - except ImportError: - import tempita - except ImportError: - raise ImportError("Building pandas requires Tempita: pip install Tempita") _pxi_dep_template = { - "algos": [ - "_libs/algos_common_helper.pxi.in", - "_libs/algos_take_helper.pxi.in", - ], + "algos": ["_libs/algos_common_helper.pxi.in", "_libs/algos_take_helper.pxi.in"], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", @@ -132,12 +122,7 @@ def build_extensions(self): if cython: self.render_templates(_pxifiles) - numpy_incl = pkg_resources.resource_filename("numpy", "core/include") - - for ext in self.extensions: - if hasattr(ext, "include_dirs") and numpy_incl not in ext.include_dirs: - ext.include_dirs.append(numpy_incl) - _build_ext.build_extensions(self) + super().build_extensions() DESCRIPTION = "Powerful data structures for data analysis, time series, and statistics" @@ -165,7 +150,7 @@ def build_extensions(self): (2-dimensional), handle the vast majority of typical use cases in finance, statistics, social science, and many areas of engineering. For R users, DataFrame provides everything that R's ``data.frame`` provides and much -more. pandas is built on top of `NumPy `__ and is +more. pandas is built on top of `NumPy `__ and is intended to integrate well within a scientific computing environment with many other 3rd party libraries. @@ -194,8 +179,7 @@ def build_extensions(self): Excel files, databases, and saving / loading data from the ultrafast **HDF5 format** - **Time series**-specific functionality: date range generation and frequency - conversion, moving window statistics, moving window linear regressions, - date shifting and lagging, etc. + conversion, moving window statistics, date shifting and lagging. Many of these principles are here to address the shortcomings frequently experienced using other languages / scientific research environments. 
For data @@ -209,11 +193,11 @@ def build_extensions(self): LICENSE = "BSD" AUTHOR = "The PyData Development Team" EMAIL = "pydata@googlegroups.com" -URL = "http://pandas.pydata.org" +URL = "https://pandas.pydata.org" DOWNLOAD_URL = "" PROJECT_URLS = { "Bug Tracker": "https://github.com/pandas-dev/pandas/issues", - "Documentation": "http://pandas.pydata.org/pandas-docs/stable/", + "Documentation": "https://pandas.pydata.org/pandas-docs/stable/", "Source Code": "https://github.com/pandas-dev/pandas", } CLASSIFIERS = [ @@ -223,7 +207,6 @@ def build_extensions(self): "Intended Audience :: Science/Research", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", @@ -330,7 +313,6 @@ class CheckSDist(sdist_class): "pandas/_libs/missing.pyx", "pandas/_libs/reduction.pyx", "pandas/_libs/testing.pyx", - "pandas/_libs/skiplist.pyx", "pandas/_libs/sparse.pyx", "pandas/_libs/ops.pyx", "pandas/_libs/parsers.pyx", @@ -349,14 +331,13 @@ class CheckSDist(sdist_class): "pandas/_libs/tslibs/resolution.pyx", "pandas/_libs/tslibs/parsing.pyx", "pandas/_libs/tslibs/tzconversion.pyx", + "pandas/_libs/window/indexers.pyx", "pandas/_libs/writers.pyx", "pandas/io/sas/sas.pyx", ] _cpp_pyxfiles = [ - "pandas/_libs/window.pyx", - "pandas/io/msgpack/_packer.pyx", - "pandas/io/msgpack/_unpacker.pyx", + "pandas/_libs/window/aggregations.pyx", ] def initialize_options(self): @@ -374,10 +355,8 @@ def run(self): for pyxfile in pyxfiles: sourcefile = pyxfile[:-3] + extension msg = ( - "{extension}-source file '{source}' not found.\n" - "Run 'setup.py cython' before sdist.".format( - source=sourcefile, extension=extension - ) + f"{extension}-source file '{sourcefile}' not found.\n" + f"Run 'setup.py cython' before sdist." ) assert os.path.isfile(sourcefile), msg sdist_class.run(self) @@ -392,14 +371,12 @@ def check_cython_extensions(self, extensions): for ext in extensions: for src in ext.sources: if not os.path.exists(src): - print("{}: -> [{}]".format(ext.name, ext.sources)) + print(f"{ext.name}: -> [{ext.sources}]") raise Exception( - """Cython-generated file '{src}' not found. + f"""Cython-generated file '{src}' not found. Cython is required to compile pandas from a development branch. Please install Cython or download a release package of pandas. - """.format( - src=src - ) + """ ) def build_extensions(self): @@ -467,7 +444,7 @@ def run(self): extra_link_args.append("/DEBUG") else: # args to ignore warnings - extra_compile_args = ["-Wno-unused-function"] + extra_compile_args = [] extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("-g") @@ -528,12 +505,39 @@ def maybe_cythonize(extensions, *args, **kwargs): # See https://github.com/cython/cython/issues/1495 return extensions + elif not cython: + # GH#28836 raise a helfpul error message + if _CYTHON_VERSION: + raise RuntimeError( + f"Cannot cythonize with old Cython version ({_CYTHON_VERSION} " + f"installed, needs {min_cython_ver})" + ) + raise RuntimeError("Cannot cythonize without Cython installed.") + numpy_incl = pkg_resources.resource_filename("numpy", "core/include") # TODO: Is this really necessary here? 
for ext in extensions: if hasattr(ext, "include_dirs") and numpy_incl not in ext.include_dirs: ext.include_dirs.append(numpy_incl) + # reuse any parallel arguments provided for compilation to cythonize + parser = argparse.ArgumentParser() + parser.add_argument("-j", type=int) + parser.add_argument("--parallel", type=int) + parsed, _ = parser.parse_known_args() + + nthreads = 0 + if parsed.parallel: + nthreads = parsed.parallel + elif parsed.j: + nthreads = parsed.j + + # GH#30356 Cythonize doesn't support parallel on Windows + if is_platform_windows() and nthreads > 0: + print("Parallel build for cythonize ignored on Windows") + nthreads = 0 + + kwargs["nthreads"] = nthreads build_ext.render_templates(_pxifiles) return cythonize(extensions, *args, **kwargs) @@ -542,55 +546,51 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): return pjoin("pandas", subdir, name + suffix) -common_include = ["pandas/_libs/src/klib", "pandas/_libs/src"] -ts_include = ["pandas/_libs/tslibs/src", "pandas/_libs/tslibs"] - +lib_depends = ["pandas/_libs/src/parse_helper.h"] -lib_depends = ["pandas/_libs/src/parse_helper.h", "pandas/_libs/src/compat_helper.h"] +klib_include = ["pandas/_libs/src/klib"] -np_datetime_headers = [ +tseries_depends = [ "pandas/_libs/tslibs/src/datetime/np_datetime.h", "pandas/_libs/tslibs/src/datetime/np_datetime_strings.h", ] -np_datetime_sources = [ - "pandas/_libs/tslibs/src/datetime/np_datetime.c", - "pandas/_libs/tslibs/src/datetime/np_datetime_strings.c", -] - -tseries_depends = np_datetime_headers - ext_data = { - "_libs.algos": {"pyxfile": "_libs/algos", "depends": _pxi_dep["algos"]}, + "_libs.algos": { + "pyxfile": "_libs/algos", + "include": klib_include, + "depends": _pxi_dep["algos"], + }, "_libs.groupby": {"pyxfile": "_libs/groupby"}, - "_libs.hashing": {"pyxfile": "_libs/hashing", "include": [], "depends": []}, + "_libs.hashing": {"pyxfile": "_libs/hashing", "depends": []}, "_libs.hashtable": { "pyxfile": "_libs/hashtable", + "include": klib_include, "depends": (["pandas/_libs/src/klib/khash_python.h"] + _pxi_dep["hashtable"]), }, "_libs.index": { "pyxfile": "_libs/index", - "include": common_include + ts_include, + "include": klib_include, "depends": _pxi_dep["index"], - "sources": np_datetime_sources, }, "_libs.indexing": {"pyxfile": "_libs/indexing"}, "_libs.internals": {"pyxfile": "_libs/internals"}, - "_libs.interval": {"pyxfile": "_libs/interval", "depends": _pxi_dep["interval"]}, - "_libs.join": {"pyxfile": "_libs/join"}, + "_libs.interval": { + "pyxfile": "_libs/interval", + "include": klib_include, + "depends": _pxi_dep["interval"], + }, + "_libs.join": {"pyxfile": "_libs/join", "include": klib_include}, "_libs.lib": { "pyxfile": "_libs/lib", - "include": common_include + ts_include, "depends": lib_depends + tseries_depends, + "include": klib_include, # due to tokenizer import "sources": ["pandas/_libs/src/parser/tokenizer.c"], }, - "_libs.missing": { - "pyxfile": "_libs/missing", - "include": common_include + ts_include, - "depends": tseries_depends, - }, + "_libs.missing": {"pyxfile": "_libs/missing", "depends": tseries_depends}, "_libs.parsers": { "pyxfile": "_libs/parsers", + "include": klib_include + ["pandas/_libs/src"], "depends": [ "pandas/_libs/src/parser/tokenizer.h", "pandas/_libs/src/parser/io.h", @@ -602,123 +602,81 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): }, "_libs.reduction": {"pyxfile": "_libs/reduction"}, "_libs.ops": {"pyxfile": "_libs/ops"}, - "_libs.properties": {"pyxfile": "_libs/properties", "include": []}, + 
"_libs.ops_dispatch": {"pyxfile": "_libs/ops_dispatch"}, + "_libs.properties": {"pyxfile": "_libs/properties"}, "_libs.reshape": {"pyxfile": "_libs/reshape", "depends": []}, - "_libs.skiplist": { - "pyxfile": "_libs/skiplist", - "depends": ["pandas/_libs/src/skiplist.h"], - }, "_libs.sparse": {"pyxfile": "_libs/sparse", "depends": _pxi_dep["sparse"]}, - "_libs.tslib": { - "pyxfile": "_libs/tslib", - "include": ts_include, - "depends": tseries_depends, - "sources": np_datetime_sources, - }, + "_libs.tslib": {"pyxfile": "_libs/tslib", "depends": tseries_depends}, "_libs.tslibs.c_timestamp": { "pyxfile": "_libs/tslibs/c_timestamp", - "include": ts_include, "depends": tseries_depends, - "sources": np_datetime_sources, }, - "_libs.tslibs.ccalendar": {"pyxfile": "_libs/tslibs/ccalendar", "include": []}, + "_libs.tslibs.ccalendar": {"pyxfile": "_libs/tslibs/ccalendar"}, "_libs.tslibs.conversion": { "pyxfile": "_libs/tslibs/conversion", - "include": ts_include, "depends": tseries_depends, - "sources": np_datetime_sources, + "sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"], }, "_libs.tslibs.fields": { "pyxfile": "_libs/tslibs/fields", - "include": ts_include, "depends": tseries_depends, - "sources": np_datetime_sources, }, - "_libs.tslibs.frequencies": {"pyxfile": "_libs/tslibs/frequencies", "include": []}, - "_libs.tslibs.nattype": {"pyxfile": "_libs/tslibs/nattype", "include": []}, + "_libs.tslibs.frequencies": {"pyxfile": "_libs/tslibs/frequencies"}, + "_libs.tslibs.nattype": {"pyxfile": "_libs/tslibs/nattype"}, "_libs.tslibs.np_datetime": { "pyxfile": "_libs/tslibs/np_datetime", - "include": ts_include, - "depends": np_datetime_headers, - "sources": np_datetime_sources, + "depends": tseries_depends, + "sources": [ + "pandas/_libs/tslibs/src/datetime/np_datetime.c", + "pandas/_libs/tslibs/src/datetime/np_datetime_strings.c", + ], }, "_libs.tslibs.offsets": { "pyxfile": "_libs/tslibs/offsets", - "include": ts_include, "depends": tseries_depends, - "sources": np_datetime_sources, }, "_libs.tslibs.parsing": { "pyxfile": "_libs/tslibs/parsing", + "include": klib_include, "depends": ["pandas/_libs/src/parser/tokenizer.h"], "sources": ["pandas/_libs/src/parser/tokenizer.c"], }, "_libs.tslibs.period": { "pyxfile": "_libs/tslibs/period", - "include": ts_include, "depends": tseries_depends, - "sources": np_datetime_sources, + "sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"], }, "_libs.tslibs.resolution": { "pyxfile": "_libs/tslibs/resolution", - "include": ts_include, "depends": tseries_depends, - "sources": np_datetime_sources, }, "_libs.tslibs.strptime": { "pyxfile": "_libs/tslibs/strptime", - "include": ts_include, "depends": tseries_depends, - "sources": np_datetime_sources, }, "_libs.tslibs.timedeltas": { "pyxfile": "_libs/tslibs/timedeltas", - "include": ts_include, - "depends": np_datetime_headers, - "sources": np_datetime_sources, + "depends": tseries_depends, }, "_libs.tslibs.timestamps": { "pyxfile": "_libs/tslibs/timestamps", - "include": ts_include, "depends": tseries_depends, - "sources": np_datetime_sources, }, - "_libs.tslibs.timezones": {"pyxfile": "_libs/tslibs/timezones", "include": []}, + "_libs.tslibs.timezones": {"pyxfile": "_libs/tslibs/timezones"}, "_libs.tslibs.tzconversion": { "pyxfile": "_libs/tslibs/tzconversion", - "include": ts_include, "depends": tseries_depends, - "sources": np_datetime_sources, }, "_libs.testing": {"pyxfile": "_libs/testing"}, - "_libs.window": {"pyxfile": "_libs/window", "language": "c++", "suffix": ".cpp"}, - "_libs.writers": 
{"pyxfile": "_libs/writers"}, - "io.sas._sas": {"pyxfile": "io/sas/sas"}, - "io.msgpack._packer": { - "macros": endian_macro + macros, - "depends": [ - "pandas/_libs/src/msgpack/pack.h", - "pandas/_libs/src/msgpack/pack_template.h", - ], - "include": ["pandas/_libs/src/msgpack"] + common_include, - "language": "c++", - "suffix": ".cpp", - "pyxfile": "io/msgpack/_packer", - "subdir": "io/msgpack", - }, - "io.msgpack._unpacker": { - "depends": [ - "pandas/_libs/src/msgpack/unpack.h", - "pandas/_libs/src/msgpack/unpack_define.h", - "pandas/_libs/src/msgpack/unpack_template.h", - ], - "macros": endian_macro + macros, - "include": ["pandas/_libs/src/msgpack"] + common_include, + "_libs.window.aggregations": { + "pyxfile": "_libs/window/aggregations", "language": "c++", "suffix": ".cpp", - "pyxfile": "io/msgpack/_unpacker", - "subdir": "io/msgpack", + "depends": ["pandas/_libs/src/skiplist.h"], }, + "_libs.window.indexers": {"pyxfile": "_libs/window/indexers"}, + "_libs.writers": {"pyxfile": "_libs/writers"}, + "io.sas._sas": {"pyxfile": "io/sas/sas"}, } extensions = [] @@ -730,10 +688,10 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): sources.extend(data.get("sources", [])) - include = data.get("include", common_include) + include = data.get("include") obj = Extension( - "pandas.{name}".format(name=name), + f"pandas.{name}", sources=sources, depends=data.get("depends", []), include_dirs=include, @@ -766,7 +724,10 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "pandas/_libs/src/ujson/lib/ultrajsonenc.c", "pandas/_libs/src/ujson/lib/ultrajsondec.c", ] - + np_datetime_sources + + [ + "pandas/_libs/tslibs/src/datetime/np_datetime.c", + "pandas/_libs/tslibs/src/datetime/np_datetime_strings.c", + ] ), include_dirs=[ "pandas/_libs/src/ujson/python", @@ -781,19 +742,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): extensions.append(ujson_ext) -# ---------------------------------------------------------------------- -# util -# extension for pseudo-safely moving bytes into mutable buffers -_move_ext = Extension( - "pandas.util._move", - depends=[], - sources=["pandas/util/move.c"], - define_macros=macros, - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args, -) -extensions.append(_move_ext) - # ---------------------------------------------------------------------- @@ -817,7 +765,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): long_description=LONG_DESCRIPTION, classifiers=CLASSIFIERS, platforms="any", - python_requires=">=3.5.3", + python_requires=">=3.6.1", extras_require={ "test": [ # sync with setup.cfg minversion & install.rst @@ -829,5 +777,5 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): entry_points={ "pandas_plotting_backends": ["matplotlib = pandas:plotting._matplotlib"] }, - **setuptools_kwargs + **setuptools_kwargs, ) diff --git a/web/pandas/about/index.html b/web/pandas/about/index.md similarity index 95% rename from web/pandas/about/index.html rename to web/pandas/about/index.md index 4e50d280d2a10..9a0a3923a6b82 100644 --- a/web/pandas/about/index.html +++ b/web/pandas/about/index.md @@ -49,8 +49,8 @@ high-dimensional data in a lower-dimensional data structure; - **Time series**-functionality: date range generation and frequency - conversion, moving window statistics, moving window linear regressions, date - shifting and lagging. Even create domain-specific time offsets and join time + conversion, moving window statistics, date shifting and lagging. 
+ Even create domain-specific time offsets and join time series without losing data; - Highly **optimized for performance**, with critical code paths written in diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index cf242e86f879f..af6fd1ac77605 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -86,12 +86,12 @@ models to emphasize patterns in a dataset. ### [yhat/ggpy](https://github.com/yhat/ggpy) -Hadley Wickham\'s [ggplot2](https://ggplot2.tidyverse.org/) is a +Hadley Wickham's [ggplot2](https://ggplot2.tidyverse.org/) is a foundational exploratory visualization package for the R language. Based -on [\"The Grammar of -Graphics\"](https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html) +on ["The Grammar of +Graphics"](https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html) it provides a powerful, declarative and extremely general way to -generate bespoke plots of any kind of data. It\'s really quite +generate bespoke plots of any kind of data. It's really quite incredible. Various implementations to other languages are available, but a faithful implementation for Python users has long been missing. Although still young (as of Jan-2014), the @@ -100,9 +100,7 @@ quickly in that direction. ### [IPython Vega](https://github.com/vega/ipyvega) -[IPython Vega](https://github.com/vega/ipyvega) leverages [Vega -\]\_\_ to create plots -within Jupyter Notebook. +[IPython Vega](https://github.com/vega/ipyvega) leverages [Vega](https://github.com/vega/vega) to create plots within Jupyter Notebook. ### [Plotly](https://plot.ly/python) @@ -158,8 +156,8 @@ for pandas `display.` settings. ### [quantopian/qgrid](https://github.com/quantopian/qgrid) -qgrid is \"an interactive grid for sorting and filtering DataFrames in -IPython Notebook\" built with SlickGrid. +qgrid is "an interactive grid for sorting and filtering DataFrames in +IPython Notebook" built with SlickGrid. ### [Spyder](https://www.spyder-ide.org/) @@ -172,8 +170,8 @@ environment like MATLAB or Rstudio. Its [Variable Explorer](https://docs.spyder-ide.org/variableexplorer.html) allows users to view, manipulate and edit pandas `Index`, `Series`, and -`DataFrame` objects like a \"spreadsheet\", including copying and -modifying values, sorting, displaying a \"heatmap\", converting data +`DataFrame` objects like a "spreadsheet", including copying and +modifying values, sorting, displaying a "heatmap", converting data types and more. Pandas objects can also be renamed, duplicated, new columns added, copyed/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. Spyder can also import data from a variety @@ -181,8 +179,8 @@ of plain text and binary files or the clipboard into a new pandas DataFrame via a sophisticated import wizard. Most pandas classes, methods and data attributes can be autocompleted in -Spyder\'s [Editor](https://docs.spyder-ide.org/editor.html) and [IPython -Console](https://docs.spyder-ide.org/ipythonconsole.html), and Spyder\'s +Spyder's [Editor](https://docs.spyder-ide.org/editor.html) and [IPython +Console](https://docs.spyder-ide.org/ipythonconsole.html), and Spyder's [Help pane](https://docs.spyder-ide.org/help.html) can retrieve and render Numpydoc documentation on pandas objects in rich text with Sphinx both automatically and on-demand. @@ -355,7 +353,7 @@ which work well with pandas' data containers. 
### [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) Cyberpandas provides an extension type for storing arrays of IP -Addresses. These arrays can be stored inside pandas\' Series and +Addresses. These arrays can be stored inside pandas' Series and DataFrame. ## Accessors @@ -364,7 +362,7 @@ A directory of projects providing `extension accessors `. This is for users to discover new accessors and for library authors to coordinate on the namespace. - Library Accessor Classes - ------------------------------------------------------------- ---------- ----------------------- - [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) `ip` `Series` - [pdvega](https://altair-viz.github.io/pdvega/) `vgplot` `Series`, `DataFrame` + | Library | Accessor | Classes | + | ------------------------------------------------------------|----------|-----------------------| + | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) | `ip` | `Series` | + | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | diff --git a/web/pandas/contribute.md b/web/pandas/contribute.md index 825a5870bf5a0..9f4ebaf97598c 100644 --- a/web/pandas/contribute.md +++ b/web/pandas/contribute.md @@ -1,12 +1,55 @@ # Contribute to pandas -_pandas_ is and always will be **free**. To make the development sustainable, we need _pandas_ users, corporate -or individual, to support the development by providing their time and money. +_pandas_ is and will always be **free**. To make the development sustainable, we need _pandas_ users, corporate +and individual, to support the development by providing their time and money. You can find more information about current developers in the [team page](about/team.html), and about current sponsors in the [sponsors page](about/sponsors.html). -Financial contributions will mainly be used to advance in the [pandas roadmap](about/roadmap.html). -- If your **company or organization** is interested in helping make pandas better, please contact us at [info@numfocus.org](mailto:info@numfocus.org) -- If you want to contribute to _pandas_ with your **time**, please visit the [contributing page]({{ base_url }}/docs/development/index.html) -- If you want to support _pandas_ with a **donation**, please use the [donations page](donate.html). +
+## Corporate support
+
+pandas depends on companies and institutions using the software to support its
+development: hiring people to work on pandas, letting existing employees
+contribute to the software, or sponsoring pandas with funds so the project can
+hire people to make progress on the pandas roadmap.
+
+More information in the [sponsors page](about/sponsors.html).
+
+## Individual contributors
+
+pandas is mostly developed by volunteers. All kinds of contributions are
+welcome, such as contributions to the code, to the website (including graphical
+designers), to the documentation (including translators) and others. There are
+tasks for all levels, including beginners.
+
+More information in the [contributing page]({{ base_url }}/docs/development/index.html).
+
+## Donations
+
+Individual donations are appreciated, and are used for things like the project
+infrastructure, travel expenses for our volunteer contributors to attend the
+in-person sprints, or to give small grants to develop features.
+
+Make your donation in the [donate page](donate.html).
diff --git a/web/pandas/getting_started.md b/web/pandas/getting_started.md index 9682cf90cad6f..4195cc00b2419 100644 --- a/web/pandas/getting_started.md +++ b/web/pandas/getting_started.md @@ -9,7 +9,7 @@ the [advanced installation page]({{ base_url}}/docs/getting_started/install.html 1. Download [Anaconda](https://www.anaconda.com/distribution/) for your operating system and the latest Python version, run the installer, and follow the steps. Detailed instructions on how to install Anaconda can be found in the - [Anaconda documentation](https://docs.anaconda.com/anaconda/install/)). + [Anaconda documentation](https://docs.anaconda.com/anaconda/install/). 2. In the Anaconda prompt (or terminal in Linux or MacOS), start JupyterLab: diff --git a/web/pandas/index.html b/web/pandas/index.html index df6e5ab9a330b..5aac5da16295b 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -35,7 +35,7 @@
Documentation
Community
diff --git a/web/pandas/static/css/pandas.css b/web/pandas/static/css/pandas.css index 8b5905d480ac3..b27ec6d42cc11 100644 --- a/web/pandas/static/css/pandas.css +++ b/web/pandas/static/css/pandas.css @@ -23,6 +23,12 @@ a { code { white-space: pre; } +.blue { + color: #150458; +} +.pink { + color: #e70488; +} .fab { font-size: 1.2rem; color: #666;