diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c00cec450c85e..48a3c464b8fe2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,7 +6,7 @@ on: pull_request: branches: - master - - 1.1.x + - 1.2.x env: ENV_FILE: environment.yml @@ -64,7 +64,7 @@ jobs: - name: Testing docstring validation script run: | source activate pandas-dev - pytest --capture=no --strict scripts + pytest --capture=no --strict-markers scripts if: always() - name: Running benchmarks diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml new file mode 100644 index 0000000000000..c2f332cc5454a --- /dev/null +++ b/.github/workflows/database.yml @@ -0,0 +1,186 @@ +name: Database + +on: + push: + branches: [master] + pull_request: + branches: + - master + - 1.2.x + +env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: ((not slow and not network and not clipboard) or (single and db)) + +jobs: + Linux_py37_locale: + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + + env: + ENV_FILE: ci/deps/actions-37-locale.yaml + LOCALE_OVERRIDE: zh_CN.UTF-8 + + services: + mysql: + image: mysql + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Cache conda + uses: actions/cache@v1 + env: + CACHE_NUMBER: 0 + with: + path: ~/conda_pkgs_dir + key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ + hashFiles('${{ env.ENV_FILE }}') }} + + - uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: pandas-dev + channel-priority: strict + environment-file: ${{ env.ENV_FILE }} + use-only-tar-bz2: true + + - name: Environment Detail + run: | + conda info + conda list + + - name: Build Pandas + run: | + python setup.py build_ext -j 2 + python -m pip install -e . 
--no-build-isolation --no-use-pep517 + + - name: Test + run: ci/run_tests.sh + if: always() + + - name: Build Version + run: pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + + - name: Publish test results + uses: actions/upload-artifact@master + with: + name: Test results + path: test-data.xml + if: failure() + + - name: Print skipped tests + run: python ci/print_skipped.py + + Linux_py37_cov: + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + + env: + ENV_FILE: ci/deps/actions-37-cov.yaml + PANDAS_TESTING_MODE: deprecate + COVERAGE: true + + services: + mysql: + image: mysql + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Cache conda + uses: actions/cache@v1 + env: + CACHE_NUMBER: 0 + with: + path: ~/conda_pkgs_dir + key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ + hashFiles('${{ env.ENV_FILE }}') }} + + - uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: pandas-dev + channel-priority: strict + environment-file: ${{ env.ENV_FILE }} + use-only-tar-bz2: true + + - name: Environment Detail + run: | + conda info + conda list + + - name: Build Pandas + run: | + python setup.py build_ext -j 2 + python -m pip install -e . --no-build-isolation --no-use-pep517 + + - name: Test + run: ci/run_tests.sh + if: always() + + - name: Build Version + run: pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + + - name: Publish test results + uses: actions/upload-artifact@master + with: + name: Test results + path: test-data.xml + if: failure() + + - name: Print skipped tests + run: python ci/print_skipped.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 717334bfe1299..90d65327ea980 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -144,6 +144,11 @@ repos: \#\ type:\s?ignore(?!\[) language: pygrep types: [python] + - id: np-bool + name: Check for use of np.bool instead of np.bool_ + entry: np\.bool[^_8] + language: pygrep + types_or: [python, cython, rst] - id: no-os-remove name: Check code for instances of os.remove entry: os\.remove diff --git a/.travis.yml b/.travis.yml index 1ddd886699d38..8ede978074a9c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,13 +16,13 @@ services: # travis cache --delete inside the project directory from the travis command line client # The cache directories will be deleted if anything in ci/ changes in a commit cache: + apt: true ccache: true directories: - $HOME/.cache # cython cache env: global: - - PYTEST_WORKERS="auto" # create a github personal access token # cd pandas-dev/pandas # travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas @@ -35,31 +35,10 @@ matrix: fast_finish: true include: - - env: - - JOB="3.8, slow" ENV_FILE="ci/deps/travis-38-slow.yaml" PATTERN="slow" SQL="1" - services: - - mysql - - postgresql - - - env: - - JOB="3.7, locale" ENV_FILE="ci/deps/travis-37-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" - services: - - mysql - - postgresql - - arch: arm64 
env: - JOB="3.7, arm64" PYTEST_WORKERS=1 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" - - env: - # Enabling Deprecations when running tests - # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs - # See pandas/_testing.py for more details. - - JOB="3.7, coverage" ENV_FILE="ci/deps/travis-37-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" - services: - - mysql - - postgresql - allow_failures: # Moved to allowed_failures 2020-09-29 due to timeouts https://github.com/pandas-dev/pandas/issues/36719 - arch: arm64 diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 03480ae198345..65e52e03c43c7 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,7 +5,6 @@ from pandas._libs import lib import pandas as pd -from pandas.core.algorithms import make_duplicates_of_left_unique_in_right from .pandas_vb_common import tm @@ -175,15 +174,4 @@ def time_argsort(self, N): self.array.argsort() -class RemoveDuplicates: - def setup(self): - N = 10 ** 5 - na = np.arange(int(N / 2)) - self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]]) - self.right = np.concatenate([na, na]) - - def time_make_duplicates_of_left_unique_in_right(self): - make_duplicates_of_left_unique_in_right(self.left, self.right) - - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6ce63ff8badca..6cc8e15786795 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -625,7 +625,7 @@ class TransformBools: def setup(self): N = 120000 transition_points = np.sort(np.random.choice(np.arange(N), 1400)) - transitions = np.zeros(N, dtype=np.bool) + transitions = np.zeros(N, dtype=np.bool_) transitions[transition_points] = True self.g = transitions.cumsum() self.df = DataFrame({"signal": np.random.rand(N)}) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..96f02d37db1e1 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -40,7 +40,7 @@ def time_write_excel(self, engine): class ReadExcel: - params = ["xlrd", "openpyxl", "odf"] + params = ["openpyxl", "odf"] param_names = ["engine"] fname_excel = "spreadsheet.xlsx" fname_odf = "spreadsheet.ods" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c49742095e1d8..464bad7884362 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,11 +1,11 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml trigger: - master -- 1.1.x +- 1.2.x pr: - master -- 1.1.x +- 1.2.x variables: PYTEST_WORKERS: auto diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index 8e44db0b4bcd4..4cb4eaf95f6f5 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -43,6 +43,11 @@ jobs: CONDA_PY: "38" PATTERN: "not slow and not network and not clipboard" + py38_slow: + ENV_FILE: ci/deps/azure-38-slow.yaml + CONDA_PY: "38" + PATTERN: "slow" + py38_locale: ENV_FILE: ci/deps/azure-38-locale.yaml CONDA_PY: "38" diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/actions-37-cov.yaml similarity index 96% rename from ci/deps/travis-37-cov.yaml rename to ci/deps/actions-37-cov.yaml index c89b42ef06a2e..5381caaa242cf 100644 --- a/ci/deps/travis-37-cov.yaml +++ b/ci/deps/actions-37-cov.yaml @@ -15,7 +15,7 @@ 
dependencies: - beautifulsoup4 - botocore>=1.11 - dask - - fastparquet>=0.3.2 + - fastparquet>=0.4.0 - fsspec>=0.7.4 - gcsfs>=0.6.0 - geopandas @@ -43,7 +43,7 @@ dependencies: - sqlalchemy - statsmodels - xarray - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt - pip diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/actions-37-locale.yaml similarity index 79% rename from ci/deps/travis-37-locale.yaml rename to ci/deps/actions-37-locale.yaml index 4e442b10482a7..551308f1d5fac 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/actions-37-locale.yaml @@ -1,6 +1,5 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - python=3.7.* @@ -12,15 +11,15 @@ dependencies: - hypothesis>=3.58.0 # required - - numpy + - numpy<1.20 # GH#39541 compat for pyarrow<3 - python-dateutil - pytz # optional - beautifulsoup4 - - blosc=1.15.0 + - blosc=1.17.0 - python-blosc - - fastparquet=0.3.2 + - fastparquet=0.4.0 - html5lib - ipython - jinja2 @@ -31,11 +30,11 @@ dependencies: - openpyxl - pandas-gbq - google-cloud-bigquery>=1.27.2 # GH 36436 - - pyarrow>=0.17 + - pyarrow=0.17 # GH 38803 - pytables>=3.5.1 - scipy - xarray=0.12.3 - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt - moto @@ -43,5 +42,5 @@ dependencies: # sql - psycopg2=2.7 - - pymysql=0.7.11 + - pymysql=0.8.1 - sqlalchemy=1.3.0 diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml index 50fccf86b6340..05b33fa351ac9 100644 --- a/ci/deps/azure-37-slow.yaml +++ b/ci/deps/azure-37-slow.yaml @@ -31,7 +31,7 @@ dependencies: - moto>=1.3.14 - scipy - sqlalchemy - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt - moto diff --git a/ci/deps/azure-37.yaml b/ci/deps/azure-37.yaml index 82cb6760b6d1e..4fe3de161960c 100644 --- a/ci/deps/azure-37.yaml +++ b/ci/deps/azure-37.yaml @@ -18,7 +18,7 @@ dependencies: - numpy - python-dateutil - nomkl - - pyarrow + - pyarrow=0.15.1 - pytz - s3fs>=0.4.0 - moto>=1.3.14 diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index f879111a32e67..26297a3066fa5 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -18,19 +18,20 @@ dependencies: - html5lib - ipython - jinja2 + - jedi<0.18.0 - lxml - matplotlib <3.3.0 - moto - nomkl - numexpr - - numpy + - numpy<1.20 # GH#39541 compat with pyarrow<3 - openpyxl - pytables - python-dateutil - pytz - scipy - xarray - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt - moto diff --git a/ci/deps/travis-38-slow.yaml b/ci/deps/azure-38-slow.yaml similarity index 95% rename from ci/deps/travis-38-slow.yaml rename to ci/deps/azure-38-slow.yaml index e4b719006a11e..fd40f40294b7f 100644 --- a/ci/deps/travis-38-slow.yaml +++ b/ci/deps/azure-38-slow.yaml @@ -1,6 +1,5 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - python=3.8.* @@ -30,7 +29,7 @@ dependencies: - moto>=1.3.14 - scipy - sqlalchemy - - xlrd + - xlrd>=2.0 - xlsxwriter - xlwt - moto diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index 31e0ffca81424..d667adddda859 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -21,12 +21,12 @@ dependencies: - numexpr - numpy=1.16.5 - openpyxl - - pyarrow>=0.15.0 + - pyarrow=0.15.1 - pytables - python-dateutil==2.7.3 - pytz - xarray - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt - pip diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 16b4bd72683b4..e7ac4c783b855 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -33,7 +33,7 @@ dependencies: - s3fs>=0.4.2 - scipy - sqlalchemy - - xlrd + - xlrd>=2.0 - xlsxwriter - xlwt 
- pyreadstat diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 449bbd05991bf..661d8813d32d2 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - blosc - bottleneck - - fastparquet>=0.3.2 + - fastparquet>=0.4.0 - flask - fsspec>=0.8.0 - matplotlib=3.1.3 @@ -31,6 +31,6 @@ dependencies: - pytz - s3fs>=0.4.0 - scipy - - xlrd + - xlrd<2.0 - xlsxwriter - xlwt diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 78d24c814840a..593939431d5eb 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -20,7 +20,7 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then XVFB="xvfb-run " fi -PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" +PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict-markers --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then # GH#37455 windows py38 build appears to be running out of memory diff --git a/doc/source/_static/css/pandas.css b/doc/source/_static/css/pandas.css index 43cd631890330..403d182e3d3e5 100644 --- a/doc/source/_static/css/pandas.css +++ b/doc/source/_static/css/pandas.css @@ -1,3 +1,10 @@ +/* Override some aspects of the pydata-sphinx-theme */ + +:root { + /* Use softer blue from bootstrap's default info color */ + --color-info: 23, 162, 184; +} + /* Getting started index page */ .intro-card { diff --git a/doc/source/conf.py b/doc/source/conf.py index 15e7a13ff5b72..8ab1c8c2f3428 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -427,7 +427,7 @@ ipython_warning_is_error = False -ipython_exec_lines = [ +ipython_execlines = [ "import numpy as np", "import pandas as pd", # This ensures correct rendering on system with console encoding != utf8 @@ -687,6 +687,30 @@ def process_class_docstrings(app, what, name, obj, options, lines): lines[:] = joined.split("\n") +_BUSINED_ALIASES = [ + "pandas.tseries.offsets." + name + for name in [ + "BDay", + "CDay", + "BMonthEnd", + "BMonthBegin", + "CBMonthEnd", + "CBMonthBegin", + ] +] + + +def process_business_alias_docstrings(app, what, name, obj, options, lines): + """ + Starting with sphinx 3.4, the "autodoc-process-docstring" event also + gets called for alias classes. This results in numpydoc adding the + methods/attributes to the docstring, which we don't want (+ this + causes warnings with sphinx). + """ + if name in _BUSINED_ALIASES: + lines[:] = [] + + suppress_warnings = [ # We "overwrite" autosummary with our PandasAutosummary, but # still want the regular autosummary setup to run. So we just @@ -716,6 +740,7 @@ def setup(app): app.connect("source-read", rstjinja) app.connect("autodoc-process-docstring", remove_flags_docstring) app.connect("autodoc-process-docstring", process_class_docstrings) + app.connect("autodoc-process-docstring", process_business_alias_docstrings) app.add_autodocumenter(AccessorDocumenter) app.add_autodocumenter(AccessorAttributeDocumenter) app.add_autodocumenter(AccessorMethodDocumenter) diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst new file mode 100644 index 0000000000000..358c4036df961 --- /dev/null +++ b/doc/source/development/debugging_extensions.rst @@ -0,0 +1,93 @@ +.. 
_debugging_c_extensions: + +{{ header }} + +====================== +Debugging C extensions +====================== + +Pandas uses select C extensions for high performance IO operations. In case you need to debug segfaults or general issues with those extensions, the following steps may be helpful. + +First, be sure to compile the extensions with the appropriate flags to generate debug symbols and remove optimizations. This can be achieved as follows: + +.. code-block:: sh + + python setup.py build_ext --inplace -j4 --with-debugging-symbols + +Using a debugger +================ + +Assuming you are on a Unix-like operating system, you can use either lldb or gdb to debug. The choice between either is largely dependent on your compilation toolchain - typically you would use lldb if using clang and gdb if using gcc. For macOS users, please note that ``gcc`` is on modern systems an alias for ``clang``, so if using Xcode you usually opt for lldb. Regardless of which debugger you choose, please refer to your operating system's instructions on how to install. + +After installing a debugger you can create a script that hits the extension module you are looking to debug. For demonstration purposes, let's assume you have a script called ``debug_testing.py`` with the following contents: + +.. code-block:: python + + import pandas as pd + + pd.DataFrame([[1, 2]]).to_json() + +Place the ``debug_testing.py`` script in the project root and launch a Python process under your debugger. If using lldb: + +.. code-block:: sh + + lldb python + +If using gdb: + +.. code-block:: sh + + gdb python + +Before executing our script, let's set a breakpoint in our JSON serializer in its entry function called ``objToJSON``. The lldb syntax would look as follows: + +.. code-block:: sh + + breakpoint set --name objToJSON + +Similarly for gdb: + +.. code-block:: sh + + break objToJSON + +.. note:: + + You may get a warning that this breakpoint cannot be resolved in lldb. gdb may give a similar warning and prompt you to make the breakpoint on a future library load, which you should say yes to. This should only happen on the very first invocation as the module you wish to debug has not yet been loaded into memory. + +Now go ahead and execute your script: + +.. code-block:: sh + + run debug_testing.py + +Code execution will halt at the breakpoint defined or at the occurrence of any segfault. LLDB's `GDB to LLDB command map <https://lldb.llvm.org/use/map.html>`_ provides a listing of debugger commands that you can execute using either debugger. + +Another option to execute the entire test suite under lldb would be to run the following: + +.. code-block:: sh + + lldb -- python -m pytest + +Or for gdb: + +.. code-block:: sh + + gdb --args python -m pytest + +Once the process launches, simply type ``run`` and the test suite will begin, stopping at any segmentation fault that may occur. + +Checking memory leaks with valgrind +=================================== + +You can use `Valgrind <https://valgrind.org/>`_ to check for and log memory leaks in extensions. For instance, to check for a memory leak in a test from the suite you can run: + +.. code-block:: sh + + PYTHONMALLOC=malloc valgrind --leak-check=yes --track-origins=yes --log-file=valgrind-log.txt python -m pytest + +Note that code execution under valgrind will take much longer than usual. While you can run valgrind against extensions compiled with any optimization level, it is suggested to have optimizations turned off from compiled extensions to reduce the amount of false positives. 
The ``--with-debugging-symbols`` flag passed during package setup will do this for you automatically. + +.. note:: + + For best results, you should run use a Python installation configured with Valgrind support (--with-valgrind) diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index e842c827b417f..abe2fc1409bfb 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -17,6 +17,7 @@ Development maintaining internals test_writing + debugging_extensions extending developer policies diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index c823ad01f10bf..9c070efa694d4 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -263,12 +263,12 @@ Jinja2 2.10 Conditional formatting with DataFra PyQt4 Clipboard I/O PyQt5 Clipboard I/O PyTables 3.5.1 HDF5-based reading / writing -SQLAlchemy 1.2.8 SQL support for databases other than sqlite +SQLAlchemy 1.3.0 SQL support for databases other than sqlite SciPy 1.12.0 Miscellaneous statistical functions xlsxwriter 1.0.2 Excel writing -blosc 1.15.0 Compression for HDF5 +blosc 1.17.0 Compression for HDF5 fsspec 0.7.4 Handling files aside from local and HTTP -fastparquet 0.3.2 Parquet reading / writing +fastparquet 0.4.0 Parquet reading / writing gcsfs 0.6.0 Google Cloud Storage access html5lib 1.0.1 HTML parser for read_html (see :ref:`note `) lxml 4.3.0 HTML parser for read_html (see :ref:`note `) @@ -278,7 +278,7 @@ openpyxl 2.6.0 Reading / writing for xlsx files pandas-gbq 0.12.0 Google Big Query access psycopg2 2.7 PostgreSQL engine for sqlalchemy pyarrow 0.15.0 Parquet, ORC, and feather reading / writing -pymysql 0.7.11 MySQL engine for sqlalchemy +pymysql 0.8.1 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index ffecaa222e1f9..8d38c12252df4 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2229,7 +2229,7 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. ipython:: python dft1 = pd.DataFrame({"a": [1, 0, 1], "b": [4, 5, 6], "c": [7, 8, 9]}) - dft1 = dft1.astype({"a": np.bool, "c": np.float64}) + dft1 = dft1.astype({"a": np.bool_, "c": np.float64}) dft1 dft1.dtypes diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 5a6f56388dee5..77791b4b7e491 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -1406,7 +1406,7 @@ Often it's useful to obtain the lower (or upper) triangular form of a correlatio df = pd.DataFrame(np.random.random(size=(100, 5))) corr_mat = df.corr() - mask = np.tril(np.ones_like(corr_mat, dtype=np.bool), k=-1) + mask = np.tril(np.ones_like(corr_mat, dtype=np.bool_), k=-1) corr_mat.where(mask) diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 42621c032416d..b474989f43731 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -199,8 +199,8 @@ in Python, so maybe we could minimize these by cythonizing the apply part. 
...: return s * dx ...: cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, ...: np.ndarray col_N): - ...: assert (col_a.dtype == np.float - ...: and col_b.dtype == np.float and col_N.dtype == np.int) + ...: assert (col_a.dtype == np.float_ + ...: and col_b.dtype == np.float_ and col_N.dtype == np.int_) ...: cdef Py_ssize_t i, n = len(col_N) ...: assert (len(col_a) == len(col_b) == n) ...: cdef np.ndarray[double] res = np.empty(n) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 817ea3445f995..0a11344d575f1 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -380,6 +380,8 @@ NA values in a boolean array propagate as ``False``: .. versionchanged:: 1.0.2 +.. ipython:: python + mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean") mask df1[mask] diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 965833c013c03..b6d686ee2551f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2430,16 +2430,14 @@ Read a URL with no options: .. ipython:: python - url = "https://www.fdic.gov/bank/individual/failed/banklist.html" + url = ( + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" + "pandas/tests/io/data/html/spam.html" + ) dfs = pd.read_html(url) dfs -.. note:: - - The data from the above URL changes every Monday so the resulting data above - and the data below may be slightly different. - -Read in the content of the file from the above URL and pass it to ``read_html`` +Read in the content of the "banklist.html" file and pass it to ``read_html`` as a string: .. ipython:: python @@ -2820,15 +2818,40 @@ parse HTML tables in the top-level pandas io function ``read_html``. Excel files ----------- -The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) -files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files -can be read using either ``xlrd`` or ``openpyxl``. Binary Excel (``.xlsb``) +The :func:`~pandas.read_excel` method can read Excel 2007+ (``.xlsx``) files +using the ``openpyxl`` Python module. Excel 2003 (``.xls``) files +can be read using ``xlrd``. Binary Excel (``.xlsb``) files can be read using ``pyxlsb``. The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. See the :ref:`cookbook` for some advanced strategies. +.. warning:: + + The `xlwt `__ package for writing old-style ``.xls`` + excel files is no longer maintained. + The `xlrd `__ package is now only for reading + old-style ``.xls`` files. + + Before pandas 1.2.0, the default argument ``engine=None`` to :func:`~pandas.read_excel` + would result in using the ``xlrd`` engine in many cases, including new + Excel 2007+ (``.xlsx``) files. + If `openpyxl `__ is installed, + many of these cases will now default to using the ``openpyxl`` engine. + See the :func:`read_excel` documentation for more details. + + Thus, it is strongly encouraged to install ``openpyxl`` to read Excel 2007+ + (``.xlsx``) files. + **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.** + This is no longer supported, switch to using ``openpyxl`` instead. + + Attempting to use the the ``xlwt`` engine will raise a ``FutureWarning`` + unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``. + While this option is now deprecated and will also raise a ``FutureWarning``, + it can be globally set and the warning suppressed. 
Users are recommended to + write ``.xlsx`` files using the ``openpyxl`` engine instead. + .. _io.excel_reader: Reading Excel files diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 310857faec436..c37255c765171 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,8 @@ Version 1.2 .. toctree:: :maxdepth: 2 + v1.2.2 + v1.2.1 v1.2.0 Version 1.1 diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index e054ac830ce41..64552b104c053 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -716,6 +716,19 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once df.apply(func, axis=1) +.. _whatsnew_110.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_110.api_breaking.testing.check_freq: + +Added ``check_freq`` argument to ``testing.assert_frame_equal`` and ``testing.assert_series_equal`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``check_freq`` argument was added to :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal` in pandas 1.1.0 and defaults to ``True``. :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal` now raise ``AssertionError`` if the indexes do not have the same frequency. Before pandas 1.1.0, the index frequency was not checked. + + Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index acf1b3bce8113..8a935f269e1a6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_120: -What's new in 1.2.0 (??) ------------------------- +What's new in 1.2.0 (December 26, 2020) +--------------------------------------- These are the changes in pandas 1.2.0. See :ref:`release` for a full changelog including other versions of pandas. @@ -10,23 +10,24 @@ including other versions of pandas. .. warning:: - The packages `xlrd `_ for reading excel - files and `xlwt `_ for - writing excel files are no longer maintained. These are the only engines in pandas - that support the xls format. + The `xlwt `_ package for writing old-style ``.xls`` + excel files is no longer maintained. + The `xlrd `_ package is now only for reading + old-style ``.xls`` files. - Previously, the default argument ``engine=None`` to ``pd.read_excel`` - would result in using the ``xlrd`` engine in many cases. If - `openpyxl `_ is installed, + Previously, the default argument ``engine=None`` to :func:`~pandas.read_excel` + would result in using the ``xlrd`` engine in many cases, including new + Excel 2007+ (``.xlsx``) files. + If `openpyxl `_ is installed, many of these cases will now default to using the ``openpyxl`` engine. - See the :func:`read_excel` documentation for more details. Attempting to read - ``.xls`` files or specifying ``engine="xlrd"`` to ``pd.read_excel`` will not - raise a warning. However users should be aware that ``xlrd`` is already - broken with certain package configurations, for example with Python 3.9 - when `defusedxml `_ is installed, and - is anticipated to be unusable in the future. - - Attempting to use the the ``xlwt`` engine will raise a ``FutureWarning`` + See the :func:`read_excel` documentation for more details. + + Thus, it is strongly encouraged to install ``openpyxl`` to read Excel 2007+ + (``.xlsx``) files. 
+ **Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.** + This is no longer supported, switch to using ``openpyxl`` instead. + + Attempting to use the ``xlwt`` engine will raise a ``FutureWarning`` unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``. While this option is now deprecated and will also raise a ``FutureWarning``, it can be globally set and the warning suppressed. Users are recommended to @@ -188,16 +189,16 @@ These are extension data types dedicated to floating point data that can hold th ``pd.NA`` missing value indicator (:issue:`32265`, :issue:`34307`). While the default float data type already supports missing values using ``np.nan``, -these new data types use ``pd.NA`` (and its corresponding behaviour) as the missing +these new data types use ``pd.NA`` (and its corresponding behavior) as the missing value indicator, in line with the already existing nullable :ref:`integer ` and :ref:`boolean ` data types. -One example where the behaviour of ``np.nan`` and ``pd.NA`` is different is +One example where the behavior of ``np.nan`` and ``pd.NA`` is different is comparison operations: .. ipython:: python - # the default numpy float64 dtype + # the default NumPy float64 dtype s1 = pd.Series([1.5, None]) s1 s1 > 1 @@ -209,7 +210,7 @@ comparison operations: s2 s2 > 1 -See the :ref:`missing_data.NA` doc section for more details on the behaviour +See the :ref:`missing_data.NA` doc section for more details on the behavior when using the ``pd.NA`` missing value indicator. As shown above, the dtype can be specified using the "Float64" or "Float32" @@ -226,7 +227,7 @@ give float results will now also use the nullable floating data types (:issue:`3 .. warning:: Experimental: the new floating data types are currently experimental, and their - behaviour or API may still change without warning. Especially the behaviour + behavior or API may still change without warning. Especially the behavior regarding NaN (distinct from NA missing values) is subject to change. .. _whatsnew_120.index_name_preservation: @@ -251,7 +252,7 @@ level-by-level basis. .. _whatsnew_120.groupby_ewm: -Groupby supports EWM operations directly +GroupBy supports EWM operations directly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`.DataFrameGroupBy` now supports exponentially weighted window operations directly (:issue:`16037`). 
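As a brief sketch of what the new GroupBy EWM support enables (the column names are invented for illustration):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "value": [1.0, 2.0, 3.0, 4.0]})
    # exponentially weighted mean computed per group, without a manual apply()
    df.groupby("key").ewm(com=1.0).mean()
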
@@ -281,16 +282,15 @@ Other enhancements - :meth:`.Styler.set_table_styles` now allows the direct styling of rows and columns and can be chained (:issue:`35607`) - :class:`.Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`.Rolling.mean` and :meth:`.Rolling.sum` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) -- :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) +- :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetime-like dtypes will now try to cast string arguments (list-like and scalar) to the matching datetime-like type (:issue:`36346`) - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) -- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`). +- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`) - Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`). + This change has been reverted in pandas 1.2.1, and the behaviour to not align DataFrames + is deprecated instead, see the :ref:`the 1.2.1 release notes `. - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) -- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use - nullable dtypes that use ``pd.NA`` as missing value indicator where possible - for the resulting DataFrame (default is False, and only applicable for - ``engine="pyarrow"``) (:issue:`31242`) +- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use nullable dtypes that use ``pd.NA`` as missing value indicator where possible for the resulting DataFrame (default is ``False``, and only applicable for ``engine="pyarrow"``) (:issue:`31242`) - Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`) - :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`) - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`) @@ -307,7 +307,8 @@ Other enhancements - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) -- When :func:`read_csv/sas/json` are called with ``chuncksize``/``iterator`` they can be used in a ``with`` statement as they return context-managers (:issue:`38225`) +- When :func:`read_csv`, 
:func:`read_sas` and :func:`read_json` are called with ``chunksize``/``iterator`` they can be used in a ``with`` statement as they return context-managers (:issue:`38225`) +- Augmented the list of named colors available for styling Excel exports, enabling all of CSS4 colors (:issue:`38247`) .. --------------------------------------------------------------------------- @@ -482,7 +483,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | sqlalchemy | 1.2.8 | X | +-----------------+-----------------+---------+ -| xarray | 0.12.0 | X | +| xarray | 0.12.3 | X | +-----------------+-----------------+---------+ | xlrd | 1.2.0 | X | +-----------------+-----------------+---------+ @@ -495,12 +496,12 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. -.. _whatsnew_200.api.other: +.. _whatsnew_120.api.other: Other API changes ^^^^^^^^^^^^^^^^^ -- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting a DataFrame on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily last in the list of duplicate counts. Instead, its position corresponds to the position in the original Series. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`) +- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for Datetime-like :class:`Index` subclasses. This will affect sort order when sorting a DataFrame on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily last in the list of duplicate counts. Instead, its position corresponds to the position in the original Series. When using :meth:`Index.sort_values` for Datetime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses (:issue:`35992`) - Passing an invalid ``fill_value`` to :meth:`Categorical.take`, :meth:`.DatetimeArray.take`, :meth:`TimedeltaArray.take`, or :meth:`PeriodArray.take` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) - Passing an invalid ``fill_value`` to :meth:`Series.shift` with a ``CategoricalDtype`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) - Passing an invalid value to :meth:`IntervalIndex.insert` or :meth:`CategoricalIndex.insert` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) @@ -517,18 +518,15 @@ Deprecations - Deprecated parameter ``dtype`` of method :meth:`~Index.copy` for all :class:`Index` subclasses. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) - Deprecated parameters ``levels`` and ``codes`` in :meth:`MultiIndex.copy`. 
Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) - Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) -- :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) +- :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`35224`) - The method :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`) -- Deprecated indexing :class:`DataFrame` rows with a single datetime-like string as ``df[string]`` - (given the ambiguity whether it is indexing the rows or selecting a column), use - ``df.loc[string]`` instead (:issue:`36179`) -- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`.DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) +- Deprecated indexing :class:`DataFrame` rows with a single datetime-like string as ``df[string]`` (given the ambiguity whether it is indexing the rows or selecting a column), use ``df.loc[string]`` instead (:issue:`36179`) - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) -- The default value of ``regex`` for :meth:`Series.str.replace` will change from ``True`` to ``False`` in a future release. In addition, single character regular expressions will *not* be treated as literal strings when ``regex=True`` is set. (:issue:`24804`) +- The default value of ``regex`` for :meth:`Series.str.replace` will change from ``True`` to ``False`` in a future release. In addition, single character regular expressions will *not* be treated as literal strings when ``regex=True`` is set (:issue:`24804`) - Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`) - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) - Using "outer" ufuncs on DataFrames to return 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`) -- Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`) +- Deprecated slice-indexing on tz-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`) - :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) - Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`) - :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. 
Use the named set methods instead (:issue:`36758`) @@ -540,6 +538,14 @@ Deprecations - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) +**Calling NumPy ufuncs on non-aligned DataFrames** + +Calling NumPy ufuncs on non-aligned DataFrames changed behaviour in pandas +1.2.0 (to align the inputs before calling the ufunc), but this change is +reverted in pandas 1.2.1. The behaviour to not align is now deprecated instead, +see the :ref:`the 1.2.1 release notes ` for +more details. + .. --------------------------------------------------------------------------- @@ -555,8 +561,7 @@ Performance improvements - :class:`.Styler` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) - Performance improvement in :func:`to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) - Performance improvement in setting values on an :class:`IntervalArray` (:issue:`36310`) -- The internal index method :meth:`~Index._shallow_copy` now makes the new index and original index share cached attributes, - avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) +- The internal index method :meth:`~Index._shallow_copy` now makes the new index and original index share cached attributes, avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) - Performance improvement in :meth:`.RollingGroupby.count` (:issue:`35625`) - Small performance decrease to :meth:`.Rolling.min` and :meth:`.Rolling.max` for fixed windows (:issue:`36567`) - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`) @@ -566,6 +571,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. :meth:`Index.value_counts`) - Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`) - Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`) +- :meth:`read_json` now avoids reading entire file into memory when chunksize is specified (:issue:`34548`) .. 
--------------------------------------------------------------------------- @@ -580,30 +586,30 @@ Categorical - Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) - Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`) - Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`) -- Bug in :meth:`Categorical.to_numpy` and ``np.array(categorical)`` with timezone-aware ``datetime64`` categories incorrectly dropping the timezone information instead of casting to object dtype (:issue:`38136`) +- Bug in :meth:`Categorical.to_numpy` and ``np.array(categorical)`` with tz-aware ``datetime64`` categories incorrectly dropping the time zone information instead of casting to object dtype (:issue:`38136`) -Datetimelike -^^^^^^^^^^^^ +Datetime-like +^^^^^^^^^^^^^ - Bug in :meth:`DataFrame.combine_first` that would convert datetime-like column on other :class:`DataFrame` to integer when the column is not present in original :class:`DataFrame` (:issue:`28481`) - Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) -- Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) +- Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g. months=12) (:issue:`34511`) - Bug in :meth:`.DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`.DatetimeIndex` (:issue:`35690`) - Bug in :meth:`.DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) - Bug in :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or :class:`Period` dtype placement of ``NaT`` values being inconsistent with NumPy (:issue:`36176`, :issue:`36254`) -- Inconsistency in :class:`.DatetimeArray`, :class:`.TimedeltaArray`, and :class:`.PeriodArray` method ``__setitem__`` casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) -- Bug in :meth:`.DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37356`) +- Inconsistency in :class:`.DatetimeArray`, :class:`.TimedeltaArray`, and :class:`.PeriodArray` method ``__setitem__`` casting arrays of strings to datetime-like scalars but not scalar strings (:issue:`36261`) +- Bug in :meth:`.DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched time zone (:issue:`37356`) - Bug in :class:`.DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) -- :class:`Timestamp` and :class:`.DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) +- :class:`Timestamp` and :class:`.DatetimeIndex` comparisons between tz-aware and tz-naive objects now follow the standard library ``datetime`` behavior, returning 
``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) - Bug in :meth:`.DatetimeIndex.equals` and :meth:`.TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) -- :meth:`Series.to_json`, :meth:`DataFrame.to_json`, and :meth:`read_json` now implement timezone parsing when orient structure is ``table`` (:issue:`35973`) -- :meth:`astype` now attempts to convert to ``datetime64[ns, tz]`` directly from ``object`` with inferred timezone from string (:issue:`35973`) +- :meth:`Series.to_json`, :meth:`DataFrame.to_json`, and :meth:`read_json` now implement time zone parsing when orient structure is ``table`` (:issue:`35973`) +- :meth:`astype` now attempts to convert to ``datetime64[ns, tz]`` directly from ``object`` with inferred time zone from string (:issue:`35973`) - Bug in :meth:`.TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`) -- Bug in :meth:`.DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`) +- Bug in :meth:`.DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched time zone (:issue:`37299`) - Bug in adding a :class:`.BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`) - Bug in :func:`to_datetime` with a read-only array incorrectly raising (:issue:`34857`) - Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`) -- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`35728`) +- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` failing to consider tz-aware and tz-naive datetimes as always different (:issue:`35728`) - Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`37528`) - Bug in :class:`Period` constructor now correctly handles nanoseconds in the ``value`` argument (:issue:`34621` and :issue:`17053`) @@ -617,7 +623,7 @@ Timedelta Timezones ^^^^^^^^^ -- Bug in :func:`date_range` was raising AmbiguousTimeError for valid input with ``ambiguous=False`` (:issue:`35297`) +- Bug in :func:`date_range` was raising ``AmbiguousTimeError`` for valid input with ``ambiguous=False`` (:issue:`35297`) - Bug in :meth:`Timestamp.replace` was losing fold information (:issue:`37610`) @@ -625,8 +631,8 @@ Numeric ^^^^^^^ - Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) - Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`) -- Bug in :meth:`Series.equals` where a ``ValueError`` was raised when numpy arrays were compared to scalars (:issue:`35267`) -- Bug in :class:`Series` where two Series each have a :class:`.DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) +- Bug in :meth:`Series.equals` where a ``ValueError`` was raised when NumPy arrays were compared to scalars (:issue:`35267`) +- Bug in :class:`Series` where two Series each have a :class:`.DatetimeIndex` with different time zones having those indexes incorrectly changed when performing arithmetic operations 
(:issue:`33671`) - Bug in :mod:`pandas.testing` module functions when used with ``check_exact=False`` on complex numeric types (:issue:`28235`) - Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`) - Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`) @@ -643,15 +649,13 @@ Numeric Conversion ^^^^^^^^^^ -- Bug in :meth:`DataFrame.to_dict` with ``orient='records'`` now returns python native datetime objects for datetimelike columns (:issue:`21256`) +- Bug in :meth:`DataFrame.to_dict` with ``orient='records'`` now returns python native datetime objects for datetime-like columns (:issue:`21256`) - Bug in :meth:`Series.astype` conversion from ``string`` to ``float`` raised in presence of ``pd.NA`` values (:issue:`37626`) -- Strings ^^^^^^^ - Bug in :meth:`Series.to_string`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` adding a leading space when ``index=False`` (:issue:`24980`) - Bug in :func:`to_numeric` raising a ``TypeError`` when attempting to convert a string dtype Series containing only numeric strings and ``NA`` (:issue:`37262`) -- Interval ^^^^^^^^ @@ -660,15 +664,14 @@ Interval - Bug in :meth:`IntervalIndex.take` with negative indices and ``fill_value=None`` (:issue:`37330`) - Bug in :meth:`IntervalIndex.putmask` with datetime-like dtype incorrectly casting to object dtype (:issue:`37968`) - Bug in :meth:`IntervalArray.astype` incorrectly dropping dtype information with a :class:`CategoricalDtype` object (:issue:`37984`) -- Indexing ^^^^^^^^ - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) -- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) -- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays are returned instead of ``intp``. (:issue:`36359`) -- Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. 
(:issue:`32334`) +- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order (:issue:`35584`) +- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays are returned instead of ``intp`` (:issue:`36359`) +- Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result (:issue:`32334`) - Bug in :meth:`DataFrame.reset_index` was incorrectly raising a ``ValueError`` for input with a :class:`MultiIndex` with missing values in a level with ``Categorical`` dtype (:issue:`24206`) - Bug in indexing with boolean masks on datetime-like values sometimes returning a view instead of a copy (:issue:`36210`) - Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc.__getitem__` with :class:`IntervalIndex` columns and a numeric indexer (:issue:`26490`) @@ -679,11 +682,11 @@ Indexing - Bug in :meth:`DataFrame.loc` returning empty result when indexer is a slice with negative step size (:issue:`38071`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when the index was of ``object`` dtype and the given numeric label was in the index (:issue:`26491`) - Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from a :class:`MultiIndex` (:issue:`27104`) -- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a list-like indexer containing NA values (:issue:`37722`) - Bug in :meth:`DataFrame.loc.__setitem__` expanding an empty :class:`DataFrame` with mixed dtypes (:issue:`37932`) - Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`) -- Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty DataFrame with ``tolerance`` not None or ``method="nearest"`` (:issue:`27315`) -- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`) +- Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty DataFrame with ``tolerance`` not ``None`` or ``method="nearest"`` (:issue:`27315`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using list-like indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`) - Bug on inserting a boolean label into a :class:`DataFrame` with a numeric :class:`Index` columns incorrectly casting to integer (:issue:`36319`) - Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligning objects in ``__setitem__`` (:issue:`22046`) - Bug in :meth:`MultiIndex.drop` does not raise if labels are partially found (:issue:`37820`) @@ -694,8 +697,8 @@ Indexing - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`) - Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`) - Bug in setting a new label on a 
:class:`DataFrame` or :class:`Series` with a :class:`CategoricalIndex` incorrectly raising ``TypeError`` when the new label is not among the index's categories (:issue:`38098`) -- Bug in :meth:`Series.loc` and :meth:`Series.iloc` raising ``ValueError`` when inserting a listlike ``np.array``, ``list`` or ``tuple`` in an ``object`` Series of equal length (:issue:`37748`, :issue:`37486`) -- Bug in :meth:`Series.loc` and :meth:`Series.iloc` setting all the values of an ``object`` Series with those of a listlike ``ExtensionArray`` instead of inserting it (:issue:`38271`) +- Bug in :meth:`Series.loc` and :meth:`Series.iloc` raising ``ValueError`` when inserting a list-like ``np.array``, ``list`` or ``tuple`` in an ``object`` Series of equal length (:issue:`37748`, :issue:`37486`) +- Bug in :meth:`Series.loc` and :meth:`Series.iloc` setting all the values of an ``object`` Series with those of a list-like ``ExtensionArray`` instead of inserting it (:issue:`38271`) Missing ^^^^^^^ @@ -703,7 +706,6 @@ Missing - Bug in :meth:`.SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) - Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`) - Bug in :meth:`Series.interpolate` where kwarg ``limit_area`` and ``limit_direction`` had no effect when using methods ``pad`` and ``backfill`` (:issue:`31048`) -- MultiIndex ^^^^^^^^^^ @@ -731,20 +733,23 @@ I/O - Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`) - Bug in :func:`read_table` and :func:`read_csv` when ``delim_whitespace=True`` and ``sep=default`` (:issue:`36583`) - Bug in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when used with ``lines=True`` and ``orient='records'`` the last line of the record is not appended with 'new line character' (:issue:`36888`) -- Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`) +- Bug in :meth:`read_parquet` with fixed offset time zones. 
String representation of time zones was not recognized (:issue:`35997`, :issue:`36004`) - Bug in :meth:`DataFrame.to_html`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` ignoring the ``na_rep`` argument when ``float_format`` was also specified (:issue:`9046`, :issue:`13828`) - Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`) - Bug in :class:`HDFStore` threw a ``TypeError`` when exporting an empty DataFrame with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) -- Bug in :class:`HDFStore` was dropping timezone information when exporting a Series with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) +- Bug in :class:`HDFStore` was dropping time zone information when exporting a Series with ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`20594`) - :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) - Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`) - :meth:`DataFrame.to_excel`, :meth:`Series.to_excel`, :meth:`DataFrame.to_markdown`, and :meth:`Series.to_markdown` now support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`) - Bug in :func:`read_fwf` with ``skip_blank_lines=True`` was not skipping blank lines (:issue:`37758`) - Parse missing values using :func:`read_json` with ``dtype=False`` to ``NaN`` instead of ``None`` (:issue:`28501`) -- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other :meth:``read_*`` functions (:issue:`37909`) +- :meth:`read_fwf` was inferring compression with ``compression=None`` which was not consistent with the other ``read_*`` functions (:issue:`37909`) - :meth:`DataFrame.to_html` was ignoring ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`) -- Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`) +- Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`, :issue:`37983`) +- :meth:`DataFrame.to_csv` was re-opening file-like handles that also implement ``os.PathLike`` (:issue:`38125`) +- Bug in the conversion of a sliced ``pyarrow.Table`` with missing values to a DataFrame (:issue:`38525`) +- Bug in :func:`read_sql_table` raising a ``sqlalchemy.exc.OperationalError`` when column names contained a percentage sign (:issue:`37517`) Period ^^^^^^ @@ -756,14 +761,18 @@ Plotting - Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`) - Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes caused a ``ValueError`` (:issue:`21003`) -- Bug in :meth:`DataFrame.plot.bar` and :meth:`Series.plot.bar` where ticks positions were assigned by value order instead of using the actual value for numeric or a smart ordering for string (:issue:`26186`, :issue:`11465`) +- Bug in :meth:`DataFrame.plot.bar` and :meth:`Series.plot.bar` where ticks positions were assigned by value order instead of using the actual value for numeric or a smart ordering for string (:issue:`26186`, :issue:`11465`). 
This fix has been reverted in pandas 1.2.1, see :doc:`v1.2.1` - Twinned axes were losing their tick labels which should only happen to all but the last row or column of 'externally' shared axes (:issue:`33819`) - Bug in :meth:`Series.plot` and :meth:`DataFrame.plot` was throwing a :exc:`ValueError` when the Series or DataFrame was indexed by a :class:`.TimedeltaIndex` with a fixed frequency and the x-axis lower limit was greater than the upper limit (:issue:`37454`) - Bug in :meth:`.DataFrameGroupBy.boxplot` when ``subplots=False`` would raise a ``KeyError`` (:issue:`16748`) -- Bug in :meth:`DataFrame.plot` and :meth:`Series.plot` was overwriting matplotlib's shared y axes behaviour when no ``sharey`` parameter was passed (:issue:`37942`) +- Bug in :meth:`DataFrame.plot` and :meth:`Series.plot` was overwriting matplotlib's shared y axes behavior when no ``sharey`` parameter was passed (:issue:`37942`) - Bug in :meth:`DataFrame.plot` was raising a ``TypeError`` with ``ExtensionDtype`` columns (:issue:`32073`) +Styler +^^^^^^ + +- Bug in :meth:`Styler.render` HTML was generated incorrectly because of formatting error in ``rowspan`` attribute, it now matches with w3 syntax (:issue:`38234`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -773,7 +782,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`) - Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`) - Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`) -- Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. (:issue:`9959`) +- Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`) - Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`.DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) @@ -783,14 +792,15 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.ffill` and :meth:`.DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`) - Bug in :meth:`.RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`) - Bug in :meth:`.DataFrameGroupBy.rolling` returning wrong values with partial centered window (:issue:`36040`) -- Bug in :meth:`.DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. 
Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`) +- Bug in :meth:`.DataFrameGroupBy.rolling` returned wrong values with time aware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`) - Bug in :meth:`.Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`) - Using :meth:`.Rolling.var` instead of :meth:`.Rolling.std` avoids numerical issues for :meth:`.Rolling.corr` when :meth:`.Rolling.var` is still within floating point precision while :meth:`.Rolling.std` is not (:issue:`31286`) - Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.Resampler.quantile` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`) - Bug in :meth:`.Rolling.median` and :meth:`.Rolling.quantile` returned wrong values for :class:`.BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`) - Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`) -- Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`) +- Bug in :meth:`.DataFrameGroupBy.head`, :meth:`DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`) - Bug in :meth:`.DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`) +- Bug in :meth:`.DataFrameGroupBy.resample` using ``.agg`` with sum produced different result than just calling ``.sum`` (:issue:`33548`) - Bug in :meth:`.DataFrameGroupBy.apply` dropped values on ``nan`` group when returning the same axes with the original frame (:issue:`38227`) - Bug in :meth:`.DataFrameGroupBy.quantile` couldn't handle with arraylike ``q`` when grouping by columns (:issue:`33795`) - Bug in :meth:`DataFrameGroupBy.rank` with ``datetime64tz`` or period dtype incorrectly casting results to those dtypes instead of returning ``float64`` dtype (:issue:`38187`) @@ -803,7 +813,7 @@ Reshaping - Bug in :func:`concat` and :class:`DataFrame` constructor where input index names are not preserved in some cases (:issue:`13475`) - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) - Bug in :meth:`DataFrame.stack` where an empty DataFrame.stack would raise an error (:issue:`36113`). Now returning an empty Series with empty MultiIndex. -- Bug in :meth:`Series.unstack`. Now a Series with single level of Index trying to unstack would raise a ValueError. (:issue:`36113`) +- Bug in :meth:`Series.unstack`. 
Now a Series with single level of Index trying to unstack would raise a ``ValueError`` (:issue:`36113`) - Bug in :meth:`DataFrame.agg` with ``func={'name':}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`) - Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was a dictionary (:issue:`35811`) - Bug in :meth:`DataFrame.pivot` did not preserve :class:`MultiIndex` level names for columns when rows and columns are both multiindexed (:issue:`36360`) @@ -812,25 +822,18 @@ Reshaping - Bug in :meth:`DataFrame.combine_first` caused wrong alignment with dtype ``string`` and one level of ``MultiIndex`` containing only ``NA`` (:issue:`37591`) - Fixed regression in :func:`merge` on merging :class:`.DatetimeIndex` with empty DataFrame (:issue:`36895`) - Bug in :meth:`DataFrame.apply` not setting index of return value when ``func`` return type is ``dict`` (:issue:`37544`) -- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of both inputs had a non-unique index (:issue:`36263`) - Bug in :meth:`DataFrame.merge` and :meth:`pandas.merge` returning inconsistent ordering in result for ``how=right`` and ``how=left`` (:issue:`35382`) - Bug in :func:`merge_ordered` couldn't handle list-like ``left_by`` or ``right_by`` (:issue:`35269`) - Bug in :func:`merge_ordered` returned wrong join result when length of ``left_by`` or ``right_by`` equals to the rows of ``left`` or ``right`` (:issue:`38166`) - Bug in :func:`merge_ordered` didn't raise when elements in ``left_by`` or ``right_by`` not exist in ``left`` columns or ``right`` columns (:issue:`38167`) - Bug in :func:`DataFrame.drop_duplicates` not validating bool dtype for ``ignore_index`` keyword (:issue:`38274`) -Sparse -^^^^^^ - -- -- - ExtensionArray ^^^^^^^^^^^^^^ - Fixed bug where :class:`DataFrame` column set to scalar extension type via a dict instantiation was considered an object type rather than the extension type (:issue:`35965`) - Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`28488`) -- Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`) +- Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning ``None`` (:issue:`36913`) - Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`) - Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`) - Fixed a bug where a ``TypeError`` was wrongly raised if a membership check was made on an ``ExtensionArray`` containing nan-like values (:issue:`37867`) @@ -850,14 +853,14 @@ Other - Bug in :meth:`Index.difference` failing to set the correct name on the returned :class:`Index` in some corner cases (:issue:`38268`) - Bug in :meth:`Index.union` behaving differently depending on whether operand is an :class:`Index` or other list-like (:issue:`36384`) - Bug in :meth:`Index.intersection` with non-matching numeric dtypes casting to ``object`` dtype instead of minimal common dtype (:issue:`38122`) -- Bug in :meth:`IntervalIndex.intersection` returning an incorrectly-typed :class:`Index` when empty (:issue:`38282`) +- Bug in :meth:`IntervalIndex.union` returning an 
incorrectly-typed :class:`Index` when empty (:issue:`38282`) - Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError`` rather than a bare ``Exception`` (:issue:`35744`) - Bug in ``dir`` where ``dir(obj)`` wouldn't show attributes defined on the instance for pandas objects (:issue:`37173`) - Bug in :meth:`Index.drop` raising ``InvalidIndexError`` when index has duplicates (:issue:`38051`) - Bug in :meth:`RangeIndex.difference` returning :class:`Int64Index` in some cases where it should return :class:`RangeIndex` (:issue:`38028`) - Fixed bug in :func:`assert_series_equal` when comparing a datetime-like array with an equivalent non extension dtype array (:issue:`37609`) - - +- Bug in :func:`.is_bool_dtype` would raise when passed a valid string such as ``"boolean"`` (:issue:`38386`) +- Fixed regression in logical operators raising ``ValueError`` when columns of :class:`DataFrame` are a :class:`CategoricalIndex` with unused categories (:issue:`38367`) .. --------------------------------------------------------------------------- @@ -866,4 +869,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.4..v1.2.0|HEAD +.. contributors:: v1.1.5..v1.2.0 diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst new file mode 100644 index 0000000000000..8bfe233ae50cc --- /dev/null +++ b/doc/source/whatsnew/v1.2.1.rst @@ -0,0 +1,147 @@ +.. _whatsnew_121: + +What's new in 1.2.1 (January 20, 2021) +-------------------------------------- + +These are the changes in pandas 1.2.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_121.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`~DataFrame.to_csv` that created corrupted zip files when there were more rows than ``chunksize`` (:issue:`38714`) +- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamReaderWriter`` in binary mode instead of in text mode (:issue:`39247`) +- Fixed regression in :meth:`read_csv` and other read functions where the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`) +- Fixed regression in :func:`read_excel` with non-rawbyte file handles (:issue:`38788`) +- Fixed regression in :meth:`DataFrame.to_stata` not removing the created file when an error occurred (:issue:`39202`) +- Fixed regression in ``DataFrame.__setitem__`` raising ``ValueError`` when expanding :class:`DataFrame` and new column is from type ``"0 - name"`` (:issue:`39010`) +- Fixed regression in setting with :meth:`DataFrame.loc` raising ``ValueError`` when :class:`DataFrame` has unsorted :class:`MultiIndex` columns and indexer is a scalar (:issue:`38601`) +- Fixed regression in setting with :meth:`DataFrame.loc` raising ``KeyError`` with :class:`MultiIndex` and list-like columns indexer enlarging :class:`DataFrame` (:issue:`39147`) +- Fixed regression in :meth:`~DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) +- Fixed regression in :meth:`.GroupBy.sem` where the presence of non-numeric columns would cause an error instead of being dropped (:issue:`38774`) +- Fixed regression in :meth:`.DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`) +- Fixed regression in :meth:`DataFrame.groupby` when aggregating an
``ExtensionDtype`` that could fail for non-numeric values (:issue:`38980`) +- Fixed regression in :meth:`.Rolling.skew` and :meth:`.Rolling.kurt` modifying the object inplace (:issue:`38908`) +- Fixed regression in :meth:`DataFrame.any` and :meth:`DataFrame.all` not returning a result for tz-aware ``datetime64`` columns (:issue:`38723`) +- Fixed regression in :meth:`DataFrame.apply` with ``axis=1`` using str accessor in apply function (:issue:`38979`) +- Fixed regression in :meth:`DataFrame.replace` raising ``ValueError`` when :class:`DataFrame` has dtype ``bytes`` (:issue:`38900`) +- Fixed regression in :meth:`Series.fillna` that raised ``RecursionError`` with ``datetime64[ns, UTC]`` dtype (:issue:`38851`) +- Fixed regression in comparisons between ``NaT`` and ``datetime.date`` objects incorrectly returning ``True`` (:issue:`39151`) +- Fixed regression in calling NumPy :func:`~numpy.ufunc.accumulate` ufuncs on DataFrames, e.g. ``np.maximum.accumulate(df)`` (:issue:`39259`) +- Fixed regression in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) +- Fixed regression that raised ``AttributeError`` with PyArrow versions [0.16.0, 1.0.0) (:issue:`38801`) +- Fixed regression in :func:`pandas.testing.assert_frame_equal` raising ``TypeError`` with ``check_like=True`` when :class:`Index` or columns have mixed dtype (:issue:`39168`) + +We have reverted a commit that resulted in several plotting related regressions in pandas 1.2.0 (:issue:`38969`, :issue:`38736`, :issue:`38865`, :issue:`38947` and :issue:`39126`). +As a result, bugs reported as fixed in pandas 1.2.0 related to inconsistent tick labeling in bar plots are again present (:issue:`26186` and :issue:`11465`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_121.ufunc_deprecation: + +Calling NumPy ufuncs on non-aligned DataFrames +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Before pandas 1.2.0, calling a NumPy ufunc on non-aligned DataFrames (or +DataFrame / Series combination) would ignore the indices, only match +the inputs by shape, and use the index/columns of the first DataFrame for +the result: + +.. code-block:: python + + >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1]) + >>> df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) + >>> df1 + a b + 0 1 3 + 1 2 4 + >>> df2 + a b + 1 1 3 + 2 2 4 + + >>> np.add(df1, df2) + a b + 0 2 6 + 1 4 8 + +This contrasts with how other pandas operations work, which first align +the inputs: + +.. code-block:: python + + >>> df1 + df2 + a b + 0 NaN NaN + 1 3.0 7.0 + 2 NaN NaN + +In pandas 1.2.0, we refactored how NumPy ufuncs are called on DataFrames, and +this started to align the inputs first (:issue:`39184`), as happens in other +pandas operations and as it happens for ufuncs called on Series objects. + +For pandas 1.2.1, we restored the previous behaviour to avoid a breaking +change, but the above example of ``np.add(df1, df2)`` with non-aligned inputs +will now raise a warning, and a future pandas 2.0 release will start +aligning the inputs first (:issue:`39184`). Calling a NumPy ufunc on Series +objects (e.g. ``np.add(s1, s2)``) already aligns and continues to do so. + +To avoid the warning and keep the current behaviour of ignoring the indices, +convert one of the arguments to a NumPy array: + +..
code-block:: python + + >>> np.add(df1, np.asarray(df2)) + a b + 0 2 6 + 1 4 8 + +To obtain the future behaviour and silence the warning, you can align manually +before passing the arguments to the ufunc: + +.. code-block:: python + + >>> df1, df2 = df1.align(df2) + >>> np.add(df1, df2) + a b + 0 NaN NaN + 1 3.0 7.0 + 2 NaN NaN + +.. --------------------------------------------------------------------------- + +.. _whatsnew_121.bug_fixes: + +Bug fixes +~~~~~~~~~ + +- Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings. This resulted in a regression in some cases as the default for ``float_precision`` was changed in pandas 1.2.0 (:issue:`38753`) +- Bug in :func:`read_csv` not closing an opened file handle when a ``csv.Error`` or ``UnicodeDecodeError`` occurred while initializing (:issue:`39024`) +- Bug in :func:`pandas.testing.assert_index_equal` raising ``TypeError`` with ``check_order=False`` when :class:`Index` has mixed dtype (:issue:`39168`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_121.other: + +Other +~~~~~ + +- The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`) +- Bumped minimum fastparquet version to 0.4.0 to avoid ``AttributeError`` from numba (:issue:`38344`) +- Bumped minimum pymysql version to 0.8.1 to avoid test failures (:issue:`38344`) +- Fixed build failure on MacOS 11 in Python 3.9.1 (:issue:`38766`) +- Added reference to backwards incompatible ``check_freq`` arg of :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal` in :ref:`pandas 1.1.0 what's new <whatsnew_110>` (:issue:`34050`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_121.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.2.0..v1.2.1 diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst new file mode 100644 index 0000000000000..240acf787f9c9 --- /dev/null +++ b/doc/source/whatsnew/v1.2.2.rst @@ -0,0 +1,52 @@ +.. _whatsnew_122: + +What's new in 1.2.2 (February ??, 2021) +--------------------------------------- + +These are the changes in pandas 1.2.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_122.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ + +- Fixed regression in :func:`read_excel` that caused it to raise ``AttributeError`` when checking version of older xlrd versions (:issue:`38955`) +- Fixed regression in :class:`DataFrame` constructor reordering elements when constructed from a datetime ndarray with dtype not ``"datetime64[ns]"`` (:issue:`39422`) +- Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`) +- Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`) +- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`) +- + +.. --------------------------------------------------------------------------- + +..
_whatsnew_122.bug_fixes: + +Bug fixes +~~~~~~~~~ + +- :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`) +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_122.other: + +Other +~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_122.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.2.1..v1.2.2|HEAD diff --git a/environment.yml b/environment.yml index b99b856187fb6..71d7e47894f9d 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: # required - - numpy>=1.16.5 + - numpy>=1.16.5, <1.20 # gh-39513 - python=3 - python-dateutil>=2.7.3 - pytz @@ -68,7 +68,7 @@ dependencies: # unused (required indirectly may be?) - ipywidgets - - nbformat + - nbformat=5.0.8 - notebook>=5.7.5 - pip diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 734b3d5c09cbf..4cddd49381a83 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -824,7 +824,7 @@ def rank_1d( if values.dtype != np.object_: values = values.astype('O') else: - values = np.asarray(in_arr) + values = np.asarray(in_arr).copy() keep_na = na_option == 'keep' @@ -835,11 +835,6 @@ def rank_1d( elif rank_t is int64_t: mask = values == NPY_NAT - # create copy in case of NPY_NAT - # values are mutated inplace - if mask.any(): - values = values.copy() - # double sort first by mask and then by values to ensure nan values are # either at the beginning or the end. mask/(~mask) controls padding at # tail or the head diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 88144330c1fe9..4ddbd6cf3ae60 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1733,7 +1733,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, // Process string of digits. num_digits = 0; n = 0; - while (isdigit_ascii(*p)) { + while (num_digits < max_digits && isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1754,10 +1754,13 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, } else if (exponent > 0) { number *= e[exponent]; } else if (exponent < -308) { // Subnormal - if (exponent < -616) // Prevent invalid array access. + if (exponent < -616) { // Prevent invalid array access. 
number = 0.; - number /= e[-308 - exponent]; - number /= e[308]; + } else { + number /= e[-308 - exponent]; + number /= e[308]; + } + } else { number /= e[-exponent]; } diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 3b52b4d499694..07507d5f5d2d4 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -224,7 +224,7 @@ def ensure_datetime64ns(arr: ndarray, copy: bool=True): ivalues = arr.view(np.int64).ravel("K") - result = np.empty(shape, dtype=DT64NS_DTYPE) + result = np.empty_like(arr, dtype=DT64NS_DTYPE) iresult = result.ravel("K").view(np.int64) if len(iresult) == 0: diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 561143f48e0ec..3a61de62daf39 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1,4 +1,7 @@ +import warnings + from cpython.datetime cimport ( + PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, PyDelta_Check, @@ -125,6 +128,21 @@ cdef class _NaT(datetime): return NotImplemented return result + elif PyDate_Check(other): + # GH#39151 don't defer to datetime.date object + if op == Py_EQ: + return False + if op == Py_NE: + return True + warnings.warn( + "Comparison of NaT with datetime.date is deprecated in " + "order to match the standard library behavior. " + "In a future version these will be considered non-comparable.", + FutureWarning, + stacklevel=1, + ) + return False + return NotImplemented def __add__(self, other): diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 54a09a6d2ede7..882674a5c5c92 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -523,7 +523,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, float64_t x = 0, xx = 0, xxx = 0 int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 int64_t s, e - ndarray[float64_t] output, mean_array + ndarray[float64_t] output, mean_array, values_copy bint is_monotonic_increasing_bounds minp = max(minp, 3) @@ -532,10 +532,11 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ) output = np.empty(N, dtype=float) min_val = np.nanmin(values) + values_copy = np.copy(values) with nogil: for i in range(0, N): - val = values[i] + val = values_copy[i] if notnan(val): nobs_mean += 1 sum_val += val @@ -544,7 +545,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, if min_val - mean_val > -1e5: mean_val = round(mean_val) for i in range(0, N): - values[i] = values[i] - mean_val + values_copy[i] = values_copy[i] - mean_val for i in range(0, N): @@ -556,7 +557,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): - val = values[j] + val = values_copy[j] add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, &compensation_xx_add, &compensation_xxx_add) @@ -566,13 +567,13 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, # and removed # calculate deletes for j in range(start[i - 1], s): - val = values[j] + val = values_copy[j] remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove, &compensation_xx_remove, &compensation_xxx_remove) # calculate adds for j in range(end[i - 1], e): - val = values[j] + val = values_copy[j] add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, &compensation_xx_add, &compensation_xxx_add) @@ -703,7 +704,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, float64_t compensation_x_remove = 0, 
compensation_x_add = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 - ndarray[float64_t] output + ndarray[float64_t] output, values_copy bint is_monotonic_increasing_bounds minp = max(minp, 4) @@ -711,11 +712,12 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, start, end ) output = np.empty(N, dtype=float) + values_copy = np.copy(values) min_val = np.nanmin(values) with nogil: for i in range(0, N): - val = values[i] + val = values_copy[i] if notnan(val): nobs_mean += 1 sum_val += val @@ -724,7 +726,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, if min_val - mean_val > -1e4: mean_val = round(mean_val) for i in range(0, N): - values[i] = values[i] - mean_val + values_copy[i] = values_copy[i] - mean_val for i in range(0, N): @@ -736,7 +738,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, &compensation_x_add, &compensation_xx_add, &compensation_xxx_add, &compensation_xxxx_add) @@ -746,13 +748,13 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, # and removed # calculate deletes for j in range(start[i - 1], s): - remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + remove_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, &compensation_x_remove, &compensation_xx_remove, &compensation_xxx_remove, &compensation_xxxx_remove) # calculate adds for j in range(end[i - 1], e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, &compensation_x_add, &compensation_xx_add, &compensation_xxx_add, &compensation_xxxx_add) diff --git a/pandas/_testing.py b/pandas/_testing.py index 469f5e1bed6ba..1df3351a7241c 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -6,11 +6,13 @@ import gzip import operator import os +from pathlib import Path +import random import re from shutil import rmtree import string import tempfile -from typing import Any, Callable, ContextManager, List, Optional, Type, Union, cast +from typing import IO, Any, Callable, ContextManager, List, Optional, Type, Union, cast import warnings import zipfile @@ -57,7 +59,7 @@ Series, bdate_range, ) -from pandas.core.algorithms import take_1d +from pandas.core.algorithms import safe_sort, take_1d from pandas.core.arrays import ( DatetimeArray, ExtensionArray, @@ -108,6 +110,8 @@ + BYTES_DTYPES ) +NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA] + # set testing_mode _testing_mode_warnings = (DeprecationWarning, ResourceWarning) @@ -576,66 +580,48 @@ def close(fignum=None): @contextmanager -def ensure_clean(filename=None, return_filelike=False, **kwargs): +def ensure_clean(filename=None, return_filelike: bool = False, **kwargs: Any): """ Gets a temporary path and agrees to remove on close. + This implementation does not use tempfile.mkstemp to avoid having a file handle. + If the code using the returned path wants to delete the file itself, windows + requires that no program has a file handle to it. + Parameters ---------- filename : str (optional) - if None, creates a temporary file which is then removed when out of - scope. if passed, creates temporary file with filename as ending. + suffix of the created file. return_filelike : bool (default False) if True, returns a file-like which is *always* cleaned. 
Necessary for savefig and other functions which want to append extensions. **kwargs - Additional keywords passed in for creating a temporary file. - :meth:`tempFile.TemporaryFile` is used when `return_filelike` is ``True``. - :meth:`tempfile.mkstemp` is used when `return_filelike` is ``False``. - Note that the `filename` parameter will be passed in as the `suffix` - argument to either function. + Additional keywords are passed to open(). - See Also - -------- - tempfile.TemporaryFile - tempfile.mkstemp """ - filename = filename or "" - fd = None - - kwargs["suffix"] = filename + folder = Path(tempfile.gettempdir()) - if return_filelike: - f = tempfile.TemporaryFile(**kwargs) - - try: - yield f - finally: - f.close() - else: - # Don't generate tempfile if using a path with directory specified. - if len(os.path.dirname(filename)): - raise ValueError("Can't pass a qualified name to ensure_clean()") + if filename is None: + filename = "" + filename = ( + "".join(random.choices(string.ascii_letters + string.digits, k=30)) + filename + ) + path = folder / filename - try: - fd, filename = tempfile.mkstemp(**kwargs) - except UnicodeEncodeError: - import pytest + path.touch() - pytest.skip("no unicode file names on this system") + handle_or_str: Union[str, IO] = str(path) + if return_filelike: + kwargs.setdefault("mode", "w+b") + handle_or_str = open(path, **kwargs) - try: - yield filename - finally: - try: - os.close(fd) - except OSError: - print(f"Couldn't close file descriptor: {fd} (file: {filename})") - try: - if os.path.exists(filename): - os.remove(filename) - except OSError as e: - print(f"Exception on removing file: {e}") + try: + yield handle_or_str + finally: + if not isinstance(handle_or_str, str): + handle_or_str.close() + if path.is_file(): + path.unlink() @contextmanager @@ -802,8 +788,8 @@ def _get_ilevel_values(index, level): # If order doesn't matter then sort the index entries if not check_order: - left = left.sort_values() - right = right.sort_values() + left = Index(safe_sort(left)) + right = Index(safe_sort(right)) # MultiIndex special comparison for little-friendly error messages if left.nlevels > 1: @@ -1332,6 +1318,8 @@ def assert_series_equal( .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + + .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. @@ -1414,14 +1402,26 @@ def assert_series_equal( assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + left_values = left._values + right_values = right._values # Only check exact if dtype is numeric - assert_numpy_array_equal( - left._values, - right._values, - check_dtype=check_dtype, - obj=str(obj), - index_values=np.asarray(left.index), - ) + if is_extension_array_dtype(left_values) and is_extension_array_dtype( + right_values + ): + assert_extension_array_equal( + left_values, + right_values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), + ) + else: + assert_numpy_array_equal( + left_values, + right_values, + check_dtype=check_dtype, + obj=str(obj), + index_values=np.asarray(left.index), + ) elif check_datetimelike_compat and ( needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype) ): @@ -1576,6 +1576,8 @@ def assert_frame_equal( (same as in columns) - same labels must be with the same data. 
check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + + .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. rtol : float, default 1e-5 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 533e67acfa2f4..4ed9df2c97fdb 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -46,7 +46,7 @@ } -def _get_version(module: types.ModuleType) -> str: +def get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) if version is None: # xlrd uses a capitalized attribute name @@ -112,7 +112,7 @@ def import_optional_dependency( minimum_version = VERSIONS.get(name) if minimum_version: - version = _get_version(module) + version = get_version(module) if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = ( diff --git a/pandas/conftest.py b/pandas/conftest.py index 2bac2ed198789..d84a72d4cc7a8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -266,7 +266,7 @@ def nselect_method(request): # ---------------------------------------------------------------- # Missing values & co. # ---------------------------------------------------------------- -@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), pd.NA], ids=str) +@pytest.fixture(params=tm.NULL_OBJECTS, ids=str) def nulls_fixture(request): """ Fixture for each null type in pandas. diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 67a0e02fc2d4d..51eeabc14c4c9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -321,7 +321,8 @@ def unique(values): Hash table-based unique. Uniques are returned in order of appearance. This does NOT sort. - Significantly faster than numpy.unique. Includes NA values. + Significantly faster than numpy.unique for long enough sequences. + Includes NA values. Parameters ---------- @@ -1981,7 +1982,13 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): elif is_integer_dtype(dtype): # We have to cast in order to be able to hold np.nan - dtype = np.float64 + + # int8, int16 are incompatible with float64, + # see https://github.com/cython/cython/issues/2646 + if arr.dtype.name in ["int8", "int16"]: + dtype = np.float32 + else: + dtype = np.float64 orig_ndim = arr.ndim if orig_ndim == 1: @@ -2193,24 +2200,3 @@ def _sort_tuples(values: np.ndarray[tuple]): arrays, _ = to_arrays(values, None) indexer = lexsort_indexer(arrays, orders=True) return values[indexer] - - -def make_duplicates_of_left_unique_in_right( - left: np.ndarray, right: np.ndarray -) -> np.ndarray: - """ - If left has duplicates, which are also duplicated in right, this duplicated values - are dropped from right, meaning that every duplicate value from left exists only - once in right. 
- - Parameters - ---------- - left: ndarray - right: ndarray - - Returns - ------- - Duplicates of left are unique in right - """ - left_duplicates = unique(left[duplicated(left)]) - return right[~(duplicated(right) & isin(right, left_duplicates))] diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 6b28f8f135769..cb185dcf78f63 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -149,6 +149,85 @@ def __rpow__(self, other): return self._arith_method(other, roperator.rpow) +# ----------------------------------------------------------------------------- +# Helpers to implement __array_ufunc__ + + +def _is_aligned(frame, other): + """ + Helper to check if a DataFrame is aligned with another DataFrame or Series. + """ + from pandas import DataFrame + + if isinstance(other, DataFrame): + return frame._indexed_same(other) + else: + # Series -> match index + return frame.columns.equals(other.index) + + +def _maybe_fallback(ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): + """ + In the future DataFrame, inputs to ufuncs will be aligned before applying + the ufunc, but for now we ignore the index but raise a warning if behaviour + would change in the future. + This helper detects the case where a warning is needed and then fallbacks + to applying the ufunc on arrays to avoid alignment. + + See https://github.com/pandas-dev/pandas/pull/39239 + """ + from pandas import DataFrame + from pandas.core.generic import NDFrame + + n_alignable = sum(isinstance(x, NDFrame) for x in inputs) + n_frames = sum(isinstance(x, DataFrame) for x in inputs) + + if n_alignable >= 2 and n_frames >= 1: + # if there are 2 alignable inputs (Series or DataFrame), of which at least 1 + # is a DataFrame -> we would have had no alignment before -> warn that this + # will align in the future + + # the first frame is what determines the output index/columns in pandas < 1.2 + first_frame = next(x for x in inputs if isinstance(x, DataFrame)) + + # check if the objects are aligned or not + non_aligned = sum( + not _is_aligned(first_frame, x) for x in inputs if isinstance(x, NDFrame) + ) + + # if at least one is not aligned -> warn and fallback to array behaviour + if non_aligned: + warnings.warn( + "Calling a ufunc on non-aligned DataFrames (or DataFrame/Series " + "combination). Currently, the indices are ignored and the result " + "takes the index/columns of the first DataFrame. In the future , " + "the DataFrames/Series will be aligned before applying the ufunc.\n" + "Convert one of the arguments to a NumPy array " + "(eg 'ufunc(df1, np.asarray(df2)') to keep the current behaviour, " + "or align manually (eg 'df1, df2 = df1.align(df2)') before passing to " + "the ufunc to obtain the future behaviour and silence this warning.", + FutureWarning, + stacklevel=4, + ) + + # keep the first dataframe of the inputs, other DataFrame/Series is + # converted to array for fallback behaviour + new_inputs = [] + for x in inputs: + if x is first_frame: + new_inputs.append(x) + elif isinstance(x, NDFrame): + new_inputs.append(np.asarray(x)) + else: + new_inputs.append(x) + + # call the ufunc on those transformed inputs + return getattr(ufunc, method)(*new_inputs, **kwargs) + + # signal that we didn't fallback / execute the ufunc yet + return NotImplemented + + def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): """ Compatibility with numpy ufuncs. 
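As context for the ``_is_aligned`` helper added above: for a Series operand, "aligned" simply means that the Series index equals the DataFrame columns. A minimal illustrative sketch of that check (not part of the patch; the frame and series below are made-up examples):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    ser_aligned = pd.Series({"a": 10, "b": 20})     # index matches df.columns
    ser_misaligned = pd.Series({"a": 10, "c": 20})  # "c" is not a column of df

    # Mirrors the Series branch of _is_aligned: frame.columns.equals(other.index)
    print(df.columns.equals(ser_aligned.index))     # True
    print(df.columns.equals(ser_misaligned.index))  # False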
@@ -162,6 +241,11 @@ def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any) cls = type(self) + # for backwards compatibility check and potentially fallback for non-aligned frames + result = _maybe_fallback(ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result + # for binary ops, use our custom dunder methods result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs) if result is not NotImplemented: @@ -274,8 +358,14 @@ def reconstruct(result): result = getattr(ufunc, method)(*inputs, **kwargs) else: # ufunc(dataframe) - mgr = inputs[0]._mgr - result = mgr.apply(getattr(ufunc, method)) + if method == "__call__": + # for np.<ufunc>(..) calls + mgr = inputs[0]._mgr + result = mgr.apply(getattr(ufunc, method)) + else: + # otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..)) + # Those can have an axis keyword and thus can't be called block-by-block + result = getattr(ufunc, method)(np.asarray(inputs[0]), **kwargs) if ufunc.nout > 1: # type: ignore[attr-defined] result = tuple(reconstruct(x) for x in result) diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index c89f5554d0715..959a13d9c107d 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -30,7 +30,7 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): bitmask = buflist[0] if bitmask is not None: mask = pyarrow.BooleanArray.from_buffers( - pyarrow.bool_(), len(arr), [None, bitmask] + pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset ) mask = np.asarray(mask) else: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index be9864731842d..2f2f8efc0c360 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1621,6 +1621,17 @@ def floor(self, freq, ambiguous="raise", nonexistent="raise"): def ceil(self, freq, ambiguous="raise", nonexistent="raise"): return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) + # -------------------------------------------------------------- + # Reductions + + def any(self, *, axis: Optional[int] = None, skipna: bool = True): + # GH#34479 discussion of desired behavior long-term + return nanops.nanany(self._ndarray, axis=axis, skipna=skipna, mask=self.isna()) + + def all(self, *, axis: Optional[int] = None, skipna: bool = True): + # GH#34479 discussion of desired behavior long-term + return nanops.nanall(self._ndarray, axis=axis, skipna=skipna, mask=self.isna()) + # -------------------------------------------------------------- # Frequency Methods diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 184fbc050036b..7d3806fe11bd2 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -29,13 +29,12 @@ except ImportError: pa = None else: - # our min supported version of pyarrow, 0.15.1, does not have a compute - # module - try: + # PyArrow backed StringArrays are available starting at 1.0.0, but this + # file is imported from even if pyarrow is < 1.0.0, before pyarrow.compute + # and its compute functions existed.
GH38801 + if LooseVersion(pa.__version__) >= "1.0.0": import pyarrow.compute as pc - except ImportError: - pass - else: + ARROW_CMP_FUNCS = { "eq": pc.equal, "ne": pc.not_equal, diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b4f6d587c6642..73cf20979a8ad 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1382,7 +1382,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: return False try: dtype = get_dtype(arr_or_dtype) - except TypeError: + except (TypeError, ValueError): return False if isinstance(arr_or_dtype, CategoricalDtype): @@ -1397,7 +1397,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: # guess this return arr_or_dtype.is_object and arr_or_dtype.inferred_type == "boolean" elif is_extension_array_dtype(arr_or_dtype): - return getattr(arr_or_dtype, "dtype", arr_or_dtype)._is_boolean + return getattr(dtype, "_is_boolean", False) return issubclass(dtype.type, np.bool_) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a1582a57e9a71..396108bab47b7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4572,20 +4572,23 @@ def shift( if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: # We will infer fill_value to match the closest column + # Use a column that we know is valid for our column's dtype GH#38434 + label = self.columns[0] + if periods > 0: result = self.iloc[:, :-periods] for col in range(min(ncols, abs(periods))): # TODO(EA2D): doing this in a loop unnecessary with 2D EAs # Define filler inside loop so we get a copy filler = self.iloc[:, 0].shift(len(self)) - result.insert(0, col, filler, allow_duplicates=True) + result.insert(0, label, filler, allow_duplicates=True) else: result = self.iloc[:, -periods:] for col in range(min(ncols, abs(periods))): # Define filler inside loop so we get a copy filler = self.iloc[:, -1].shift(len(self)) result.insert( - len(result.columns), col, filler, allow_duplicates=True + len(result.columns), label, filler, allow_duplicates=True ) result.columns = self.columns.copy() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b851c4d7d4931..8da3bae190f82 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -172,7 +172,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() - _hidden_attrs: FrozenSet[str] = frozenset(["get_values", "tshift"]) + _hidden_attrs: FrozenSet[str] = frozenset( + ["_AXIS_NAMES", "_AXIS_NUMBERS", "get_values", "tshift"] + ) _metadata: List[str] = [] _is_copy = None _mgr: BlockManager @@ -9689,12 +9691,6 @@ def truncate( # if we have a date index, convert to dates, otherwise # treat like a slice if ax._is_all_dates: - if is_object_dtype(ax.dtype): - warnings.warn( - "Treating object-dtype Index of date objects as DatetimeIndex " - "is deprecated, will be removed in a future version.", - FutureWarning, - ) from pandas.core.tools.datetimes import to_datetime before = to_datetime(before) @@ -10893,8 +10889,10 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): # [assignment] cls.all = all # type: ignore[assignment] + # error: Argument 1 to "doc" has incompatible type "Optional[str]"; expected + # "Union[str, Callable[..., Any]]" @doc( - NDFrame.mad, + NDFrame.mad.__doc__, # type: ignore[arg-type] desc="Return the mean absolute deviation of the values " "over the requested axis.", name1=name1, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 
23f0e178130be..1272ea7547209 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1606,12 +1606,11 @@ def sem(self, ddof: int = 1): if result.ndim == 1: result /= np.sqrt(self.count()) else: - cols = result.columns.get_indexer_for( - result.columns.difference(self.exclusions).unique() - ) - result.iloc[:, cols] = result.iloc[:, cols] / np.sqrt( - self.count().iloc[:, cols] - ) + cols = result.columns.difference(self.exclusions).unique() + counts = self.count() + result_ilocs = result.columns.get_indexer_for(cols) + count_ilocs = counts.columns.get_indexer_for(cols) + result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs]) return result @final diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d814a7cee436e..17584ffc5b1bf 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -287,6 +287,7 @@ def __init__( self.indexer = None self.binner = None self._grouper = None + self._indexer = None self.dropna = dropna @final @@ -341,15 +342,24 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # Keep self.grouper value before overriding if self._grouper is None: self._grouper = self.grouper + self._indexer = self.indexer # the key must be a valid info item if self.key is not None: key = self.key # The 'on' is already defined if getattr(self.grouper, "name", None) == key and isinstance(obj, Series): - # pandas\core\groupby\grouper.py:348: error: Item "None" of - # "Optional[Any]" has no attribute "take" [union-attr] - ax = self._grouper.take(obj.index) # type: ignore[union-attr] + # Sometimes self._grouper will have been resorted while + # obj has not. In this case there is a mismatch when we + # call self._grouper.take(obj.index) so we need to undo the sorting + # before we call _grouper.take. 
+ assert self._grouper is not None + if self._indexer is not None: + reverse_indexer = self._indexer.argsort() + unsorted_ax = self._grouper.take(reverse_indexer) + ax = unsorted_ax.take(obj.index) + else: + ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") @@ -572,13 +582,8 @@ def indices(self): if isinstance(self.grouper, ops.BaseGrouper): return self.grouper.indices - # Return a dictionary of {group label: [indices belonging to the group label]} - # respecting whether sort was specified - codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) - return { - category: np.flatnonzero(codes == i) - for i, category in enumerate(Index(uniques)) - } + values = Categorical(self.grouper) + return values._reverse_indexer() @property def codes(self) -> np.ndarray: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7724e3930f7df..b86d54024c62d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -53,6 +53,7 @@ is_timedelta64_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.generic import ABCCategoricalIndex from pandas.core.dtypes.missing import isna, maybe_fill import pandas.core.algorithms as algorithms @@ -244,6 +245,11 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): @cache_readonly def indices(self): """ dict {group name -> group indices} """ + if len(self.groupings) == 1 and isinstance( + self.result_index, ABCCategoricalIndex + ): + # This shows unused categories in indices GH#38642 + return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] keys = [ping.group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) @@ -537,7 +543,9 @@ def _ea_wrap_cython_operation( result = type(orig_values)._from_sequence(res_values) return result - raise NotImplementedError(values.dtype) + raise NotImplementedError( + f"function is not implemented for this dtype: {values.dtype}" + ) @final def _cython_operation( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 46a1646727bae..11d191597d61e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4455,15 +4455,16 @@ def equals(self, other: object) -> bool: if not isinstance(other, Index): return False - # If other is a subclass of self and defines its own equals method, we - # dispatch to the subclass method. For instance for a MultiIndex, - # a d-level MultiIndex can equal d-tuple Index. 
- # Note: All EA-backed Index subclasses override equals - if ( - isinstance(other, type(self)) - and type(other) is not type(self) - and other.equals is not self.equals - ): + if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): + # if other is not object, use other's logic for coercion + return other.equals(self) + + if isinstance(other, ABCMultiIndex): + # d-level MultiIndex can equal d-tuple Index + return other.equals(self) + + if is_extension_array_dtype(other.dtype): + # All EA-backed Index subclasses override equals return other.equals(self) return array_equivalent(self._values, other._values) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a9d93f473e0e1..2ef9e4a028793 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1286,16 +1286,18 @@ def _format_native_types(self, na_rep="nan", **kwargs): # go through the levels and format them for level, level_codes in zip(self.levels, self.codes): - level = level._format_native_types(na_rep=na_rep, **kwargs) + level_strs = level._format_native_types(na_rep=na_rep, **kwargs) # add nan values, if there are any mask = level_codes == -1 if mask.any(): - nan_index = len(level) - level = np.append(level, na_rep) + nan_index = len(level_strs) + # numpy 1.21 deprecated implicit string casting + level_strs = level_strs.astype(str) + level_strs = np.append(level_strs, na_rep) assert not level_codes.flags.writeable # i.e. copy is needed level_codes = level_codes.copy() # make writeable level_codes[mask] = nan_index - new_levels.append(level) + new_levels.append(level_strs) new_codes.append(level_codes) if len(new_levels) == 1: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e7cf8cae28b88..94ddbbdf589d4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.common import ( is_array_like, + is_bool_dtype, is_hashable, is_integer, is_iterator, @@ -659,9 +660,9 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): if self.ndim != 2: return - if isinstance(key, tuple) and not isinstance(self.obj.index, ABCMultiIndex): + if isinstance(key, tuple) and len(key) > 1: # key may be a tuple if we are .loc - # if index is not a MultiIndex, set key to column part + # if length of key is > 1 set key to column part key = key[column_axis] axis = column_axis @@ -1925,12 +1926,14 @@ def _ensure_iterable_column_indexer(self, column_indexer): """ Ensure that our column indexer is something that can be iterated over. 
""" - # Ensure we have something we can iterate over if is_integer(column_indexer): ilocs = [column_indexer] elif isinstance(column_indexer, slice): - ri = Index(range(len(self.obj.columns))) - ilocs = ri[column_indexer] + ilocs = np.arange(len(self.obj.columns))[column_indexer] + elif isinstance(column_indexer, np.ndarray) and is_bool_dtype( + column_indexer.dtype + ): + ilocs = np.arange(len(column_indexer))[column_indexer] else: ilocs = column_indexer return ilocs diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fe07823a80783..32aade97c8736 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2148,7 +2148,13 @@ def _can_hold_element(self, element: Any) -> bool: class DatetimeLikeBlockMixin(Block): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" - _can_hold_na = True + @property + def _holder(self): + return DatetimeArray + + @property + def fill_value(self): + return np.datetime64("NaT", "ns") def get_values(self, dtype=None): """ @@ -2216,8 +2222,10 @@ def to_native_types(self, na_rep="NaT", **kwargs): class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () is_datetime = True - _holder = DatetimeArray - fill_value = np.datetime64("NaT", "ns") + + @property + def _can_hold_na(self): + return True def _maybe_coerce_values(self, values): """ @@ -2308,17 +2316,17 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): is_extension = True internal_values = Block.internal_values - - _holder = DatetimeBlock._holder _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types diff = DatetimeBlock.diff - fillna = DatetimeBlock.fillna # i.e. Block.fillna - fill_value = DatetimeBlock.fill_value - _can_hold_na = DatetimeBlock._can_hold_na + fill_value = np.datetime64("NaT", "ns") array_values = ExtensionBlock.array_values + @property + def _holder(self): + return DatetimeArray + def _maybe_coerce_values(self, values): """ Input validation for values passed to __init__. Ensure that @@ -2383,6 +2391,17 @@ def external_values(self): # return an object-dtype ndarray of Timestamps. return np.asarray(self.values.astype("datetime64[ns]", copy=False)) + def fillna(self, value, limit=None, inplace=False, downcast=None): + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. 
+ if self._can_hold_element(value): + return super().fillna(value, limit, inplace, downcast) + + # different timezones, or a non-tz + return self.astype(object).fillna( + value, limit=limit, inplace=inplace, downcast=downcast + ) + def quantile(self, qs, interpolation="linear", axis=0): naive = self.values.view("M8[ns]") @@ -2419,9 +2438,11 @@ def _check_ndim(self, values, ndim): return ndim -class TimeDeltaBlock(DatetimeLikeBlockMixin): +class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): __slots__ = () is_timedelta = True + _can_hold_na = True + is_numeric = False fill_value = np.timedelta64("NaT", "ns") def _maybe_coerce_values(self, values): @@ -2482,7 +2503,7 @@ class ObjectBlock(Block): _can_hold_na = True def _maybe_coerce_values(self, values): - if issubclass(values.dtype.type, str): + if issubclass(values.dtype.type, (str, bytes)): values = np.array(values, dtype=object) return values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 93ab207d8ce12..9aebacd740526 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1916,7 +1916,7 @@ def _consolidate(blocks): merged_blocks = _merge_blocks( list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate ) - new_blocks.extend(merged_blocks) + new_blocks = extend_blocks(merged_blocks, new_blocks) return new_blocks diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index d8b5dba424cbf..7b14a5c636abe 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -309,11 +309,11 @@ def should_reindex_frame_op( if fill_value is None and level is None and axis is default_axis: # TODO: any other cases we should handle here? - cols = left.columns.intersection(right.columns) # Intersection is always unique so we have to check the unique columns left_uniques = left.columns.unique() right_uniques = right.columns.unique() + cols = left_uniques.intersection(right_uniques) if len(cols) and not (cols.equals(left_uniques) and cols.equals(right_uniques)): # TODO: is there a shortcut available when len(cols) == 0? 
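# Illustration, not part of the patch: a user-level sketch of the
# DatetimeTZBlock.fillna branch added above -- a fill value in a different
# timezone cannot stay in a single tz-aware dtype, so the result is expected to
# fall back to object dtype.
import pandas as pd

ser = pd.Series(pd.to_datetime(["2021-01-01", None]).tz_localize("UTC"))
out = ser.fillna(pd.Timestamp("2021-01-02", tz="US/Eastern"))
print(out.dtype)  # expected: object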
return True diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 4a2629daf63d7..70668ac64625c 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -23,7 +23,6 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna -import pandas.core.algorithms as algos from pandas.core.arrays.categorical import ( factorize_from_iterable, factorize_from_iterables, @@ -514,14 +513,7 @@ def get_result(self): # 1-ax to convert BlockManager axis to DataFrame axis obj_labels = obj.axes[1 - ax] if not new_labels.equals(obj_labels): - # We have to remove the duplicates from obj_labels - # in new labels to make them unique, otherwise we would - # duplicate or duplicates again - if not obj_labels.is_unique: - new_labels = algos.make_duplicates_of_left_unique_in_right( - np.asarray(obj_labels), np.asarray(new_labels) - ) - indexers[ax] = obj_labels.reindex(new_labels)[1] + indexers[ax] = obj_labels.get_indexer(new_labels) mgrs_indexers.append((obj._mgr, indexers)) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2c6cdb846221f..95fdb21824234 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -859,7 +859,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): mask_right = right_indexer == -1 if mask_left.all(): key_col = rvals - elif mask_right.all(): + elif right_indexer is not None and mask_right.all(): key_col = lvals else: key_col = Index(lvals).where(~mask_left, rvals) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0e9476285c258..b4e8696ad9e13 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -430,13 +430,6 @@ def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: # need to set here because we changed the index if fastpath: self._mgr.set_axis(axis, labels) - warnings.warn( - "Automatically casting object-dtype Index of datetimes to " - "DatetimeIndex is deprecated and will be removed in a " - "future version. 
Explicitly cast to DatetimeIndex instead.", - FutureWarning, - stacklevel=3, - ) except (tslibs.OutOfBoundsDatetime, ValueError): # labels may exceeds datetime bounds, # or not be a DatetimeIndex @@ -4629,6 +4622,15 @@ def isin(self, values) -> "Series": 4 True 5 False Name: animal, dtype: bool + + Strings and integers are distinct and are therefore not comparable: + + >>> pd.Series([1]).isin(['1']) + 0 False + dtype: bool + >>> pd.Series([1.1]).isin(['1.1']) + 0 False + dtype: bool """ result = algorithms.isin(self._values, values) return self._constructor(result, index=self.index).__finalize__( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 2713b76189157..ca12012ec135f 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -109,7 +109,7 @@ def wrapper(self, *args, **kwargs): def _map_and_wrap(name, docstring): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): - result = getattr(self._array, f"_str_{name}")() + result = getattr(self._data.array, f"_str_{name}")() return self._wrap_result(result) wrapper.__doc__ = docstring @@ -154,8 +154,7 @@ def __init__(self, data): self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) self._is_string = isinstance(data.dtype, StringDtype) - array = data.array - self._array = array + self._data = data self._index = self._name = None if isinstance(data, ABCSeries): @@ -219,7 +218,7 @@ def _validate(data): return inferred_dtype def __getitem__(self, key): - result = self._array._str_getitem(key) + result = self._data.array._str_getitem(key) return self._wrap_result(result) def __iter__(self): @@ -744,13 +743,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) @forbid_nonstring_types(["bytes"]) def split(self, pat=None, n=-1, expand=False): - result = self._array._str_split(pat, n, expand) + result = self._data.array._str_split(pat, n, expand) return self._wrap_result(result, returns_string=expand, expand=expand) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, n=-1, expand=False): - result = self._array._str_rsplit(pat, n=n) + result = self._data.array._str_rsplit(pat, n=n) return self._wrap_result(result, expand=expand, returns_string=expand) _shared_docs[ @@ -846,7 +845,7 @@ def rsplit(self, pat=None, n=-1, expand=False): ) @forbid_nonstring_types(["bytes"]) def partition(self, sep=" ", expand=True): - result = self._array._str_partition(sep, expand) + result = self._data.array._str_partition(sep, expand) return self._wrap_result(result, expand=expand, returns_string=expand) @Appender( @@ -860,7 +859,7 @@ def partition(self, sep=" ", expand=True): ) @forbid_nonstring_types(["bytes"]) def rpartition(self, sep=" ", expand=True): - result = self._array._str_rpartition(sep, expand) + result = self._data.array._str_rpartition(sep, expand) return self._wrap_result(result, expand=expand, returns_string=expand) def get(self, i): @@ -914,7 +913,7 @@ def get(self, i): 5 None dtype: object """ - result = self._array._str_get(i) + result = self._data.array._str_get(i) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -980,7 +979,7 @@ def join(self, sep): 4 NaN dtype: object """ - result = self._array._str_join(sep) + result = self._data.array._str_join(sep) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1108,7 
+1107,7 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True): 4 False dtype: bool """ - result = self._array._str_contains(pat, case, flags, na, regex) + result = self._data.array._str_contains(pat, case, flags, na, regex) return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -1140,7 +1139,7 @@ def match(self, pat, case=True, flags=0, na=None): re.match. extract : Extract matched groups. """ - result = self._array._str_match(pat, case=case, flags=flags, na=na) + result = self._data.array._str_match(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -1173,7 +1172,7 @@ def fullmatch(self, pat, case=True, flags=0, na=None): matches the regular expression. extract : Extract matched groups. """ - result = self._array._str_fullmatch(pat, case=case, flags=flags, na=na) + result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -1309,7 +1308,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): ) warnings.warn(msg, FutureWarning, stacklevel=3) regex = True - result = self._array._str_replace( + result = self._data.array._str_replace( pat, repl, n=n, case=case, flags=flags, regex=regex ) return self._wrap_result(result) @@ -1355,7 +1354,7 @@ def repeat(self, repeats): 2 ccc dtype: object """ - result = self._array._str_repeat(repeats) + result = self._data.array._str_repeat(repeats) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1423,7 +1422,7 @@ def pad(self, width, side="left", fillchar=" "): msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - result = self._array._str_pad(width, side=side, fillchar=fillchar) + result = self._data.array._str_pad(width, side=side, fillchar=fillchar) return self._wrap_result(result) _shared_docs[ @@ -1597,7 +1596,7 @@ def slice(self, start=None, stop=None, step=None): 2 cm dtype: object """ - result = self._array._str_slice(start, stop, step) + result = self._data.array._str_slice(start, stop, step) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1673,7 +1672,7 @@ def slice_replace(self, start=None, stop=None, repl=None): 4 aXde dtype: object """ - result = self._array._str_slice_replace(start, stop, repl) + result = self._data.array._str_slice_replace(start, stop, repl) return self._wrap_result(result) def decode(self, encoding, errors="strict"): @@ -1699,7 +1698,7 @@ def decode(self, encoding, errors="strict"): else: decoder = codecs.getdecoder(encoding) f = lambda x: decoder(x, errors)[0] - arr = self._array + arr = self._data.array # assert isinstance(arr, (StringArray,)) result = arr._str_map(f) return self._wrap_result(result) @@ -1720,7 +1719,7 @@ def encode(self, encoding, errors="strict"): ------- encoded : Series/Index of objects """ - result = self._array._str_encode(encoding, errors) + result = self._data.array._str_encode(encoding, errors) return self._wrap_result(result, returns_string=False) _shared_docs[ @@ -1798,7 +1797,7 @@ def encode(self, encoding, errors="strict"): ) @forbid_nonstring_types(["bytes"]) def strip(self, to_strip=None): - result = self._array._str_strip(to_strip) + result = self._data.array._str_strip(to_strip) return self._wrap_result(result) @Appender( @@ -1807,7 +1806,7 @@ def strip(self, to_strip=None): ) @forbid_nonstring_types(["bytes"]) 
def lstrip(self, to_strip=None): - result = self._array._str_lstrip(to_strip) + result = self._data.array._str_lstrip(to_strip) return self._wrap_result(result) @Appender( @@ -1816,7 +1815,7 @@ def lstrip(self, to_strip=None): ) @forbid_nonstring_types(["bytes"]) def rstrip(self, to_strip=None): - result = self._array._str_rstrip(to_strip) + result = self._data.array._str_rstrip(to_strip) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1875,7 +1874,7 @@ def wrap(self, width, **kwargs): 1 another line\nto be\nwrapped dtype: object """ - result = self._array._str_wrap(width, **kwargs) + result = self._data.array._str_wrap(width, **kwargs) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1917,7 +1916,7 @@ def get_dummies(self, sep="|"): """ # we need to cast to Series of strings as only that has all # methods available for making the dummies... - result, name = self._array._str_get_dummies(sep) + result, name = self._data.array._str_get_dummies(sep) return self._wrap_result( result, name=name, @@ -1944,7 +1943,7 @@ def translate(self, table): ------- Series or Index """ - result = self._array._str_translate(table) + result = self._data.array._str_translate(table) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -2012,7 +2011,7 @@ def count(self, pat, flags=0): >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') """ - result = self._array._str_count(pat, flags) + result = self._data.array._str_count(pat, flags) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2069,7 +2068,7 @@ def startswith(self, pat, na=None): 3 False dtype: bool """ - result = self._array._str_startswith(pat, na=na) + result = self._data.array._str_startswith(pat, na=na) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2126,7 +2125,7 @@ def endswith(self, pat, na=None): 3 False dtype: bool """ - result = self._array._str_endswith(pat, na=na) + result = self._data.array._str_endswith(pat, na=na) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2219,7 +2218,7 @@ def findall(self, pat, flags=0): 2 [b, b] dtype: object """ - result = self._array._str_findall(pat, flags) + result = self._data.array._str_findall(pat, flags) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2426,7 +2425,7 @@ def find(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str_find(sub, start, end) + result = self._data.array._str_find(sub, start, end) return self._wrap_result(result, returns_string=False) @Appender( @@ -2443,7 +2442,7 @@ def rfind(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str_rfind(sub, start=start, end=end) + result = self._data.array._str_rfind(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2463,7 +2462,7 @@ def normalize(self, form): ------- normalized : Series/Index of objects """ - result = self._array._str_normalize(form) + result = self._data.array._str_normalize(form) return self._wrap_result(result) _shared_docs[ @@ -2510,7 +2509,7 @@ def index(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str_index(sub, 
start=start, end=end) + result = self._data.array._str_index(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) @Appender( @@ -2528,7 +2527,7 @@ def rindex(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str_rindex(sub, start=start, end=end) + result = self._data.array._str_rindex(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) def len(self): @@ -2577,7 +2576,7 @@ def len(self): 5 3.0 dtype: float64 """ - result = self._array._str_len() + result = self._data.array._str_len() return self._wrap_result(result, returns_string=False) _shared_docs[ @@ -2677,37 +2676,37 @@ def len(self): @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) @forbid_nonstring_types(["bytes"]) def lower(self): - result = self._array._str_lower() + result = self._data.array._str_lower() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) @forbid_nonstring_types(["bytes"]) def upper(self): - result = self._array._str_upper() + result = self._data.array._str_upper() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["title"]) @forbid_nonstring_types(["bytes"]) def title(self): - result = self._array._str_title() + result = self._data.array._str_title() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) @forbid_nonstring_types(["bytes"]) def capitalize(self): - result = self._array._str_capitalize() + result = self._data.array._str_capitalize() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) @forbid_nonstring_types(["bytes"]) def swapcase(self): - result = self._array._str_swapcase() + result = self._data.array._str_swapcase() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) @forbid_nonstring_types(["bytes"]) def casefold(self): - result = self._array._str_casefold() + result = self._data.array._str_casefold() return self._wrap_result(result) _shared_docs[ diff --git a/pandas/io/common.py b/pandas/io/common.py index 9fede5180e727..be353fefdd1ef 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,13 +1,14 @@ """Common IO api utilities""" import bz2 +import codecs from collections import abc import dataclasses import gzip -from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper +from io import BufferedIOBase, BytesIO, RawIOBase, StringIO, TextIOWrapper import mmap import os -from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast +from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union, cast from urllib.parse import ( urljoin, urlparse as parse_url, @@ -152,6 +153,7 @@ def validate_header_arg(header) -> None: def stringify_path( filepath_or_buffer: FilePathOrBuffer[AnyStr], + convert_file_like: bool = False, ) -> FileOrBuffer[AnyStr]: """ Attempt to convert a path-like object to a string. @@ -169,12 +171,15 @@ def stringify_path( Objects supporting the fspath protocol (python 3.6+) are coerced according to its __fspath__ method. - For backwards compatibility with older pythons, pathlib.Path and - py.path objects are specially coerced. - Any other object is passed through unchanged, which includes bytes, strings, buffers, or anything else that's not even path-like. 
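# Illustration, not part of the patch: the behaviour described in the
# stringify_path docstring above, with made-up inputs -- path-like objects are
# coerced to str, while an already-open file-like object is returned unchanged
# unless convert_file_like=True is passed (used only for compression inference).
import io
from pathlib import Path
from pandas.io.common import stringify_path

print(stringify_path(Path("data") / "table.csv"))  # a plain str (separator is platform-dependent)
buf = io.BytesIO(b"a,b\n1,2\n")
print(stringify_path(buf) is buf)                   # expected: True (GH 38125)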
""" + if not convert_file_like and is_file_like(filepath_or_buffer): + # GH 38125: some fsspec objects implement os.PathLike but have already opened a + # file. This prevents opening the file a second time. infer_compression calls + # this function with convert_file_like=True to infer the compression. + return cast(FileOrBuffer[AnyStr], filepath_or_buffer) + if isinstance(filepath_or_buffer, os.PathLike): filepath_or_buffer = filepath_or_buffer.__fspath__() return _expand_user(filepath_or_buffer) @@ -462,7 +467,7 @@ def infer_compression( # Infer compression if compression == "infer": # Convert all path types (e.g. pathlib.Path) to strings - filepath_or_buffer = stringify_path(filepath_or_buffer) + filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True) if not isinstance(filepath_or_buffer, str): # Cannot infer compression of a buffer, assume no compression return None @@ -543,8 +548,7 @@ def get_handle( Returns the dataclass IOHandles """ # Windows does not default to utf-8. Set to utf-8 for a consistent behavior - if encoding is None: - encoding = "utf-8" + encoding_passed, encoding = encoding, encoding or "utf-8" # read_csv does not know whether the buffer is opened in binary/text mode if _is_binary_mode(path_or_buf, mode) and "b" not in mode: @@ -631,6 +635,9 @@ def get_handle( # Check whether the filename is to be opened in binary mode. # Binary mode does not support 'encoding' and 'newline'. if ioargs.encoding and "b" not in ioargs.mode: + if errors is None and encoding_passed is None: + # ignore errors when no encoding is specified + errors = "replace" # Encoding handle = open( handle, @@ -703,17 +710,36 @@ def __init__( archive_name: Optional[str] = None, **kwargs, ): - if mode in ["wb", "rb"]: - mode = mode.replace("b", "") + mode = mode.replace("b", "") self.archive_name = archive_name + self.multiple_write_buffer: Optional[Union[StringIO, BytesIO]] = None + kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} kwargs_zip.update(kwargs) + super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] def write(self, data): + # buffer multiple write calls, write on flush + if self.multiple_write_buffer is None: + self.multiple_write_buffer = ( + BytesIO() if isinstance(data, bytes) else StringIO() + ) + self.multiple_write_buffer.write(data) + + def flush(self) -> None: + # write to actual handle and close write buffer + if self.multiple_write_buffer is None or self.multiple_write_buffer.closed: + return + # ZipFile needs a non-empty string archive_name = self.archive_name or self.filename or "zip" - super().writestr(archive_name, data) + with self.multiple_write_buffer: + super().writestr(archive_name, self.multiple_write_buffer.getvalue()) + + def close(self): + self.flush() + super().close() @property def closed(self): @@ -819,9 +845,15 @@ def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool: def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool: """Whether the handle is opened in binary mode""" - # classes that expect bytes - binary_classes = [BufferedIOBase, RawIOBase] + # specified by user + if "t" in mode or "b" in mode: + return "b" in mode - return isinstance(handle, tuple(binary_classes)) or "b" in getattr( - handle, "mode", mode - ) + # classes that expect string but have 'b' in mode + text_classes = (codecs.StreamWriter, codecs.StreamReader, codecs.StreamReaderWriter) + if issubclass(type(handle), text_classes): + return False + + # classes that expect bytes + binary_classes = (BufferedIOBase, RawIOBase) + 
return isinstance(handle, binary_classes) or "b" in getattr(handle, "mode", mode) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bf1011176693f..7d64ab77c962d 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,23 +1,26 @@ import abc import datetime +from distutils.version import LooseVersion import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Any, Dict, Mapping, Union, cast +from typing import IO, Any, Dict, Mapping, Optional, Union, cast import warnings +import zipfile from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import get_version, import_optional_dependency from pandas.errors import EmptyDataError -from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments +from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like from pandas.core.frame import DataFrame +from pandas.core.shared_docs import _shared_docs from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg from pandas.io.excel._util import ( @@ -105,28 +108,26 @@ Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". Engine compatibility : - - "xlrd" supports most old/new Excel file formats. + - "xlrd" supports old-style Excel files (.xls). - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. .. versionchanged:: 1.2.0 The engine `xlrd `_ - is no longer maintained, and is not supported with - python >= 3.9. When ``engine=None``, the following logic will be - used to determine the engine. - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will - be used. - - Otherwise if `openpyxl `_ is installed, - then ``openpyxl`` will be used. - - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. - - Specifying ``engine="xlrd"`` will continue to be allowed for the - indefinite future. + now only supports old-style ``.xls`` files. + When ``engine=None``, the following logic will be + used to determine the engine: + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is an xls format, + ``xlrd`` will be used. + - Otherwise if `openpyxl `_ is installed, + then ``openpyxl`` will be used. + - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This + case will raise a ``ValueError`` in a future version of pandas. converters : dict, default None Dict of functions for converting values in certain columns. 
Keys can @@ -424,6 +425,17 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float): pass + def raise_if_bad_sheet_by_index(self, index: int) -> None: + n_sheets = len(self.sheet_names) + if index >= n_sheets: + raise ValueError( + f"Worksheet index {index} is invalid, {n_sheets} worksheets found" + ) + + def raise_if_bad_sheet_by_name(self, name: str) -> None: + if name not in self.sheet_names: + raise ValueError(f"Worksheet named '{name}' not found") + def parse( self, sheet_name=0, @@ -888,39 +900,92 @@ def close(self): return content -def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: +XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" +ZIP_SIGNATURE = b"PK\x03\x04" +PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) + + +@doc(storage_options=_shared_docs["storage_options"]) +def inspect_excel_format( + path: Optional[str] = None, + content: Union[None, BufferedIOBase, RawIOBase, bytes] = None, + storage_options: StorageOptions = None, +) -> str: """ - Check if the stream is an OpenDocument Spreadsheet (.ods) file + Inspect the path or content of an excel file and get its format. - It uses magic values inside the stream + At least one of path or content must be not None. If both are not None, + content will take precedence. + + Adopted from xlrd: https://github.com/python-excel/xlrd. Parameters ---------- - stream : Union[BufferedIOBase, RawIOBase] - IO stream with data which might be an ODS file + path : str, optional + Path to file to inspect. May be a URL. + content : file-like object, optional + Content of file to inspect. + {storage_options} Returns ------- - is_ods : bool - Boolean indication that this is indeed an ODS file or not + str + Format of file. + + Raises + ------ + ValueError + If resulting stream is empty. + BadZipFile + If resulting stream does not have an XLS signature and is not a valid zipfile. """ - stream.seek(0) - is_ods = False - if stream.read(4) == b"PK\003\004": - stream.seek(30) - is_ods = ( - stream.read(54) == b"mimetype" - b"application/vnd.oasis.opendocument.spreadsheet" - ) - stream.seek(0) - return is_ods + content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]] + if isinstance(content, bytes): + content_or_path = BytesIO(content) + else: + content_or_path = content or path + assert content_or_path is not None + + with get_handle( + content_or_path, "rb", storage_options=storage_options, is_text=False + ) as handle: + stream = handle.handle + stream.seek(0) + buf = stream.read(PEEK_SIZE) + if buf is None: + raise ValueError("stream is empty") + else: + assert isinstance(buf, bytes) + peek = buf + stream.seek(0) + + if peek.startswith(XLS_SIGNATURE): + return "xls" + elif not peek.startswith(ZIP_SIGNATURE): + raise ValueError("File is not a recognized excel file") + + # ZipFile typing is overly-strict + # https://github.com/python/typeshed/issues/4212 + zf = zipfile.ZipFile(stream) # type: ignore[arg-type] + + # Workaround for some third party files that use forward slashes and + # lower case names. + component_names = [name.replace("\\", "/").lower() for name in zf.namelist()] + + if "xl/workbook.xml" in component_names: + return "xlsx" + if "xl/workbook.bin" in component_names: + return "xlsb" + if "content.xml" in component_names: + return "ods" + return "zip" class ExcelFile: """ Class for parsing tabular excel sheets into DataFrame objects. - Uses xlrd engine by default. See read_excel for more documentation + See read_excel for more documentation. 
Parameters ---------- @@ -933,7 +998,7 @@ class ExcelFile: Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` Engine compatibility : - - ``xlrd`` supports most old/new Excel file formats. + - ``xlrd`` supports old-style Excel files (.xls). - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. @@ -941,21 +1006,24 @@ class ExcelFile: .. versionchanged:: 1.2.0 The engine `xlrd `_ - is no longer maintained, and is not supported with - python >= 3.9. When ``engine=None``, the following logic will be - used to determine the engine. + now only supports old-style ``.xls`` files. + When ``engine=None``, the following logic will be + used to determine the engine: - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the - extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` - will be used. + - Otherwise if ``path_or_buffer`` is an xls format, + ``xlrd`` will be used. - Otherwise if `openpyxl `_ is installed, then ``openpyxl`` will be used. + - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + This case will raise a ``ValueError`` in a future version of pandas. + + .. warning:: - Specifying ``engine="xlrd"`` will continue to be allowed for the - indefinite future. + Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. + This is not supported, switch to using ``openpyxl`` instead. """ from pandas.io.excel._odfreader import ODFReader @@ -973,33 +1041,42 @@ class ExcelFile: def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): - if engine is None: - # Determine ext and use odf for ods stream/file - if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): - ext = None - if _is_ods_stream(path_or_buffer): - engine = "odf" - else: - ext = os.path.splitext(str(path_or_buffer))[-1] - if ext == ".ods": - engine = "odf" + if engine is not None and engine not in self._engines: + raise ValueError(f"Unknown engine: {engine}") - if ( - import_optional_dependency( - "xlrd", raise_on_missing=False, on_version="ignore" - ) - is not None - ): - from xlrd import Book + # Could be a str, ExcelFile, Book, etc. 
+ self.io = path_or_buffer + # Always a string + self._io = stringify_path(path_or_buffer) - if isinstance(path_or_buffer, Book): - engine = "xlrd" + # Determine xlrd version if installed + if ( + import_optional_dependency( + "xlrd", raise_on_missing=False, on_version="ignore" + ) + is None + ): + xlrd_version = None + else: + import xlrd - # GH 35029 - Prefer openpyxl except for xls files - if engine is None: - if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": - engine = "xlrd" - elif ( + xlrd_version = LooseVersion(get_version(xlrd)) + + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + content=path_or_buffer, storage_options=storage_options + ) + + if engine is None: + if ext == "ods": + engine = "odf" + elif ext == "xls": + engine = "xlrd" + else: + # GH 35029 - Prefer openpyxl except for xls files + if ( import_optional_dependency( "openpyxl", raise_on_missing=False, on_version="ignore" ) @@ -1007,37 +1084,39 @@ def __init__( ): engine = "openpyxl" else: - caller = inspect.stack()[1] - if ( - caller.filename.endswith("pandas/io/excel/_base.py") - and caller.function == "read_excel" - ): - stacklevel = 4 - else: - stacklevel = 2 - warnings.warn( - "The xlrd engine is no longer maintained and is not " - "supported when using pandas with python >= 3.9. However, " - "the engine xlrd will continue to be allowed for the " - "indefinite future. Beginning with pandas 1.2.0, the " - "openpyxl engine will be used if it is installed and the " - "engine argument is not specified. Either install openpyxl " - "or specify engine='xlrd' to silence this warning.", - FutureWarning, - stacklevel=stacklevel, - ) engine = "xlrd" - if engine not in self._engines: - raise ValueError(f"Unknown engine: {engine}") + + if engine == "xlrd" and ext != "xls" and xlrd_version is not None: + if xlrd_version >= "2": + raise ValueError( + f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " + f"only the xls format is supported. Install openpyxl instead." + ) + else: + caller = inspect.stack()[1] + if ( + caller.filename.endswith( + os.path.join("pandas", "io", "excel", "_base.py") + ) + and caller.function == "read_excel" + ): + stacklevel = 4 + else: + stacklevel = 2 + warnings.warn( + f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " + f"only the xls format is supported. As a result, the " + f"openpyxl engine will be used if it is installed and the " + f"engine argument is not specified. Install " + f"openpyxl instead.", + FutureWarning, + stacklevel=stacklevel, + ) + assert engine in self._engines, f"Engine {engine} not recognized" self.engine = engine self.storage_options = storage_options - # Could be a str, ExcelFile, Book, etc. 
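# Illustration, not part of the patch: a sketch of the engine resolution coded
# above, assuming openpyxl is installed -- the format is now sniffed from the
# file signature, so even an extension-less buffer resolves correctly.
import io
import pandas as pd

buf = io.BytesIO()
pd.DataFrame({"a": [1]}).to_excel(buf, engine="openpyxl")
buf.seek(0)
xf = pd.ExcelFile(buf)   # inspect_excel_format sees a zip containing xl/workbook.xml
print(xf.engine)         # expected: openpyxl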
- self.io = path_or_buffer - # Always a string - self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io, storage_options=storage_options) def __fspath__(self): diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index c5c3927216850..8987d5bb42057 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -57,12 +57,14 @@ def sheet_names(self) -> List[str]: def get_sheet_by_index(self, index: int): from odf.table import Table + self.raise_if_bad_sheet_by_index(index) tables = self.book.getElementsByType(Table) return tables[index] def get_sheet_by_name(self, name: str): from odf.table import Table + self.raise_if_bad_sheet_by_name(name) tables = self.book.getElementsByType(Table) for table in tables: diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7de958df206d5..583baf3b239d8 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -492,9 +492,11 @@ def sheet_names(self) -> List[str]: return self.book.sheetnames def get_sheet_by_name(self, name: str): + self.raise_if_bad_sheet_by_name(name) return self.book[name] def get_sheet_by_index(self, index: int): + self.raise_if_bad_sheet_by_index(index) return self.book.worksheets[index] def _convert_cell(self, cell, convert_float: bool) -> Scalar: diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index de4f7bba1a179..f77a6bd5b1ad5 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -47,9 +47,11 @@ def sheet_names(self) -> List[str]: return self.book.sheets def get_sheet_by_name(self, name: str): + self.raise_if_bad_sheet_by_name(name) return self.book.get_sheet(name) def get_sheet_by_index(self, index: int): + self.raise_if_bad_sheet_by_index(index) # pyxlsb sheets are indexed from 1 onwards # There's a fix for this in the source, but the pypi package doesn't have it return self.book.get_sheet(index + 1) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index c655db4bc772b..5eb88a694218a 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -44,9 +44,11 @@ def sheet_names(self): return self.book.sheet_names() def get_sheet_by_name(self, name): + self.raise_if_bad_sheet_by_name(name) return self.book.sheet_by_name(name) def get_sheet_by_index(self, index): + self.raise_if_bad_sheet_by_index(index) return self.book.sheet_by_index(index) def get_sheet_data(self, sheet, convert_float): diff --git a/pandas/io/formats/_color_data.py b/pandas/io/formats/_color_data.py new file mode 100644 index 0000000000000..e5b72b2befa4f --- /dev/null +++ b/pandas/io/formats/_color_data.py @@ -0,0 +1,155 @@ +# GH37967: Enable the use of CSS named colors, as defined in +# matplotlib.colors.CSS4_COLORS, when exporting to Excel. +# This data has been copied here, instead of being imported from matplotlib, +# not to have ``to_excel`` methods require matplotlib. 
+# source: matplotlib._color_data (3.3.3) +CSS4_COLORS = { + "aliceblue": "F0F8FF", + "antiquewhite": "FAEBD7", + "aqua": "00FFFF", + "aquamarine": "7FFFD4", + "azure": "F0FFFF", + "beige": "F5F5DC", + "bisque": "FFE4C4", + "black": "000000", + "blanchedalmond": "FFEBCD", + "blue": "0000FF", + "blueviolet": "8A2BE2", + "brown": "A52A2A", + "burlywood": "DEB887", + "cadetblue": "5F9EA0", + "chartreuse": "7FFF00", + "chocolate": "D2691E", + "coral": "FF7F50", + "cornflowerblue": "6495ED", + "cornsilk": "FFF8DC", + "crimson": "DC143C", + "cyan": "00FFFF", + "darkblue": "00008B", + "darkcyan": "008B8B", + "darkgoldenrod": "B8860B", + "darkgray": "A9A9A9", + "darkgreen": "006400", + "darkgrey": "A9A9A9", + "darkkhaki": "BDB76B", + "darkmagenta": "8B008B", + "darkolivegreen": "556B2F", + "darkorange": "FF8C00", + "darkorchid": "9932CC", + "darkred": "8B0000", + "darksalmon": "E9967A", + "darkseagreen": "8FBC8F", + "darkslateblue": "483D8B", + "darkslategray": "2F4F4F", + "darkslategrey": "2F4F4F", + "darkturquoise": "00CED1", + "darkviolet": "9400D3", + "deeppink": "FF1493", + "deepskyblue": "00BFFF", + "dimgray": "696969", + "dimgrey": "696969", + "dodgerblue": "1E90FF", + "firebrick": "B22222", + "floralwhite": "FFFAF0", + "forestgreen": "228B22", + "fuchsia": "FF00FF", + "gainsboro": "DCDCDC", + "ghostwhite": "F8F8FF", + "gold": "FFD700", + "goldenrod": "DAA520", + "gray": "808080", + "green": "008000", + "greenyellow": "ADFF2F", + "grey": "808080", + "honeydew": "F0FFF0", + "hotpink": "FF69B4", + "indianred": "CD5C5C", + "indigo": "4B0082", + "ivory": "FFFFF0", + "khaki": "F0E68C", + "lavender": "E6E6FA", + "lavenderblush": "FFF0F5", + "lawngreen": "7CFC00", + "lemonchiffon": "FFFACD", + "lightblue": "ADD8E6", + "lightcoral": "F08080", + "lightcyan": "E0FFFF", + "lightgoldenrodyellow": "FAFAD2", + "lightgray": "D3D3D3", + "lightgreen": "90EE90", + "lightgrey": "D3D3D3", + "lightpink": "FFB6C1", + "lightsalmon": "FFA07A", + "lightseagreen": "20B2AA", + "lightskyblue": "87CEFA", + "lightslategray": "778899", + "lightslategrey": "778899", + "lightsteelblue": "B0C4DE", + "lightyellow": "FFFFE0", + "lime": "00FF00", + "limegreen": "32CD32", + "linen": "FAF0E6", + "magenta": "FF00FF", + "maroon": "800000", + "mediumaquamarine": "66CDAA", + "mediumblue": "0000CD", + "mediumorchid": "BA55D3", + "mediumpurple": "9370DB", + "mediumseagreen": "3CB371", + "mediumslateblue": "7B68EE", + "mediumspringgreen": "00FA9A", + "mediumturquoise": "48D1CC", + "mediumvioletred": "C71585", + "midnightblue": "191970", + "mintcream": "F5FFFA", + "mistyrose": "FFE4E1", + "moccasin": "FFE4B5", + "navajowhite": "FFDEAD", + "navy": "000080", + "oldlace": "FDF5E6", + "olive": "808000", + "olivedrab": "6B8E23", + "orange": "FFA500", + "orangered": "FF4500", + "orchid": "DA70D6", + "palegoldenrod": "EEE8AA", + "palegreen": "98FB98", + "paleturquoise": "AFEEEE", + "palevioletred": "DB7093", + "papayawhip": "FFEFD5", + "peachpuff": "FFDAB9", + "peru": "CD853F", + "pink": "FFC0CB", + "plum": "DDA0DD", + "powderblue": "B0E0E6", + "purple": "800080", + "rebeccapurple": "663399", + "red": "FF0000", + "rosybrown": "BC8F8F", + "royalblue": "4169E1", + "saddlebrown": "8B4513", + "salmon": "FA8072", + "sandybrown": "F4A460", + "seagreen": "2E8B57", + "seashell": "FFF5EE", + "sienna": "A0522D", + "silver": "C0C0C0", + "skyblue": "87CEEB", + "slateblue": "6A5ACD", + "slategray": "708090", + "slategrey": "708090", + "snow": "FFFAFA", + "springgreen": "00FF7F", + "steelblue": "4682B4", + "tan": "D2B48C", + "teal": "008080", + "thistle": 
"D8BFD8", + "tomato": "FF6347", + "turquoise": "40E0D0", + "violet": "EE82EE", + "wheat": "F5DEB3", + "white": "FFFFFF", + "whitesmoke": "F5F5F5", + "yellow": "FFFF00", + "yellowgreen": "9ACD32", +} diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index be8f2de1d53fb..0cad67169feff 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -21,6 +21,7 @@ from pandas.core import generic import pandas.core.common as com +from pandas.io.formats._color_data import CSS4_COLORS from pandas.io.formats.css import CSSResolver, CSSWarning from pandas.io.formats.format import get_level_lengths from pandas.io.formats.printing import pprint_thing @@ -65,28 +66,7 @@ class CSSToExcelConverter: CSS processed by :meth:`__call__`. """ - NAMED_COLORS = { - "maroon": "800000", - "brown": "A52A2A", - "red": "FF0000", - "pink": "FFC0CB", - "orange": "FFA500", - "yellow": "FFFF00", - "olive": "808000", - "green": "008000", - "purple": "800080", - "fuchsia": "FF00FF", - "lime": "00FF00", - "teal": "008080", - "aqua": "00FFFF", - "blue": "0000FF", - "navy": "000080", - "black": "000000", - "gray": "808080", - "grey": "808080", - "silver": "C0C0C0", - "white": "FFFFFF", - } + NAMED_COLORS = CSS4_COLORS VERTICAL_MAP = { "top": "top", diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index db34b882a3c35..d0b821a3679bb 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1305,7 +1305,7 @@ def _format(x): if not is_float_type[i] and leading_space: fmt_values.append(f" {_format(v)}") elif is_float_type[i]: - fmt_values.append(float_format(v)) + fmt_values.append(_trim_zeros_single_float(float_format(v))) else: if leading_space is False: # False specifically, so that the default is @@ -1315,8 +1315,6 @@ def _format(x): tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) - fmt_values = _trim_zeros_float(str_floats=fmt_values, decimal=".") - return fmt_values @@ -1832,11 +1830,25 @@ def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> List[s return padded +def _trim_zeros_single_float(str_float: str) -> str: + """ + Trims trailing zeros after a decimal point, + leaving just one if necessary. + """ + str_float = str_float.rstrip("0") + if str_float.endswith("."): + str_float += "0" + + return str_float + + def _trim_zeros_float( str_floats: Union[np.ndarray, List[str]], decimal: str = "." ) -> List[str]: """ - Trims zeros, leaving just one before the decimal points if need be. + Trims the maximum number of trailing zeros equally from + all numbers containing decimals, leaving just one if + necessary. """ trimmed = str_floats number_regex = re.compile(fr"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$") diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 4557c10927a15..6ed31f38893dc 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -389,7 +389,7 @@ def format_attr(pair): rowspan = idx_lengths.get((c, r), 0) if rowspan > 1: es["attributes"] = [ - format_attr({"key": "rowspan", "value": rowspan}) + format_attr({"key": "rowspan", "value": f'"{rowspan}"'}) ] row_es.append(es) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index da085d0d0eb2f..e1ac7b1b02f21 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -630,7 +630,7 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. 
""" - if hasattr(data, "read") and (not self.chunksize or not self.nrows): + if hasattr(data, "read") and not (self.chunksize or self.nrows): data = data.read() self.close() if not hasattr(data, "read") and (self.chunksize or self.nrows): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5b623c360c3ef..8ad86fd0a0dce 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -74,7 +74,7 @@ from pandas.core.series import Series from pandas.core.tools import datetimes as tools -from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg +from pandas.io.common import IOHandles, get_handle, validate_header_arg from pandas.io.date_converters import generic_parser # BOM character (byte order mark) @@ -325,6 +325,11 @@ Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python standard encodings `_ . + .. versionchanged:: 1.2 + + When ``encoding`` is ``None``, ``errors="replace"`` is passed to + ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``. + This behavior was previously only the case for ``engine="python"``. dialect : str or csv.Dialect, optional If provided, this parameter will override values (default or not) for the following parameters: `delimiter`, `doublequote`, `escapechar`, @@ -774,7 +779,7 @@ class TextFileReader(abc.Iterator): def __init__(self, f, engine=None, **kwds): - self.f = stringify_path(f) + self.f = f if engine is not None: engine_specified = True @@ -859,14 +864,14 @@ def _get_options_with_defaults(self, engine): def _check_file_or_buffer(self, f, engine): # see gh-16530 - if is_file_like(f): + if is_file_like(f) and engine != "c" and not hasattr(f, "__next__"): # The C engine doesn't need the file-like to have the "__next__" # attribute. However, the Python engine explicitly calls # "__next__(...)" when iterating through such an object, meaning it # needs to have that attribute - if engine != "c" and not hasattr(f, "__next__"): - msg = "The 'python' engine cannot iterate through this file buffer." - raise ValueError(msg) + raise ValueError( + "The 'python' engine cannot iterate through this file buffer." 
+ ) def _clean_options(self, options, engine): result = options.copy() @@ -1689,9 +1694,8 @@ def _convert_to_ndarrays( values, set(col_na_values) | col_na_fvalues, try_num_bool=False ) else: - is_str_or_ea_dtype = is_string_dtype( - cast_type - ) or is_extension_array_dtype(cast_type) + is_ea = is_extension_array_dtype(cast_type) + is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) # skip inference if specified dtype is object # or casting to an EA try_num_bool = not (cast_type and is_str_or_ea_dtype) @@ -1706,16 +1710,15 @@ def _convert_to_ndarrays( not is_dtype_equal(cvals, cast_type) or is_extension_array_dtype(cast_type) ): - try: - if ( - is_bool_dtype(cast_type) - and not is_categorical_dtype(cast_type) - and na_count > 0 - ): - raise ValueError(f"Bool column has NA values in column {c}") - except (AttributeError, TypeError): - # invalid input to is_bool_dtype - pass + if not is_ea and na_count > 0: + try: + if is_bool_dtype(cast_type): + raise ValueError( + f"Bool column has NA values in column {c}" + ) + except (AttributeError, TypeError): + # invalid input to is_bool_dtype + pass cvals = self._cast_types(cvals, cast_type, c) result[c] = cvals @@ -2290,7 +2293,11 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): self._open_handles(f, kwds) assert self.handles is not None assert hasattr(self.handles.handle, "readline") - self._make_reader(self.handles.handle) + try: + self._make_reader(self.handles.handle) + except (csv.Error, UnicodeDecodeError): + self.close() + raise # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index a5507259b7b6a..2dcbaf38fa51a 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -94,7 +94,19 @@ def to_pickle( is_text=False, storage_options=storage_options, ) as handles: - pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type] + if handles.compression["method"] in ("bz2", "xz") and protocol >= 5: + # some weird TypeError GH#39002 with pickle 5: fallback to letting + # pickle create the entire object and then write it to the buffer. 
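# Illustration, not part of the patch: the user-level call that reaches the
# protocol-5 fallback discussed in the comment above; the file name is made up
# and Python >= 3.8 is assumed so that pickle protocol 5 is available.
import pandas as pd

df = pd.DataFrame({"a": range(3)})
df.to_pickle("frame.pkl.xz", protocol=5)          # xz compression inferred from the suffix
print(pd.read_pickle("frame.pkl.xz").equals(df))  # expected: True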
+ # "zip" would also be here if pandas.io.common._BytesZipFile + # wouldn't buffer write calls + handles.handle.write( + pickle.dumps(obj, protocol=protocol) # type: ignore[arg-type] + ) + else: + # letting pickle write directly to the buffer is more memory-efficient + pickle.dump( + obj, handles.handle, protocol=protocol # type: ignore[arg-type] + ) @doc(storage_options=generic._shared_docs["storage_options"]) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 5678133d5a706..02b06b164a2a1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1159,9 +1159,7 @@ def run_transaction(self): def execute(self, *args, **kwargs): """Simple passthrough to SQLAlchemy connectable""" - return self.connectable.execution_options(no_parameters=True).execute( - *args, **kwargs - ) + return self.connectable.execution_options().execute(*args, **kwargs) def read_table( self, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6f296d3c8d92f..b7fe630af90ef 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -13,7 +13,6 @@ import datetime from io import BytesIO import os -from pathlib import Path import struct import sys from typing import Any, AnyStr, Dict, List, Optional, Sequence, Tuple, Union, cast @@ -2462,8 +2461,8 @@ def write_file(self) -> None: if self.handles.compression["method"] is not None: # ZipFile creates a file (with the same name) for each write call. # Write it first into a buffer and then write the buffer to the ZipFile. - self._output_file = self.handles.handle - self.handles.handle = BytesIO() + self._output_file, self.handles.handle = self.handles.handle, BytesIO() + self.handles.created_handles.append(self.handles.handle) try: self._write_header( @@ -2484,20 +2483,21 @@ def write_file(self) -> None: self._write_value_labels() self._write_file_close_tag() self._write_map() - except Exception as exc: self._close() - if isinstance(self._fname, (str, Path)): + except Exception as exc: + self.handles.close() + if isinstance(self._fname, (str, os.PathLike)) and os.path.isfile( + self._fname + ): try: os.unlink(self._fname) except OSError: warnings.warn( f"This save was not successful but {self._fname} could not " - "be deleted. This file is not valid.", + "be deleted. 
This file is not valid.", ResourceWarning, ) raise exc - else: - self._close() def _close(self) -> None: """ @@ -2509,11 +2509,8 @@ def _close(self) -> None: # write compression if self._output_file is not None: assert isinstance(self.handles.handle, BytesIO) - bio = self.handles.handle - bio.seek(0) - self.handles.handle = self._output_file - self.handles.handle.write(bio.read()) # type: ignore[arg-type] - bio.close() + bio, self.handles.handle = self.handles.handle, self._output_file + self.handles.handle.write(bio.getvalue()) # type: ignore[arg-type] def _write_map(self) -> None: """No-op, future compatibility""" diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 1a22e5629ebe8..00fd0efb48530 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1370,6 +1370,7 @@ def __init__(self, data, **kwargs): self.bar_width = kwargs.pop("width", 0.5) pos = kwargs.pop("position", 0.5) kwargs.setdefault("align", "center") + self.tick_pos = np.arange(len(data)) self.bottom = kwargs.pop("bottom", 0) self.left = kwargs.pop("left", 0) @@ -1392,16 +1393,7 @@ def __init__(self, data, **kwargs): self.tickoffset = self.bar_width * pos self.lim_offset = 0 - if isinstance(self.data.index, ABCMultiIndex): - if kwargs["ax"] is not None and kwargs["ax"].has_data(): - warnings.warn( - "Redrawing a bar plot with a MultiIndex is not supported " - + "and may lead to inconsistent label positions.", - UserWarning, - ) - self.ax_index = np.arange(len(data)) - else: - self.ax_index = self.data.index + self.ax_pos = self.tick_pos - self.tickoffset def _args_adjust(self): if is_list_like(self.bottom): @@ -1428,15 +1420,6 @@ def _make_plot(self): for i, (label, y) in enumerate(self._iter_data(fillna=0)): ax = self._get_ax(i) - - if self.orientation == "vertical": - ax.xaxis.update_units(self.ax_index) - self.tick_pos = ax.convert_xunits(self.ax_index).astype(np.int) - elif self.orientation == "horizontal": - ax.yaxis.update_units(self.ax_index) - self.tick_pos = ax.convert_yunits(self.ax_index).astype(np.int) - self.ax_pos = self.tick_pos - self.tickoffset - kwds = self.kwds.copy() if self._is_series: kwds["color"] = colors @@ -1508,8 +1491,8 @@ def _post_plot_logic(self, ax: "Axes", data): str_index = [pprint_thing(key) for key in range(data.shape[0])] name = self._get_index_name() - s_edge = self.ax_pos.min() - 0.25 + self.lim_offset - e_edge = self.ax_pos.max() + 0.25 + self.bar_width + self.lim_offset + s_edge = self.ax_pos[0] - 0.25 + self.lim_offset + e_edge = self.ax_pos[-1] + 0.25 + self.bar_width + self.lim_offset self._decorate_ticks(ax, name, str_index, s_edge, e_edge) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index ca6fb1cf9dca0..8bb32dec2cc0e 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -52,3 +52,15 @@ def test_arrow_from_arrow_uint(): expected = pd.array([1, 2, 3, 4, None], dtype="UInt32") tm.assert_extension_array_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="0.16.0") +def test_arrow_sliced(): + # https://github.com/pandas-dev/pandas/issues/38525 + import pyarrow as pa + + df = pd.DataFrame({"a": pd.array([0, None, 2, 3, None], dtype="Int64")}) + table = pa.table(df) + result = table.slice(2, None).to_pandas() + expected = df.iloc[2:].reset_index(drop=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/dtypes/test_common.py 
b/pandas/tests/dtypes/test_common.py index ce6737db44195..128f505402eff 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -545,6 +545,7 @@ def test_is_bool_dtype(): assert not com.is_bool_dtype(pd.Series([1, 2])) assert not com.is_bool_dtype(np.array(["a", "b"])) assert not com.is_bool_dtype(pd.Index(["a", "b"])) + assert not com.is_bool_dtype("Int64") assert com.is_bool_dtype(bool) assert com.is_bool_dtype(np.bool_) @@ -553,6 +554,12 @@ def test_is_bool_dtype(): assert com.is_bool_dtype(pd.BooleanDtype()) assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean")) + assert com.is_bool_dtype("boolean") + + +def test_is_bool_dtype_numpy_error(): + # GH39010 + assert not com.is_bool_dtype("0 - Name") @pytest.mark.filterwarnings("ignore:'is_extension_type' is deprecated:FutureWarning") diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 922b3b94c16c1..b731859a761a4 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -51,8 +51,8 @@ def test_view(self, data): data.view() @pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet") - def test_contains(self, data, data_missing, nulls_fixture): - super().test_contains(data, data_missing, nulls_fixture) + def test_contains(self, data, data_missing): + super().test_contains(data, data_missing) class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 94d0ef7bbea84..c81304695f353 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -33,6 +33,22 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): expected = expected.reset_index() self.assert_frame_equal(result, expected) + def test_groupby_agg_extension(self, data_for_grouping): + # GH#38980 groupby agg on extension type fails for non-numeric types + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + + expected = df.iloc[[0, 2, 4, 7]] + expected = expected.set_index("A") + + result = df.groupby("A").agg({"B": "first"}) + self.assert_frame_equal(result, expected) + + result = df.groupby("A").agg("first") + self.assert_frame_equal(result, expected) + + result = df.groupby("A").first() + self.assert_frame_equal(result, expected) + def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index d7997310dde3d..6a4ff68b4580f 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -29,7 +29,7 @@ def test_can_hold_na_valid(self, data): # GH-20761 assert data._can_hold_na is True - def test_contains(self, data, data_missing, nulls_fixture): + def test_contains(self, data, data_missing): # GH-37867 # Tests for membership checks. 
Membership checks for nan-likes is tricky and # the settled on rule is: `nan_like in arr` is True if nan_like is @@ -47,10 +47,12 @@ def test_contains(self, data, data_missing, nulls_fixture): assert na_value in data_missing assert na_value not in data - if nulls_fixture is not na_value: - # the data can never contain other nan-likes than na_value - assert nulls_fixture not in data - assert nulls_fixture not in data_missing + # the data can never contain other nan-likes than na_value + for na_value_obj in tm.NULL_OBJECTS: + if na_value_obj is na_value: + continue + assert na_value_obj not in data + assert na_value_obj not in data_missing def test_memory_usage(self, data): s = pd.Series(data) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 233b658d29782..08768bda312ba 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -197,6 +197,10 @@ class TestGroupby(BaseDecimal, base.BaseGroupbyTests): def test_groupby_apply_identity(self, data_for_grouping): super().test_groupby_apply_identity(data_for_grouping) + @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") + def test_groupby_agg_extension(self, data_for_grouping): + super().test_groupby_agg_extension(data_for_grouping) + class TestSetitem(BaseDecimal, base.BaseSetitemTests): pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 3a5e49796c53b..164a39498ec73 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -313,6 +313,10 @@ def test_groupby_extension_apply(self): def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) + @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") + def test_groupby_agg_extension(self, data_for_grouping): + super().test_groupby_agg_extension(data_for_grouping) + class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests): def test_error(self, data, all_arithmetic_operators): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index ced7ea9261310..86a0bc9213256 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -291,6 +291,22 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): expected = expected.reset_index() self.assert_frame_equal(result, expected) + def test_groupby_agg_extension(self, data_for_grouping): + # GH#38980 groupby agg on extension type fails for non-numeric types + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + + expected = df.iloc[[0, 2, 4]] + expected = expected.set_index("A") + + result = df.groupby("A").agg({"B": "first"}) + self.assert_frame_equal(result, expected) + + result = df.groupby("A").agg("first") + self.assert_frame_equal(result, expected) + + result = df.groupby("A").first() + self.assert_frame_equal(result, expected) + def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index d03a9ab6b2588..4a0fb8f81ed56 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -87,7 +87,7 @@ def test_memory_usage(self, data): # Is this deliberate? 
super().test_memory_usage(data) - def test_contains(self, data, data_missing, nulls_fixture): + def test_contains(self, data, data_missing): # GH-37867 # na value handling in Categorical.__contains__ is deprecated. # See base.BaseInterFaceTests.test_contains for more details. @@ -105,9 +105,11 @@ def test_contains(self, data, data_missing, nulls_fixture): assert na_value not in data # Categoricals can contain other nan-likes than na_value - if nulls_fixture is not na_value: - assert nulls_fixture not in data - assert nulls_fixture in data_missing # this line differs from super method + for na_value_obj in tm.NULL_OBJECTS: + if na_value_obj is na_value: + continue + assert na_value_obj not in data + assert na_value_obj in data_missing # this line differs from super method class TestConstructors(base.BaseConstructorsTests): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 49eb570c4ffe0..32dfe5858d1d0 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1682,6 +1682,21 @@ def test_getitem_interval_index_partial_indexing(self): res = df.loc[:, 0.5] tm.assert_series_equal(res, expected) + @pytest.mark.parametrize("indexer", ["A", ["A"], ("A", slice(None))]) + def test_setitem_unsorted_multiindex_columns(self, indexer): + # GH#38601 + mi = MultiIndex.from_tuples([("A", 4), ("B", "3"), ("A", "2")]) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=mi) + obj = df.copy() + obj.loc[:, indexer] = np.zeros((2, 2), dtype=int) + expected = DataFrame([[0, 2, 0], [0, 5, 0]], columns=mi) + tm.assert_frame_equal(obj, expected) + + df = df.sort_index(1) + df.loc[:, indexer] = np.zeros((2, 2), dtype=int) + expected = expected.sort_index(1) + tm.assert_frame_equal(df, expected) + class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 884cb6c20b77e..cedef4784e4a1 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas.core.dtypes.base import registry as ea_registry from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype from pandas import ( @@ -197,6 +198,25 @@ def test_setitem_extension_types(self, obj, dtype): tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize( + "ea_name", + [ + dtype.name + for dtype in ea_registry.dtypes + # property would require instantiation + if not isinstance(dtype.name, property) + ] + # mypy doesn't allow adding lists of different types + # https://github.com/python/mypy/issues/5492 + + ["datetime64[ns, UTC]", "period[D]"], # type: ignore[list-item] + ) + def test_setitem_with_ea_name(self, ea_name): + # GH 38386 + result = DataFrame([0]) + result[ea_name] = [1] + expected = DataFrame({0: [0], ea_name: [1]}) + tm.assert_frame_equal(result, expected) + def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self): # GH#7492 data_ns = np.array([1, "nat"], dtype="datetime64[ns]") @@ -336,6 +356,13 @@ def test_setitem_listlike_views(self): expected = Series([100, 2, 3], name="a") tm.assert_series_equal(ser, expected) + def test_setitem_string_column_numpy_dtype_raising(self): + # GH#39010 + df = DataFrame([[1, 2], [3, 4]]) + df["0 - Name"] = [5, 6] + expected = DataFrame([[1, 2, 5], [3, 4, 6]], columns=[0, 1, "0 - Name"]) + tm.assert_frame_equal(df, expected) + class 
TestDataFrameSetItemSlicing: def test_setitem_slice_position(self): diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ab750bca7e069..1b570028964df 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1636,3 +1636,10 @@ def test_replace_unicode(self): result = df1.replace(columns_values_map) expected = DataFrame({"positive": np.ones(3)}) tm.assert_frame_equal(result, expected) + + def test_replace_bytes(self, frame_or_series): + # GH#38900 + obj = frame_or_series(["o"]).astype("|S") + expected = obj.copy() + obj = obj.replace({None: np.nan}) + tm.assert_equal(obj, expected) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 2e21ce8ec2256..40b3f1e89c015 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, Series, date_range, offsets +from pandas import CategoricalIndex, DataFrame, Index, Series, date_range, offsets import pandas._testing as tm @@ -292,3 +292,25 @@ def test_shift_dt64values_int_fill_deprecated(self): expected = DataFrame({"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": df2["A"]}) tm.assert_frame_equal(result, expected) + + def test_shift_axis1_categorical_columns(self): + # GH#38434 + ci = CategoricalIndex(["a", "b", "c"]) + df = DataFrame( + {"a": [1, 3], "b": [2, 4], "c": [5, 6]}, index=ci[:-1], columns=ci + ) + result = df.shift(axis=1) + + expected = DataFrame( + {"a": [np.nan, np.nan], "b": [1, 3], "c": [2, 4]}, index=ci[:-1], columns=ci + ) + tm.assert_frame_equal(result, expected) + + # periods != 1 + result = df.shift(2, axis=1) + expected = DataFrame( + {"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 3]}, + index=ci[:-1], + columns=ci, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 157c8687808b3..a7e2fa760b7e4 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -1,7 +1,6 @@ from copy import deepcopy import inspect import pydoc -import warnings import numpy as np import pytest @@ -330,19 +329,17 @@ def test_set_flags(self, allows_duplicate_labels, frame_or_series): result.iloc[key] = 10 assert obj.iloc[key] == 0 - @skip_if_no("jinja2") def test_constructor_expanddim_lookup(self): # GH#33628 accessing _constructor_expanddim should not # raise NotImplementedError df = DataFrame() - with warnings.catch_warnings(record=True) as wrn: - # _AXIS_NUMBERS, _AXIS_NAMES lookups - inspect.getmembers(df) - - # some versions give FutureWarning, others DeprecationWarning - assert len(wrn) - assert any(x.category in [FutureWarning, DeprecationWarning] for x in wrn) - with pytest.raises(NotImplementedError, match="Not supported for DataFrames!"): df._constructor_expanddim(np.arange(27).reshape(3, 3, 3)) + + @skip_if_no("jinja2") + def test_inspect_getmembers(self): + # GH38740 + df = DataFrame() + with tm.assert_produces_warning(None): + inspect.getmembers(df) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 2300a8937991e..77287b6f1eab5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1936,6 +1936,70 @@ def test_constructor_datetimes_with_nulls(self, arr): expected = Series([np.dtype("datetime64[ns]")]) tm.assert_series_equal(result, expected) + 
@pytest.mark.parametrize("order", ["K", "A", "C", "F"]) + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[M]", + "datetime64[D]", + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ], + ) + def test_constructor_datetimes_non_ns(self, order, dtype): + na = np.array( + [ + ["2015-01-01", "2015-01-02", "2015-01-03"], + ["2017-01-01", "2017-01-02", "2017-02-03"], + ], + dtype=dtype, + order=order, + ) + df = DataFrame(na) + expected = DataFrame( + [ + ["2015-01-01", "2015-01-02", "2015-01-03"], + ["2017-01-01", "2017-01-02", "2017-02-03"], + ] + ) + expected = expected.astype(dtype=dtype) + tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize("order", ["K", "A", "C", "F"]) + @pytest.mark.parametrize( + "dtype", + [ + "timedelta64[D]", + "timedelta64[h]", + "timedelta64[m]", + "timedelta64[s]", + "timedelta64[ms]", + "timedelta64[us]", + "timedelta64[ns]", + ], + ) + def test_constructor_timedelta_non_ns(self, order, dtype): + na = np.array( + [ + [np.timedelta64(1, "D"), np.timedelta64(2, "D")], + [np.timedelta64(4, "D"), np.timedelta64(5, "D")], + ], + dtype=dtype, + order=order, + ) + df = DataFrame(na).astype("timedelta64[ns]") + expected = DataFrame( + [ + [Timedelta(1, "D"), Timedelta(2, "D")], + [Timedelta(4, "D"), Timedelta(5, "D")], + ], + ) + tm.assert_frame_equal(df, expected) + def test_constructor_for_list_with_dtypes(self): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index efabc666993ee..dca12c632a418 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas import DataFrame, Series +from pandas import CategoricalIndex, DataFrame, Interval, Series, isnull import pandas._testing as tm @@ -162,3 +162,24 @@ def test_logical_with_nas(self): result = d["a"].fillna(False, downcast=False) | d["b"] expected = Series([True, True]) tm.assert_series_equal(result, expected) + + def test_logical_ops_categorical_columns(self): + # GH#38367 + intervals = [Interval(1, 2), Interval(3, 4)] + data = DataFrame( + [[1, np.nan], [2, np.nan]], + columns=CategoricalIndex( + intervals, categories=intervals + [Interval(5, 6)] + ), + ) + mask = DataFrame( + [[False, False], [False, False]], columns=data.columns, dtype=bool + ) + result = mask | isnull(data) + expected = DataFrame( + [[False, True], [False, True]], + columns=CategoricalIndex( + intervals, categories=intervals + [Interval(5, 6)] + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index d33d91f2cefca..d843d4b0e9504 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1091,9 +1091,13 @@ def test_any_all_bool_only(self): (np.all, {"A": Series([0, 1], dtype=int)}, False), (np.any, {"A": Series([0, 1], dtype=int)}, True), pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False), + pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, False), pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True), + pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, True), pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True), + pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True), pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True), 
+ pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True), pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False), pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True), pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True), diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py index 81c0dc65b4e97..83fd3db72a90c 100644 --- a/pandas/tests/frame/test_ufunc.py +++ b/pandas/tests/frame/test_ufunc.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -70,12 +72,19 @@ def test_binary_input_aligns_columns(dtype_a, dtype_b): dtype_b["C"] = dtype_b.pop("B") df2 = pd.DataFrame({"A": [1, 2], "C": [3, 4]}).astype(dtype_b) - result = np.heaviside(df1, df2) - expected = np.heaviside( - np.array([[1, 3, np.nan], [2, 4, np.nan]]), - np.array([[1, np.nan, 3], [2, np.nan, 4]]), - ) - expected = pd.DataFrame(expected, index=[0, 1], columns=["A", "B", "C"]) + with tm.assert_produces_warning(FutureWarning): + result = np.heaviside(df1, df2) + # Expected future behaviour: + # expected = np.heaviside( + # np.array([[1, 3, np.nan], [2, 4, np.nan]]), + # np.array([[1, np.nan, 3], [2, np.nan, 4]]), + # ) + # expected = pd.DataFrame(expected, index=[0, 1], columns=["A", "B", "C"]) + expected = pd.DataFrame([[1.0, 1.0], [1.0, 1.0]], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + # ensure the expected is the same when applying with numpy array + result = np.heaviside(df1, df2.values) tm.assert_frame_equal(result, expected) @@ -85,27 +94,149 @@ def test_binary_input_aligns_index(dtype): pytest.xfail(reason="Extension / mixed with multiple inputs not implemented.") df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).astype(dtype) df2 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "c"]).astype(dtype) - result = np.heaviside(df1, df2) - expected = np.heaviside( - np.array([[1, 3], [3, 4], [np.nan, np.nan]]), - np.array([[1, 3], [np.nan, np.nan], [3, 4]]), + with tm.assert_produces_warning(FutureWarning): + result = np.heaviside(df1, df2) + # Expected future behaviour: + # expected = np.heaviside( + # np.array([[1, 3], [3, 4], [np.nan, np.nan]]), + # np.array([[1, 3], [np.nan, np.nan], [3, 4]]), + # ) + # # TODO(FloatArray): this will be Float64Dtype. + # expected = pd.DataFrame(expected, index=["a", "b", "c"], columns=["A", "B"]) + expected = pd.DataFrame( + [[1.0, 1.0], [1.0, 1.0]], columns=["A", "B"], index=["a", "b"] ) - # TODO(FloatArray): this will be Float64Dtype. 
- expected = pd.DataFrame(expected, index=["a", "b", "c"], columns=["A", "B"]) tm.assert_frame_equal(result, expected) + # ensure the expected is the same when applying with numpy array + result = np.heaviside(df1, df2.values) + tm.assert_frame_equal(result, expected) + +@pytest.mark.filterwarnings("ignore:Calling a ufunc on non-aligned:FutureWarning") def test_binary_frame_series_raises(): # We don't currently implement df = pd.DataFrame({"A": [1, 2]}) - with pytest.raises(NotImplementedError, match="logaddexp"): + # with pytest.raises(NotImplementedError, match="logaddexp"): + with pytest.raises(ValueError, match=""): np.logaddexp(df, df["A"]) - with pytest.raises(NotImplementedError, match="logaddexp"): + # with pytest.raises(NotImplementedError, match="logaddexp"): + with pytest.raises(ValueError, match=""): np.logaddexp(df["A"], df) +def test_unary_accumulate_axis(): + # https://github.com/pandas-dev/pandas/issues/39259 + df = pd.DataFrame({"a": [1, 3, 2, 4]}) + result = np.maximum.accumulate(df) + expected = pd.DataFrame({"a": [1, 3, 3, 4]}) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({"a": [1, 3, 2, 4], "b": [0.1, 4.0, 3.0, 2.0]}) + result = np.maximum.accumulate(df) + # in theory could preserve int dtype for default axis=0 + expected = pd.DataFrame({"a": [1.0, 3.0, 3.0, 4.0], "b": [0.1, 4.0, 4.0, 4.0]}) + tm.assert_frame_equal(result, expected) + + result = np.maximum.accumulate(df, axis=0) + tm.assert_frame_equal(result, expected) + + result = np.maximum.accumulate(df, axis=1) + expected = pd.DataFrame({"a": [1.0, 3.0, 2.0, 4.0], "b": [1.0, 4.0, 3.0, 4.0]}) + tm.assert_frame_equal(result, expected) + + def test_frame_outer_deprecated(): df = pd.DataFrame({"A": [1, 2]}) with tm.assert_produces_warning(FutureWarning): np.subtract.outer(df, df) + + +def test_alignment_deprecation(): + # https://github.com/pandas-dev/pandas/issues/39184 + df1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df2 = pd.DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) + s1 = pd.Series([1, 2], index=["a", "b"]) + s2 = pd.Series([1, 2], index=["b", "c"]) + + # binary dataframe / dataframe + expected = pd.DataFrame({"a": [2, 4, 6], "b": [8, 10, 12]}) + + with tm.assert_produces_warning(None): + # aligned -> no warning! + result = np.add(df1, df1) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): + # non-aligned -> warns + result = np.add(df1, df2) + tm.assert_frame_equal(result, expected) + + result = np.add(df1, df2.values) + tm.assert_frame_equal(result, expected) + + result = np.add(df1.values, df2) + expected = pd.DataFrame({"b": [2, 4, 6], "c": [8, 10, 12]}) + tm.assert_frame_equal(result, expected) + + # binary dataframe / series + expected = pd.DataFrame({"a": [2, 3, 4], "b": [6, 7, 8]}) + + with tm.assert_produces_warning(None): + # aligned -> no warning! 
+ result = np.add(df1, s1) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): + result = np.add(df1, s2) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): + result = np.add(s2, df1) + tm.assert_frame_equal(result, expected) + + result = np.add(df1, s2.values) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba", "0.46.0") +def test_alignment_deprecation_many_inputs(): + # https://github.com/pandas-dev/pandas/issues/39184 + # test that the deprecation also works with > 2 inputs -> using a numba + # written ufunc for this because numpy itself doesn't have such ufuncs + from numba import float64, vectorize + + @vectorize([float64(float64, float64, float64)]) + def my_ufunc(x, y, z): + return x + y + z + + df1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df2 = pd.DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) + df3 = pd.DataFrame({"a": [1, 2, 3], "c": [4, 5, 6]}) + + with tm.assert_produces_warning(FutureWarning): + result = my_ufunc(df1, df2, df3) + expected = pd.DataFrame([[3.0, 12.0], [6.0, 15.0], [9.0, 18.0]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + # all aligned -> no warning + with tm.assert_produces_warning(None): + result = my_ufunc(df1, df1, df1) + tm.assert_frame_equal(result, expected) + + # mixed frame / arrays + with tm.assert_produces_warning(FutureWarning): + result = my_ufunc(df1, df2, df3.values) + tm.assert_frame_equal(result, expected) + + # single frame -> no warning + with tm.assert_produces_warning(None): + result = my_ufunc(df1, df2.values, df3.values) + tm.assert_frame_equal(result, expected) + + # takes indices of first frame + with tm.assert_produces_warning(FutureWarning): + result = my_ufunc(df1.values, df2, df3) + expected = expected.set_axis(["b", "c"], axis=1) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8cf77ca6335f4..f0bc58cbf07bf 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1678,3 +1678,23 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( df_grp = df.groupby(["a", "b"], observed=observed) result = getattr(df_grp, func)() tm.assert_frame_equal(result, expected) + + +def test_groupby_categorical_indices_unused_categories(): + # GH#38642 + df = DataFrame( + { + "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]), + "col": range(3), + } + ) + grouped = df.groupby("key", sort=False) + result = grouped.indices + expected = { + "b": np.array([0, 1], dtype="int64"), + "a": np.array([2], dtype="int64"), + "c": np.array([], dtype="int64"), + } + assert result.keys() == expected.keys() + for key in result.keys(): + tm.assert_numpy_array_equal(result[key], expected[key]) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7c179a79513fa..a260aaf6e057d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -842,6 +842,14 @@ def test_omit_nuisance(df): grouped.agg(lambda x: x.sum(0, numeric_only=False)) +def test_omit_nuisance_sem(df): + # GH 38774 - sem should work with nuisance columns + grouped = df.groupby("A") + result = grouped.sem() + expected = df.loc[:, ["A", "C", "D"]].groupby("A").sem() + tm.assert_frame_equal(result, expected) + + def test_omit_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) @@ -1689,64 +1697,6 @@ def 
test_sort(x): g.apply(test_sort) -def test_group_shift_with_null_key(): - # This test is designed to replicate the segfault in issue #13813. - n_rows = 1200 - - # Generate a moderately large dataframe with occasional missing - # values in column `B`, and then group by [`A`, `B`]. This should - # force `-1` in `labels` array of `g.grouper.group_info` exactly - # at those places, where the group-by key is partially missing. - df = DataFrame( - [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], - dtype=float, - columns=["A", "B", "Z"], - index=None, - ) - g = df.groupby(["A", "B"]) - - expected = DataFrame( - [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], - dtype=float, - columns=["Z"], - index=None, - ) - result = g.shift(-1) - - tm.assert_frame_equal(result, expected) - - -def test_group_shift_with_fill_value(): - # GH #24128 - n_rows = 24 - df = DataFrame( - [(i % 12, i % 3, i) for i in range(n_rows)], - dtype=float, - columns=["A", "B", "Z"], - index=None, - ) - g = df.groupby(["A", "B"]) - - expected = DataFrame( - [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], - dtype=float, - columns=["Z"], - index=None, - ) - result = g.shift(-1, fill_value=0)[["Z"]] - - tm.assert_frame_equal(result, expected) - - -def test_group_shift_lose_timezone(): - # GH 30134 - now_dt = Timestamp.utcnow() - df = DataFrame({"a": [1, 1], "date": now_dt}) - result = df.groupby("a").shift(0).iloc[0] - expected = Series({"date": now_dt}, name=result.name) - tm.assert_series_equal(result, expected) - - def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = DataFrame( diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py new file mode 100644 index 0000000000000..1410038274152 --- /dev/null +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -0,0 +1,106 @@ +import numpy as np +import pytest + +from pandas import DataFrame, NaT, Series, Timedelta, Timestamp +import pandas._testing as tm + + +def test_group_shift_with_null_key(): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `g.grouper.group_info` exactly + # at those places, where the group-by key is partially missing. 
+ df = DataFrame( + [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1) + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_with_fill_value(): + # GH #24128 + n_rows = 24 + df = DataFrame( + [(i % 12, i % 3, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1, fill_value=0)[["Z"]] + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_lose_timezone(): + # GH 30134 + now_dt = Timestamp.utcnow() + df = DataFrame({"a": [1, 1], "date": now_dt}) + result = df.groupby("a").shift(0).iloc[0] + expected = Series({"date": now_dt}, name=result.name) + tm.assert_series_equal(result, expected) + + +def test_group_diff_real(any_real_dtype): + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, dtype=any_real_dtype) + result = df.groupby("a")["b"].diff() + exp_dtype = "float" + if any_real_dtype in ["int8", "int16", "float32"]: + exp_dtype = "float32" + expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [ + Timestamp("2013-01-01"), + Timestamp("2013-01-02"), + Timestamp("2013-01-03"), + ], + [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")], + ], +) +def test_group_diff_datetimelike(data): + df = DataFrame({"a": [1, 2, 2], "b": data}) + result = df.groupby("a")["b"].diff() + expected = Series([NaT, NaT, Timedelta("1 days")], name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_bool(): + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) + result = df.groupby("a")["b"].diff() + expected = Series([np.nan, np.nan, np.nan, False, False], name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_object_raises(object_dtype): + df = DataFrame( + {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype + ) + with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"): + df.groupby("a")["b"].diff() diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index ff1632e33c0fb..d12e9465949b4 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -204,12 +204,17 @@ def test_constructor_invalid(self): ) with pytest.raises(TypeError, match=msg): Float64Index(0.0) - msg = ( - "String dtype not supported, " - "you may need to explicitly cast to a numeric type" + + # 2021-02-1 we get ValueError in numpy 1.20, but not on all builds + msg = "|".join( + [ + "String dtype not supported, you may need to explicitly cast ", + "could not convert string to float: 'a'", + ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises((TypeError, ValueError), match=msg): Float64Index(["a", "b", 0.0]) + msg = r"float\(\) argument must be a string or a number, not 'Timestamp'" with pytest.raises(TypeError, match=msg): Float64Index([Timestamp("20130101")]) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 
42525fc575397..f381a3b205e8c 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -305,6 +305,21 @@ def test_multiindex_one_dimensional_tuple_columns(self, indexer): expected = DataFrame([0, 2], index=mi) tm.assert_frame_equal(obj, expected) + @pytest.mark.parametrize( + "indexer, exp_value", [(slice(None), 1.0), ((1, 2), np.nan)] + ) + def test_multiindex_setitem_columns_enlarging(self, indexer, exp_value): + # GH#39147 + mi = MultiIndex.from_tuples([(1, 2), (3, 4)]) + df = DataFrame([[1, 2], [3, 4]], index=mi, columns=["a", "b"]) + df.loc[indexer, ["c", "d"]] = 1.0 + expected = DataFrame( + [[1, 2, 1.0, 1.0], [3, 4, exp_value, exp_value]], + index=mi, + columns=["a", "b", "c", "d"], + ) + tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize( "indexer, pos", diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index f750b3667cec2..e8bd0cfea844d 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -535,9 +535,7 @@ def test_string_slice(self): df["2011"] with pytest.raises(KeyError, match="'2011'"): - with tm.assert_produces_warning(FutureWarning): - # This does an is_all_dates check - df.loc["2011", 0] + df.loc["2011", 0] df = DataFrame() assert not df.index._is_all_dates diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 68f12a939e061..11726bc5e31c8 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -7,7 +7,6 @@ import numpy as np import pytest -from pandas.compat.numpy import is_numpy_dev import pandas.util._test_decorators as td import pandas as pd @@ -981,7 +980,6 @@ def test_loc_setitem_empty_append_single_value(self): df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(is_numpy_dev, reason="gh-35481") def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe @@ -995,7 +993,12 @@ def test_loc_setitem_empty_append_raises(self): with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "cannot copy sequence with size 2 to array axis with dimension 0" + msg = "|".join( + [ + "cannot copy sequence with size 2 to array axis with dimension 0", + r"could not broadcast input array from shape \(2,\) into shape \(0,\)", + ] + ) with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data diff --git a/pandas/tests/io/__init__.py b/pandas/tests/io/__init__.py index c5e867f45b92d..39474dedba78c 100644 --- a/pandas/tests/io/__init__.py +++ b/pandas/tests/io/__init__.py @@ -14,4 +14,8 @@ r"Use 'tree.iter\(\)' or 'list\(tree.iter\(\)\)' instead." 
":PendingDeprecationWarning" ), + # GH 26552 + pytest.mark.filterwarnings( + "ignore:As the xlwt package is no longer maintained:FutureWarning" + ), ] diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index bcc666a88e3be..5d4705dbe7d77 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -50,8 +50,7 @@ def s3_base(worker_id): pytest.importorskip("s3fs") pytest.importorskip("boto3") requests = pytest.importorskip("requests") - # GH 38090: Suppress http logs in tests by moto_server - logging.getLogger("werkzeug").disabled = True + logging.getLogger("requests").disabled = True with tm.ensure_safe_environment_variables(): # temporary workaround as moto fails for botocore >= 1.11 otherwise, @@ -71,7 +70,9 @@ def s3_base(worker_id): # pipe to null to avoid logging in terminal proc = subprocess.Popen( - shlex.split(f"moto_server s3 -p {endpoint_port}"), stdout=subprocess.DEVNULL + shlex.split(f"moto_server s3 -p {endpoint_port}"), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) timeout = 5 diff --git a/pandas/tests/io/excel/__init__.py b/pandas/tests/io/excel/__init__.py index 384f1006c44df..9dda54915ab1c 100644 --- a/pandas/tests/io/excel/__init__.py +++ b/pandas/tests/io/excel/__init__.py @@ -1,5 +1,9 @@ +from distutils.version import LooseVersion + import pytest +from pandas.compat._optional import get_version, import_optional_dependency + pytestmark = [ pytest.mark.filterwarnings( # Looks like tree.getiterator is deprecated in favor of tree.iter @@ -13,4 +17,19 @@ pytest.mark.filterwarnings( "ignore:As the xlwt package is no longer maintained:FutureWarning" ), + # GH 38571 + pytest.mark.filterwarnings( + "ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning" + ), ] + + +if ( + import_optional_dependency("xlrd", raise_on_missing=False, on_version="ignore") + is None +): + xlrd_version = None +else: + import xlrd + + xlrd_version = LooseVersion(get_version(xlrd)) diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index d6c6399f082c6..c99d9ae62bf54 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -42,5 +42,5 @@ def test_nonexistent_sheetname_raises(read_ext): # GH-27676 # Specifying a non-existent sheet_name parameter should throw an error # with the sheet name. 
- with pytest.raises(ValueError, match="sheet xyz not found"): + with pytest.raises(ValueError, match="Worksheet named 'xyz' not found"): pd.read_excel("blank.ods", sheet_name="xyz") diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 98a55ae39bd77..9b3d359dc01a5 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -11,6 +11,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm +from pandas.tests.io.excel import xlrd_version read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ @@ -57,6 +58,13 @@ def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool: return False if read_ext == ".xlsb" and engine != "pyxlsb": return False + if ( + engine == "xlrd" + and xlrd_version is not None + and xlrd_version >= "2" + and read_ext != ".xls" + ): + return False return True @@ -614,6 +622,29 @@ def test_bad_engine_raises(self, read_ext): with pytest.raises(ValueError, match="Unknown engine: foo"): pd.read_excel("", engine=bad_engine) + @pytest.mark.parametrize( + "sheet_name", + [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]], + ) + def test_bad_sheetname_raises(self, read_ext, sheet_name): + # GH 39250 + msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found" + with pytest.raises(ValueError, match=msg): + pd.read_excel("blank" + read_ext, sheet_name=sheet_name) + + def test_missing_file_raises(self, read_ext): + bad_file = f"foo{read_ext}" + # CI tests with zh_CN.utf8, translates to "No such file or directory" + with pytest.raises( + FileNotFoundError, match=r"(No such file or directory|没有那个文件或目录)" + ): + pd.read_excel(bad_file) + + def test_corrupt_bytes_raises(self, read_ext, engine): + bad_stream = b"foo" + with pytest.raises(ValueError, match="File is not a recognized excel file"): + pd.read_excel(bad_stream) + @tm.network def test_read_from_http_url(self, read_ext): url = ( @@ -636,6 +667,22 @@ def test_read_from_s3_url(self, read_ext, s3_resource, s3so): local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) + def test_read_from_s3_object(self, read_ext, s3_resource, s3so): + # GH 38788 + # Bucket "pandas-test" created in tests/io/conftest.py + with open("test1" + read_ext, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) + + import s3fs + + s3 = s3fs.S3FileSystem(**s3so) + + with s3.open("s3://pandas-test/test1" + read_ext) as f: + url_table = pd.read_excel(f) + + local_table = pd.read_excel("test1" + read_ext) + tm.assert_frame_equal(url_table, local_table) + @pytest.mark.slow def test_read_from_file_url(self, read_ext, datapath): @@ -1122,6 +1169,17 @@ def test_sheet_name(self, read_ext, df_ref): tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) + @pytest.mark.parametrize( + "sheet_name", + [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]], + ) + def test_bad_sheetname_raises(self, read_ext, sheet_name): + # GH 39250 + msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found" + with pytest.raises(ValueError, match=msg): + with pd.ExcelFile("blank" + read_ext) as excel: + excel.parse(sheet_name=sheet_name) + def test_excel_read_buffer(self, engine, read_ext): pth = "test1" + read_ext expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0, engine=engine) @@ -1158,6 +1216,19 @@ def 
test_excel_read_binary(self, engine, read_ext): actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + def test_excel_read_binary_via_read_excel(self, read_ext, engine): + # GH 38424 + if read_ext == ".xlsb" and engine == "pyxlsb": + pytest.xfail("GH 38667 - should default to pyxlsb but doesn't") + with open("test1" + read_ext, "rb") as f: + result = pd.read_excel(f) + expected = pd.read_excel("test1" + read_ext, engine=engine) + tm.assert_frame_equal(result, expected) + + @pytest.mark.skipif( + xlrd_version is not None and xlrd_version >= "2", + reason="xlrd no longer supports xlsx", + ) def test_excel_high_surrogate(self, engine): # GH 23809 expected = DataFrame(["\udc88"], columns=["Column1"]) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 80ebeb4c03d89..af0de05965398 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -347,19 +347,9 @@ def test_excel_sheet_by_name_raise(self, path, engine): tm.assert_frame_equal(gt, df) - if engine == "odf": - msg = "sheet 0 not found" - with pytest.raises(ValueError, match=msg): - pd.read_excel(xl, "0") - elif engine == "xlwt": - import xlrd - - msg = "No sheet named <'0'>" - with pytest.raises(xlrd.XLRDError, match=msg): - pd.read_excel(xl, sheet_name="0") - else: - with pytest.raises(KeyError, match="Worksheet 0 does not exist."): - pd.read_excel(xl, sheet_name="0") + msg = "Worksheet named '0' not found" + with pytest.raises(ValueError, match=msg): + pd.read_excel(xl, "0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -492,7 +482,7 @@ def test_float_types(self, np_type, path): @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, np_type, path): - # Test np.bool values read come back as float. + # Test np.bool8 and np.bool_ values read come back as float. df = DataFrame([1, 0, True, False], dtype=np_type) df.to_excel(path, "test1") @@ -657,30 +647,27 @@ def test_excel_date_datetime_format(self, engine, ext, path): ) with tm.ensure_clean(ext) as filename2: - writer1 = ExcelWriter(path) - writer2 = ExcelWriter( + with ExcelWriter(path) as writer1: + df.to_excel(writer1, "test1") + + with ExcelWriter( filename2, date_format="DD.MM.YYYY", datetime_format="DD.MM.YYYY HH-MM-SS", - ) - - df.to_excel(writer1, "test1") - df.to_excel(writer2, "test1") + ) as writer2: + df.to_excel(writer2, "test1") - writer1.close() - writer2.close() + with ExcelFile(path) as reader1: + rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) - reader1 = ExcelFile(path) - reader2 = ExcelFile(filename2) + with ExcelFile(filename2) as reader2: + rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) - rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) - rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) + tm.assert_frame_equal(rs1, rs2) - tm.assert_frame_equal(rs1, rs2) - - # Since the reader returns a datetime object for dates, - # we need to use df_expected to check the result. - tm.assert_frame_equal(rs2, df_expected) + # Since the reader returns a datetime object for dates, + # we need to use df_expected to check the result. 
+ tm.assert_frame_equal(rs2, df_expected) def test_to_excel_interval_no_labels(self, path): # see gh-19242 @@ -862,7 +849,7 @@ def test_to_excel_unicode_filename(self, ext, path): f = open(filename, "wb") except UnicodeEncodeError: pytest.skip("No unicode file names on this system") - else: + finally: f.close() df = DataFrame( @@ -872,15 +859,15 @@ def test_to_excel_unicode_filename(self, ext, path): ) df.to_excel(filename, "test1", float_format="%.2f") - reader = ExcelFile(filename) - result = pd.read_excel(reader, sheet_name="test1", index_col=0) + with ExcelFile(filename) as reader: + result = pd.read_excel(reader, sheet_name="test1", index_col=0) - expected = DataFrame( - [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], - index=["A", "B"], - columns=["X", "Y", "Z"], - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(result, expected) # FIXME: dont leave commented-out # def test_to_excel_header_styling_xls(self, engine, ext): @@ -1195,9 +1182,9 @@ def test_datetimes(self, path): write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") - # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd - engine = "odf" if path.endswith("ods") else "xlrd" - read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine) + if path.endswith("xlsx") or path.endswith("xlsm"): + pytest.skip("Defaults to openpyxl and fails - GH #38644") + read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) @@ -1374,8 +1361,8 @@ def test_excelfile_fspath(self): with tm.ensure_clean("foo.xlsx") as path: df = DataFrame({"A": [1, 2]}) df.to_excel(path) - xl = ExcelFile(path) - result = os.fspath(xl) + with ExcelFile(path) as xl: + result = os.fspath(xl) assert result == path def test_excelwriter_fspath(self): diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index f2fbcbc2e2f04..1b4458d0437a1 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -4,6 +4,7 @@ import pandas as pd import pandas._testing as tm +from pandas.tests.io.excel import xlrd_version from pandas.io.excel import ExcelFile @@ -17,6 +18,8 @@ def skip_ods_and_xlsb_files(read_ext): pytest.skip("Not valid for xlrd") if read_ext == ".xlsb": pytest.skip("Not valid for xlrd") + if read_ext in (".xlsx", ".xlsm") and xlrd_version >= "2": + pytest.skip("Not valid for xlrd >= 2.0") def test_read_xlrd_book(read_ext, frame): @@ -40,9 +43,10 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well def test_excel_table_sheet_by_index(datapath, read_ext): path = datapath("io", "data", "excel", f"test1{read_ext}") + msg = "Worksheet named 'invalid_sheet_name' not found" with ExcelFile(path, engine="xlrd") as excel: - with pytest.raises(xlrd.XLRDError): - pd.read_excel(excel, sheet_name="asdf") + with pytest.raises(ValueError, match=msg): + pd.read_excel(excel, sheet_name="invalid_sheet_name") def test_excel_file_warning_with_xlsx_file(datapath): @@ -66,7 +70,7 @@ def test_excel_file_warning_with_xlsx_file(datapath): pd.read_excel(path, "Sheet1", engine=None) -def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): +def test_read_excel_warning_with_xlsx_file(datapath): # GH 29375 path = datapath("io", "data", "excel", "test1.xlsx") has_openpyxl = ( @@ -76,12 +80,19 @@ def 
test_read_excel_warning_with_xlsx_file(tmpdir, datapath): is not None ) if not has_openpyxl: - with tm.assert_produces_warning( - FutureWarning, - raise_on_extra_warnings=False, - match="The xlrd engine is no longer maintained", - ): - pd.read_excel(path, "Sheet1", engine=None) + if xlrd_version >= "2": + with pytest.raises( + ValueError, + match="Your version of xlrd is ", + ): + pd.read_excel(path, "Sheet1", engine=None) + else: + with tm.assert_produces_warning( + FutureWarning, + raise_on_extra_warnings=False, + match="The xlrd engine is no longer maintained", + ): + pd.read_excel(path, "Sheet1", engine=None) else: with tm.assert_produces_warning(None): pd.read_excel(path, "Sheet1", engine=None) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index fe85849c6dcca..b0b07045a9156 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2002,6 +2002,25 @@ def test_float_trim_zeros(self): assert ("+10" in line) or skip skip = False + @pytest.mark.parametrize( + "data, expected", + [ + (["3.50"], "0 3.50\ndtype: object"), + ([1.20, "1.00"], "0 1.2\n1 1.00\ndtype: object"), + ([np.nan], "0 NaN\ndtype: float64"), + ([None], "0 None\ndtype: object"), + (["3.50", np.nan], "0 3.50\n1 NaN\ndtype: object"), + ([3.50, np.nan], "0 3.5\n1 NaN\ndtype: float64"), + ([3.50, np.nan, "3.50"], "0 3.5\n1 NaN\n2 3.50\ndtype: object"), + ([3.50, None, "3.50"], "0 3.5\n1 None\n2 3.50\ndtype: object"), + ], + ) + def test_repr_str_float_truncation(self, data, expected): + # GH#38708 + series = Series(data) + result = repr(series) + assert result == expected + def test_dict_entries(self): df = DataFrame({"A": [{"a": 1, "b": 2}]}) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 64fe8a7730ae2..0bb422658df25 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1411,7 +1411,7 @@ def test_mi_sparse(self): "display_value": "a", "is_visible": True, "type": "th", - "attributes": ["rowspan=2"], + "attributes": ['rowspan="2"'], "class": "row_heading level0 row0", "id": "level0_row0", } @@ -1740,6 +1740,15 @@ def test_colspan_w3(self): s = Styler(df, uuid="_", cell_ids=False) assert '<th class="col_heading level0 col0" colspan="2">l0</th>' in s.render() + def test_rowspan_w3(self): + # GH 38533 + df = DataFrame(data=[[1, 2]], index=[["l0", "l0"], ["l1a", "l1b"]]) + s = Styler(df, uuid="_", cell_ids=False) + assert ( + '<th id="T___level0_row0" class="row_heading level0 row0" rowspan="2">l0</th>' in s.render() + ) + @pytest.mark.parametrize("len_", [1, 5, 32, 33, 100]) def test_uuid_len(self, len_): # GH 36345 diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index a9673ded7c377..ef4de5961a696 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -545,12 +545,12 @@ def test_to_csv_zip_arguments(self, compression, archive_name): df.to_csv( path, compression={"method": compression, "archive_name": archive_name} ) - zp = ZipFile(path) - expected_arcname = path if archive_name is None else archive_name - expected_arcname = os.path.basename(expected_arcname) - assert len(zp.filelist) == 1 - archived_file = os.path.basename(zp.filelist[0].filename) - assert archived_file == expected_arcname + with ZipFile(path) as zp: + expected_arcname = path if archive_name is None else archive_name + expected_arcname = os.path.basename(expected_arcname) + assert len(zp.filelist) == 1 + archived_file = os.path.basename(zp.filelist[0].filename) + assert archived_file == expected_arcname
@pytest.mark.parametrize("df_new_type", ["Int64"]) def test_to_csv_na_rep_long_string(self, df_new_type): @@ -640,3 +640,25 @@ def test_to_csv_encoding_binary_handle(self, mode): handle.seek(0) assert handle.read().startswith(b'\xef\xbb\xbf""') + + +def test_to_csv_iterative_compression_name(compression): + # GH 38714 + df = tm.makeDataFrame() + with tm.ensure_clean() as path: + df.to_csv(path, compression=compression, chunksize=1) + tm.assert_frame_equal( + pd.read_csv(path, compression=compression, index_col=0), df + ) + + +def test_to_csv_iterative_compression_buffer(compression): + # GH 38714 + df = tm.makeDataFrame() + with io.BytesIO() as buffer: + df.to_csv(buffer, compression=compression, chunksize=1) + buffer.seek(0) + tm.assert_frame_equal( + pd.read_csv(buffer, compression=compression, index_col=0), df + ) + assert not buffer.closed diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 4f1af132204bb..968ad63eaceef 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -2,9 +2,12 @@ ExcelFormatter is tested implicitly in pandas/tests/io/excel """ +import string import pytest +import pandas.util._test_decorators as td + import pandas._testing as tm from pandas.io.formats.css import CSSWarning @@ -313,3 +316,18 @@ def test_css_to_excel_bad_colors(input_color): with tm.assert_produces_warning(CSSWarning): convert = CSSToExcelConverter() assert expected == convert(css) + + +def tests_css_named_colors_valid(): + upper_hexs = set(map(str.upper, string.hexdigits)) + for color in CSSToExcelConverter.NAMED_COLORS.values(): + assert len(color) == 6 and all(c in upper_hexs for c in color) + + +@td.skip_if_no_mpl +def test_css_named_colors_from_mpl_present(): + from matplotlib.colors import CSS4_COLORS as mpl_colors + + pd_colors = CSSToExcelConverter.NAMED_COLORS + for name, color in mpl_colors.items(): + assert name in pd_colors and pd_colors[name] == color[1:] diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 4bbd81ada995b..099d99507e136 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -252,3 +252,31 @@ def test_readjson_lines_chunks_fileurl(datapath): with pd.read_json(file_url, lines=True, chunksize=1) as url_reader: for index, chuck in enumerate(url_reader): tm.assert_frame_equal(chuck, df_list_expected[index]) + + +def test_chunksize_is_incremental(): + # See https://github.com/pandas-dev/pandas/issues/34548 + jsonl = ( + """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}\n""" + * 1000 + ) + + class MyReader: + def __init__(self, contents): + self.read_count = 0 + self.stringio = StringIO(contents) + + def read(self, *args): + self.read_count += 1 + return self.stringio.read(*args) + + def __iter__(self): + self.read_count += 1 + return iter(self.stringio) + + reader = MyReader(jsonl) + assert len(list(pd.read_json(reader, lines=True, chunksize=100))) > 1 + assert reader.read_count > 10 diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index e8893b4c02238..ec098353960d7 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -97,6 +97,33 @@ def python_parser_only(request): return request.param +def _get_all_parser_float_precision_combinations(): + """ + Return all allowable parser and float precision + combinations and corresponding ids. 
+ """ + params = [] + ids = [] + for parser, parser_id in zip(_all_parsers, _all_parser_ids): + for precision in parser.float_precision_choices: + params.append((parser, precision)) + ids.append(f"{parser_id}-{precision}") + + return {"params": params, "ids": ids} + + +@pytest.fixture( + params=_get_all_parser_float_precision_combinations()["params"], + ids=_get_all_parser_float_precision_combinations()["ids"], +) +def all_parsers_all_precisions(request): + """ + Fixture for all allowable combinations of parser + and float precision + """ + return request.param + + _utf_values = [8, 16, 32] _encoding_seps = ["", "-", "_"] diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c8ed0d75b13a2..8871ea7205a46 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -8,13 +8,16 @@ from inspect import signature from io import BytesIO, StringIO import os +from pathlib import Path import platform from urllib.error import URLError +import warnings import numpy as np import pytest from pandas._libs.tslib import Timestamp +from pandas.compat import is_platform_linux from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td @@ -1258,15 +1261,14 @@ def test_float_parser(all_parsers): tm.assert_frame_equal(result, expected) -def test_scientific_no_exponent(all_parsers): +def test_scientific_no_exponent(all_parsers_all_precisions): # see gh-12215 df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) data = df.to_csv(index=False) - parser = all_parsers + parser, precision = all_parsers_all_precisions - for precision in parser.float_precision_choices: - df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) - tm.assert_frame_equal(df_roundtrip, df) + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) @@ -1350,6 +1352,35 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{neg_exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"data": [0.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + if precision == "round_trip": + if exp == 999999999999999999 and is_platform_linux(): + mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") + request.node.add_marker(mark) + + value = np.inf if exp > 0 else 0.0 + expected = DataFrame({"data": [value]}) + else: + expected = DataFrame({"data": [f"10E{exp}"]}) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -2340,3 +2371,22 @@ def test_context_manageri_user_provided(all_parsers, datapath): assert False except AssertionError: assert not reader._engine.handles.handle.closed + + +@td.check_file_leaks +def 
test_open_file(all_parsers): + # GH 39024 + parser = all_parsers + if parser.engine == "c": + pytest.skip() + + with tm.ensure_clean() as path: + file = Path(path) + file.write_bytes(b"\xe4\na\n1") + + # should not trigger a ResourceWarning + warnings.simplefilter("always", category=ResourceWarning) + with warnings.catch_warnings(record=True) as record: + with pytest.raises(csv.Error, match="Could not determine delimiter"): + parser.read_csv(file, sep=None) + assert len(record) == 0, record[0].message diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 97f82b9a01a9a..11e14ac61a831 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -208,6 +208,7 @@ def test_read_s3_fails(self, s3so): with pytest.raises(IOError): read_csv("s3://cant_get_it/file.csv") + @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False) def test_write_s3_csv_fails(self, tips_df, s3so): # GH 32486 # Attempting to write to an invalid S3 path should raise @@ -223,6 +224,7 @@ def test_write_s3_csv_fails(self, tips_df, s3so): "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so ) + @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False) @td.skip_if_no("pyarrow") def test_write_s3_parquet_fails(self, tips_df, s3so): # GH 27679 diff --git a/pandas/tests/io/pytables/__init__.py b/pandas/tests/io/pytables/__init__.py index fb4b317a5e977..d3735f8863c3b 100644 --- a/pandas/tests/io/pytables/__init__.py +++ b/pandas/tests/io/pytables/__init__.py @@ -6,4 +6,7 @@ "ignore:a closed node found in the registry:UserWarning" ), pytest.mark.filterwarnings(r"ignore:tostring\(\) is deprecated:DeprecationWarning"), + pytest.mark.filterwarnings( + r"ignore:`np\.object` is a deprecated alias:DeprecationWarning" + ), ] diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index b35414724d946..7b3b01aef8244 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -2358,17 +2358,13 @@ def test_series(self, setup_path): ts = tm.makeTimeSeries() self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) - with tm.assert_produces_warning(FutureWarning): - # auto-casting object->DatetimeIndex deprecated - ts2 = Series(ts.index, Index(ts.index, dtype=object)) + ts2 = Series(ts.index, Index(ts.index, dtype=object)) self._check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) - with tm.assert_produces_warning(FutureWarning): - # auto-casting object->DatetimeIndex deprecated - ts3 = Series( - ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object) - ) - self._check_roundtrip(ts3, tm.assert_series_equal, path=setup_path) + ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) + self._check_roundtrip( + ts3, tm.assert_series_equal, path=setup_path, check_index_type=False + ) def test_float_index(self, setup_path): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index c3b21daa0ac04..540f12841de1b 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -1,7 +1,8 @@ """ Tests for the pandas.io.common functionalities """ -from io import StringIO +import codecs +from io import BytesIO, StringIO import mmap import os from pathlib import Path @@ -85,6 +86,13 @@ def test_stringify_path_fspath(self): result = icom.stringify_path(p) assert result == "foo/bar.csv" + def test_stringify_file_and_path_like(self): + # GH 38125: do not stringify file 
objects that are also path-like + fsspec = pytest.importorskip("fsspec") + with tm.ensure_clean() as path: + with fsspec.open(f"file://{path}", mode="wb") as fsspec_obj: + assert fsspec_obj == icom.stringify_path(fsspec_obj) + @pytest.mark.parametrize( "extension,expected", [ @@ -411,3 +419,57 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("random:pandas/somethingelse.com") assert not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") + + +def test_default_errors(): + # GH 38989 + with tm.ensure_clean() as path: + file = Path(path) + file.write_bytes(b"\xe4\na\n1") + tm.assert_frame_equal(pd.read_csv(file, skiprows=[0]), pd.DataFrame({"a": [1]})) + + +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +@pytest.mark.parametrize("format", ["csv", "json"]) +def test_codecs_encoding(encoding, format): + # GH39247 + expected = tm.makeDataFrame() + with tm.ensure_clean() as path: + with codecs.open(path, mode="w", encoding=encoding) as handle: + getattr(expected, f"to_{format}")(handle) + with codecs.open(path, mode="r", encoding=encoding) as handle: + if format == "csv": + df = pd.read_csv(handle, index_col=0) + else: + df = pd.read_json(handle) + tm.assert_frame_equal(expected, df) + + +def test_codecs_get_writer_reader(): + # GH39247 + expected = tm.makeDataFrame() + with tm.ensure_clean() as path: + with open(path, "wb") as handle: + with codecs.getwriter("utf-8")(handle) as encoded: + expected.to_csv(encoded) + with open(path, "rb") as handle: + with codecs.getreader("utf-8")(handle) as encoded: + df = pd.read_csv(encoded, index_col=0) + tm.assert_frame_equal(expected, df) + + +@pytest.mark.parametrize( + "io_class,mode,msg", + [ + (BytesIO, "t", "a bytes-like object is required, not 'str'"), + (StringIO, "b", "string argument expected, got 'bytes'"), + ], +) +def test_explicit_encoding(io_class, mode, msg): + # GH39247; this test makes sure that if a user provides mode="*t" or "*b", + # it is used. 
In the case of this test it leads to an error as intentionally the + # wrong mode is requested + expected = tm.makeDataFrame() + with io_class() as buffer: + with pytest.raises(TypeError, match=msg): + expected.to_csv(buffer, mode=f"w{mode}") diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index ba8b1a8a0679d..aed1aaedf2fa3 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -129,6 +129,7 @@ def test_to_html_compat(self): res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) + @pytest.mark.xfail(reason="Html file was removed") @tm.network def test_banklist_url_positional_match(self): url = "https://www.fdic.gov/bank/individual/failed/banklist.html" @@ -142,6 +143,7 @@ def test_banklist_url_positional_match(self): assert_framelist_equal(df1, df2) + @pytest.mark.xfail(reason="Html file was removed") @tm.network def test_banklist_url(self): url = "https://www.fdic.gov/bank/individual/failed/banklist.html" diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fe3ca0d0937b3..a9357ef89de92 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -671,12 +671,7 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so): @pytest.mark.parametrize( "partition_col", [ - pytest.param( - ["A"], - marks=pytest.mark.xfail( - PY38, reason="Getting back empty DataFrame", raises=AssertionError - ), - ), + ["A"], [], ], ) @@ -885,7 +880,7 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): # this use-case sets the resolution to 1 minute check_round_trip(df, pa, check_dtype=False) - @td.skip_if_no("pyarrow", min_version="0.17") + @td.skip_if_no("pyarrow", min_version="1.0.0") def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 df = pd.DataFrame({"a": list(range(0, 3))}) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 34b36e2549b62..24844c4f2eb85 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -13,6 +13,7 @@ import bz2 import datetime import functools +from functools import partial import glob import gzip import io @@ -588,3 +589,14 @@ def test_pickle_preserves_block_ndim(): # GH#37631 OP issue was about indexing, underlying problem was pickle tm.assert_series_equal(res[[True]], ser) + + +@pytest.mark.parametrize("protocol", [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL]) +def test_pickle_big_dataframe_compression(protocol, compression): + # GH#39002 + df = pd.DataFrame(range(100000)) + result = tm.round_trip_pathlib( + partial(df.to_pickle, protocol=protocol, compression=compression), + partial(pd.read_pickle, compression=compression), + ) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 0195b61d13798..16d4bc65094f8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1125,6 +1125,15 @@ def test_query_by_select_obj(self): all_names = set(iris_df["Name"]) assert all_names == {"Iris-setosa"} + def test_column_with_percentage(self): + # GH 37157 + df = DataFrame({"A": [0, 1, 2], "%_variation": [3, 4, 5]}) + df.to_sql("test_column_percentage", self.conn, index=False) + + res = sql.read_sql_table("test_column_percentage", self.conn) + + tm.assert_frame_equal(res, df) + class _EngineToConnMixin: """ @@ -1185,7 +1194,7 @@ def test_sql_open_close(self): @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") def 
test_con_string_import_error(self): - conn = "mysql://root@localhost/pandas_nosetest" + conn = "mysql://root@localhost/pandas" msg = "Using URI string without sqlalchemy installed" with pytest.raises(ImportError, match=msg): sql.read_sql("SELECT * FROM iris", conn) @@ -1922,11 +1931,12 @@ class _TestMySQLAlchemy: """ flavor = "mysql" + port = 3306 @classmethod def connect(cls): return sqlalchemy.create_engine( - f"mysql+{cls.driver}://root@localhost/pandas_nosetest", + f"mysql+{cls.driver}://root@localhost:{cls.port}/pandas", connect_args=cls.connect_args, ) @@ -1991,11 +2001,12 @@ class _TestPostgreSQLAlchemy: """ flavor = "postgresql" + port = 5432 @classmethod def connect(cls): return sqlalchemy.create_engine( - f"postgresql+{cls.driver}://postgres@localhost/pandas_nosetest" + f"postgresql+{cls.driver}://postgres:postgres@localhost:{cls.port}/pandas" ) @classmethod @@ -2611,7 +2622,7 @@ class TestXMySQL(MySQLMixIn): @pytest.fixture(autouse=True, scope="class") def setup_class(cls): pymysql = pytest.importorskip("pymysql") - pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas") try: pymysql.connect(read_default_group="pandas") except pymysql.ProgrammingError as err: @@ -2631,7 +2642,7 @@ def setup_class(cls): @pytest.fixture(autouse=True) def setup_method(self, request, datapath): pymysql = pytest.importorskip("pymysql") - pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas") try: pymysql.connect(read_default_group="pandas") except pymysql.ProgrammingError as err: diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 24944281419c3..0f9321fd4f96a 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -550,6 +550,7 @@ def test_invalid_timestamp(self, version): msg = "time_stamp should be datetime type" with pytest.raises(ValueError, match=msg): original.to_stata(path, time_stamp=time_stamp, version=version) + assert not os.path.isfile(path) def test_numeric_column_names(self): original = DataFrame(np.reshape(np.arange(25.0), (5, 5))) @@ -1916,10 +1917,10 @@ def test_compression_dict(method, file_ext): compression = {"method": method, "archive_name": archive_name} df.to_stata(path, compression=compression) if method == "zip" or file_ext == "zip": - zp = zipfile.ZipFile(path, "r") - assert len(zp.filelist) == 1 - assert zp.filelist[0].filename == archive_name - fp = io.BytesIO(zp.read(zp.filelist[0])) + with zipfile.ZipFile(path, "r") as zp: + assert len(zp.filelist) == 1 + assert zp.filelist[0].filename == archive_name + fp = io.BytesIO(zp.read(zp.filelist[0])) else: fp = path reread = read_stata(fp, index_col="index") diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index c66334065ea63..68e693cdb85e2 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -2176,80 +2176,6 @@ def test_xlabel_ylabel_dataframe_plane_plot(self, kind, xlabel, ylabel): assert ax.get_xlabel() == (xcol if xlabel is None else xlabel) assert ax.get_ylabel() == (ycol if ylabel is None else ylabel) - @pytest.mark.parametrize("method", ["bar", "barh"]) - def test_bar_ticklabel_consistence(self, method): - # Draw two consecutiv bar plot with consistent ticklabels - # The labels positions should not move between two drawing on the same axis - # GH: 26186 - def 
get_main_axis(ax): - if method == "barh": - return ax.yaxis - elif method == "bar": - return ax.xaxis - - # Plot the first bar plot - data = {"A": 0, "B": 3, "C": -4} - df = DataFrame.from_dict(data, orient="index", columns=["Value"]) - ax = getattr(df.plot, method)() - ax.get_figure().canvas.draw() - - # Retrieve the label positions for the first drawing - xticklabels = [t.get_text() for t in get_main_axis(ax).get_ticklabels()] - label_positions_1 = dict(zip(xticklabels, get_main_axis(ax).get_ticklocs())) - - # Modify the dataframe order and values and plot on same axis - df = df.sort_values("Value") * -2 - ax = getattr(df.plot, method)(ax=ax, color="red") - ax.get_figure().canvas.draw() - - # Retrieve the label positions for the second drawing - xticklabels = [t.get_text() for t in get_main_axis(ax).get_ticklabels()] - label_positions_2 = dict(zip(xticklabels, get_main_axis(ax).get_ticklocs())) - - # Assert that the label positions did not change between the plotting - assert label_positions_1 == label_positions_2 - - def test_bar_numeric(self): - # Bar plot with numeric index have tick location values equal to index - # values - # GH: 11465 - df = DataFrame(np.random.rand(10), index=np.arange(10, 20)) - ax = df.plot.bar() - ticklocs = ax.xaxis.get_ticklocs() - expected = np.arange(10, 20, dtype=np.int64) - tm.assert_numpy_array_equal(ticklocs, expected) - - def test_bar_multiindex(self): - # Test from pandas/doc/source/user_guide/visualization.rst - # at section Plotting With Error Bars - # Related to issue GH: 26186 - - ix3 = pd.MultiIndex.from_arrays( - [ - ["a", "a", "a", "a", "b", "b", "b", "b"], - ["foo", "foo", "bar", "bar", "foo", "foo", "bar", "bar"], - ], - names=["letter", "word"], - ) - - df3 = DataFrame( - {"data1": [3, 2, 4, 3, 2, 4, 3, 2], "data2": [6, 5, 7, 5, 4, 5, 6, 5]}, - index=ix3, - ) - - # Group by index labels and take the means and standard deviations - # for each group - gp3 = df3.groupby(level=("letter", "word")) - means = gp3.mean() - errors = gp3.std() - - # No assertion we just ensure that we can plot a MultiIndex bar plot - # and are getting a UserWarning if redrawing - with tm.assert_produces_warning(None): - ax = means.plot.bar(yerr=errors, capsize=4) - with tm.assert_produces_warning(UserWarning): - means.plot.bar(yerr=errors, capsize=4, ax=ax) - def _generate_4_axes_via_gridspec(): import matplotlib as mpl diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 397a064f6adad..66a4f9598c49b 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -277,16 +277,9 @@ def test_irreg_hf(self): _, ax = self.plt.subplots() df2 = df.copy() df2.index = df.index.astype(object) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # This warning will be emitted - # pandas/core/frame.py:3216: - # FutureWarning: Automatically casting object-dtype Index of datetimes - # to DatetimeIndex is deprecated and will be removed in a future version. - # Explicitly cast to DatetimeIndex instead. 
- # return klass(values, index=self.index, name=name, fastpath=True) - df2.plot(ax=ax) - diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() - assert (np.fabs(diffs[1:] - sec) < 1e-8).all() + df2.plot(ax=ax) + diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() + assert (np.fabs(diffs[1:] - sec) < 1e-8).all() def test_irregular_datetime64_repr_bug(self): ser = tm.makeTimeSeries() @@ -997,16 +990,9 @@ def test_irreg_dtypes(self): # np.datetime64 idx = date_range("1/1/2000", periods=10) idx = idx[[0, 2, 5, 9]].astype(object) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - # This warning will be emitted - # pandas/core/frame.py:3216: - # FutureWarning: Automatically casting object-dtype Index of datetimes - # to DatetimeIndex is deprecated and will be removed in a future version. - # Explicitly cast to DatetimeIndex instead. - # return klass(values, index=self.index, name=name, fastpath=True) - df = DataFrame(np.random.randn(len(idx), 3), idx) - _, ax = self.plt.subplots() - _check_plot_works(df.plot, ax=ax) + df = DataFrame(np.random.randn(len(idx), 3), idx) + _, ax = self.plt.subplots() + _check_plot_works(df.plot, ax=ax) def test_time(self): t = datetime(1, 1, 1, 3, 30, 0) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 8c2297699807d..94afa204db891 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -17,6 +17,7 @@ Timedelta, TimedeltaIndex, Timestamp, + date_range, isna, timedelta_range, to_timedelta, @@ -923,6 +924,48 @@ def test_any_axis1_bool_only(self): expected = Series([True, False]) tm.assert_series_equal(result, expected) + def test_any_all_datetimelike(self): + # GH#38723 these may not be the desired long-term behavior (GH#34479) + # but in the interim should be internally consistent + dta = date_range("1995-01-02", periods=3)._data + ser = Series(dta) + df = DataFrame(ser) + + assert dta.all() + assert dta.any() + + assert ser.all() + assert ser.any() + + assert df.any().all() + assert df.all().all() + + dta = dta.tz_localize("UTC") + ser = Series(dta) + df = DataFrame(ser) + + assert dta.all() + assert dta.any() + + assert ser.all() + assert ser.any() + + assert df.any().all() + assert df.all().all() + + tda = dta - dta[0] + ser = Series(tda) + df = DataFrame(ser) + + assert tda.any() + assert not tda.all() + + assert ser.any() + assert not ser.all() + + assert df.any().all() + assert not df.all().any() + def test_timedelta64_analytics(self): # index min/max diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 15dd49f8bf182..da5bb0eb59f70 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -362,3 +362,39 @@ def test_apply_to_one_column_of_df(): tm.assert_series_equal(result, expected) result = df.resample("H").apply(lambda group: group["col"].sum()) tm.assert_series_equal(result, expected) + + +def test_resample_groupby_agg(): + # GH: 33548 + df = DataFrame( + { + "cat": [ + "cat_1", + "cat_1", + "cat_2", + "cat_1", + "cat_2", + "cat_1", + "cat_2", + "cat_1", + ], + "num": [5, 20, 22, 3, 4, 30, 10, 50], + "date": [ + "2019-2-1", + "2018-02-03", + "2020-3-11", + "2019-2-2", + "2019-2-2", + "2018-12-4", + "2020-3-11", + "2020-12-12", + ], + } + ) + df["date"] = pd.to_datetime(df["date"]) + + resampled = df.groupby("cat").resample("Y", on="date") + expected = resampled.sum() + result = 
resampled.agg({"num": "sum"}) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index babc8124877e9..295846ee1b264 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -167,14 +167,3 @@ def test_concat_dataframe_keys_bug(self, sort): # it works result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) assert list(result.columns) == [("t1", "value"), ("t2", "value")] - - def test_concat_duplicate_indexes(self): - # GH#36263 ValueError with non unique indexes - df1 = DataFrame([1, 2, 3, 4], index=[0, 1, 1, 4], columns=["a"]) - df2 = DataFrame([6, 7, 8, 9], index=[0, 0, 1, 3], columns=["b"]) - result = concat([df1, df2], axis=1) - expected = DataFrame( - {"a": [1, 1, 2, 3, np.nan, 4], "b": [6, 7, 8, 8, 9, np.nan]}, - index=Index([0, 0, 1, 1, 3, 4]), - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f43ae58fbcc2f..e4aab4d1ef92c 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2349,3 +2349,20 @@ def test_merge_join_cols_error_reporting_on_and_index(func, kwargs): ) with pytest.raises(MergeError, match=msg): getattr(pd, func)(left, right, on="a", **kwargs) + + +def test_merge_right_left_index(): + # GH#38616 + left = DataFrame({"x": [1, 1], "z": ["foo", "foo"]}) + right = DataFrame({"x": [1, 1], "z": ["foo", "foo"]}) + result = pd.merge(left, right, how="right", left_index=True, right_on="x") + expected = DataFrame( + { + "x": [1, 1], + "x_x": [1, 1], + "z_x": ["foo", "foo"], + "x_y": [1, 1], + "z_y": ["foo", "foo"], + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 2ea7602b00206..20de0effc30e1 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -575,6 +575,40 @@ def test_nat_comparisons_invalid(other, op): op(other, NaT) +def test_compare_date(): + # GH#39151 comparing NaT with date object is deprecated + # See also: tests.scalar.timestamps.test_comparisons::test_compare_date + + dt = Timestamp.now().to_pydatetime().date() + + for left, right in [(NaT, dt), (dt, NaT)]: + assert not left == right + assert left != right + + with tm.assert_produces_warning(FutureWarning): + assert not left < right + with tm.assert_produces_warning(FutureWarning): + assert not left <= right + with tm.assert_produces_warning(FutureWarning): + assert not left > right + with tm.assert_produces_warning(FutureWarning): + assert not left >= right + + # Once the deprecation is enforced, the following assertions + # can be enabled: + # assert not left == right + # assert left != right + # + # with pytest.raises(TypeError): + # left < right + # with pytest.raises(TypeError): + # left <= right + # with pytest.raises(TypeError): + # left > right + # with pytest.raises(TypeError): + # left >= right + + @pytest.mark.parametrize( "obj", [ diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index aaa58cdb390f7..c2219e9fd45a6 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone import numpy as np import pytest @@ -13,6 +13,7 @@ Series, Timedelta, Timestamp, + date_range, isna, ) import 
pandas._testing as tm @@ -711,6 +712,14 @@ def test_fillna_method_and_limit_invalid(self): with pytest.raises(ValueError, match=msg): ser.fillna(1, limit=limit, method=method) + def test_fillna_datetime64_with_timezone_tzinfo(self): + # https://github.com/pandas-dev/pandas/issues/38851 + s = Series(date_range("2020", periods=3, tz="UTC")) + expected = s.astype(object) + s[1] = NaT + result = s.fillna(datetime(2020, 1, 2, tzinfo=timezone.utc)) + tm.assert_series_equal(result, expected) + class TestFillnaPad: def test_fillna_bug(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 5b13091470b09..8e0d2193ad999 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1614,8 +1614,7 @@ def test_constructor_infer_index_tz(self): class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): idx = tm.makeDateIndex(10000) - with tm.assert_produces_warning(FutureWarning): - ser = Series(np.random.randn(len(idx)), idx.astype(object)) + ser = Series(np.random.randn(len(idx)), idx.astype(object)) with tm.assert_produces_warning(FutureWarning): assert ser.index.is_all_dates assert isinstance(ser.index, DatetimeIndex) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 75e7f8a17eda3..836135c3d6310 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -184,9 +184,7 @@ def test_timeseries_repr_object_dtype(self): index = Index( [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object ) - with tm.assert_produces_warning(FutureWarning): - # Index.is_all_dates deprecated - ts = Series(np.random.randn(len(index)), index) + ts = Series(np.random.randn(len(index)), index) repr(ts) ts = tm.makeTimeSeries(1000) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 35411d7e9cfb7..ac97ff7af262d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2410,14 +2410,9 @@ def test_diff_ea_axis(self): with pytest.raises(ValueError, match=msg): algos.diff(dta, 1, axis=1) - -@pytest.mark.parametrize( - "left_values", [[0, 1, 1, 4], [0, 1, 1, 4, 4], [0, 1, 1, 1, 4]] -) -def test_make_duplicates_of_left_unique_in_right(left_values): - # GH#36263 - left = np.array(left_values) - right = np.array([0, 0, 1, 1, 4]) - result = algos.make_duplicates_of_left_unique_in_right(left, right) - expected = np.array([0, 0, 1, 4]) - tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", ["int8", "int16"]) + def test_diff_low_precision_int(self, dtype): + arr = np.array([0, 1, 1, 0, 0], dtype=dtype) + result = algos.diff(arr, 1) + expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32") + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 538a52d84b73a..a15b2d03079d4 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3670,3 +3670,11 @@ def test_str_get_stringarray_multiple_nans(): result = s.str.get(2) expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"])) tm.assert_series_equal(result, expected) + + +def test_str_accessor_in_apply_func(): + # https://github.com/pandas-dev/pandas/issues/38979 + df = DataFrame(zip("abc", "def")) + expected = Series(["A/D", "B/E", "C/F"]) + result = df.apply(lambda f: "/".join(f.str.upper()), axis=1) + tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 8d9b54cf3f0df..edb0f8c7dd662 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -10,6 +10,7 @@ import warnings from hypothesis import assume, given, strategies as st +from hypothesis.errors import Flaky from hypothesis.extra.dateutil import timezones as dateutil_timezones from hypothesis.extra.pytz import timezones as pytz_timezones import pytest @@ -103,6 +104,7 @@ def test_on_offset_implementations(dt, offset): assert offset.is_on_offset(dt) == (compare == dt) +@pytest.mark.xfail(strict=False, raises=Flaky, reason="unreliable test timings") @given(gen_yqm_offset) def test_shift_across_dst(offset): # GH#18319 check that 1) timezone is correctly normalized and diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 8034ace479a62..f8539e9031d28 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -299,3 +299,25 @@ def test_allows_duplicate_labels(): with pytest.raises(AssertionError, match=" Optional[str]: @@ -83,7 +83,7 @@ def _get_dependency_info() -> Dict[str, JSONSerializable]: mod = import_optional_dependency( modname, raise_on_missing=False, on_version="ignore" ) - result[modname] = _get_version(mod) if mod else None + result[modname] = get_version(mod) if mod else None return result diff --git a/requirements-dev.txt b/requirements-dev.txt index 17ca6b8401501..33a315884612d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # This file is auto-generated from environment.yml, do not modify. # See that file for comments about the need/usage of each dependency. 
-numpy>=1.16.5 +numpy>=1.16.5, <1.20 python-dateutil>=2.7.3 pytz asv @@ -42,7 +42,7 @@ pytest-instafail seaborn statsmodels ipywidgets -nbformat +nbformat==5.0.8 notebook>=5.7.5 pip blosc diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 9c58a55cb907e..8f48d518a737b 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -29,7 +29,6 @@ "_doc_template", "_agg_template", "_pipe_template", - "_get_version", "__main__", "_transform_template", "_flex_comp_doc_FRAME", diff --git a/setup.py b/setup.py index 0b1007794bbdb..f9c4a1158fee0 100755 --- a/setup.py +++ b/setup.py @@ -421,6 +421,8 @@ def run(self): extra_compile_args.append("-Werror") if debugging_symbols_requested: extra_compile_args.append("-g") + extra_compile_args.append("-UNDEBUG") + extra_compile_args.append("-O0") # Build for at least macOS 10.9 when compiling on a 10.9 system or above, # overriding CPython distuitls behaviour which is to target the version that @@ -433,7 +435,7 @@ def run(self): "MACOSX_DEPLOYMENT_TARGET", current_system ) if ( - LooseVersion(python_target) < "10.9" + LooseVersion(str(python_target)) < "10.9" and LooseVersion(current_system) >= "10.9" ): os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9" diff --git a/test_fast.bat b/test_fast.bat index f2c4e9fa71fcd..34c61fea08ab4 100644 --- a/test_fast.bat +++ b/test_fast.bat @@ -1,3 +1,3 @@ :: test on windows set PYTHONHASHSEED=314159265 -pytest --skip-slow --skip-network --skip-db -m "not single" -n 4 -r sXX --strict pandas +pytest --skip-slow --skip-network --skip-db -m "not single" -n 4 -r sXX --strict-markers pandas diff --git a/test_fast.sh b/test_fast.sh index 0a47f9de600ea..6444b81b3c6da 100755 --- a/test_fast.sh +++ b/test_fast.sh @@ -5,4 +5,4 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -pytest pandas --skip-slow --skip-network --skip-db -m "not single" -n 4 -r sxX --strict "$@" +pytest pandas --skip-slow --skip-network --skip-db -m "not single" -n 4 -r sxX --strict-markers "$@"
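
The new `all_parsers_all_precisions` fixture near the top of this patch pre-computes every (parser, float_precision) pair together with a readable test id before handing both lists to `pytest.fixture`. The sketch below is a minimal, self-contained illustration of that pattern only; `FakeParser`, `_parsers`, and `parser_and_precision` are stand-in names for illustration and are not pandas APIs.

    import pytest


    class FakeParser:
        # Stand-in for the pandas parser fixtures; only the attribute
        # consulted by the combination helper is modelled here.
        float_precision_choices = [None, "high", "round_trip"]

        def __init__(self, engine):
            self.engine = engine


    _parsers = [FakeParser("c"), FakeParser("python")]
    _parser_ids = ["c", "python"]


    def _all_combinations():
        # Build params and ids in lockstep so pytest reports e.g. "c-round_trip".
        params, ids = [], []
        for parser, parser_id in zip(_parsers, _parser_ids):
            for precision in parser.float_precision_choices:
                params.append((parser, precision))
                ids.append(f"{parser_id}-{precision}")
        return params, ids


    _params, _ids = _all_combinations()


    @pytest.fixture(params=_params, ids=_ids)
    def parser_and_precision(request):
        """Yield every allowable (parser, float_precision) combination."""
        return request.param


    def test_each_combination(parser_and_precision):
        parser, precision = parser_and_precision
        assert precision in parser.float_precision_choices

`test_too_many_exponent_digits` in test_common.py uses a second pattern worth noting: rather than decorating the whole test, it attaches an xfail marker from inside the test body once the failing parametrization is known. `request.node.add_marker` accepts the same marker objects as the decorator form, so the sketch below, with a hypothetical platform check and a placeholder reason, behaves like a conditional `@pytest.mark.xfail`.

    import sys

    import pytest


    @pytest.mark.parametrize("exp", [300, 999_999_999])
    def test_conditional_xfail(exp, request):
        if exp > 10**6 and sys.platform.startswith("linux"):
            # Marker is applied only to this parametrization on Linux;
            # strict is left False, so an unexpected pass is not an error.
            request.node.add_marker(pytest.mark.xfail(reason="illustrative only"))
        assert float(f"1E{min(exp, 300)}") > 0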