diff --git a/README.md b/README.md index a72e8402e68a0..a2f2f1c04442a 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ its way towards this goal. Here are just a few of the things that pandas does well: - Easy handling of [**missing data**][missing-data] (represented as - `NaN`) in floating point as well as non-floating point data + `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data - Size mutability: columns can be [**inserted and deleted**][insertion-deletion] from DataFrame and higher dimensional objects diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 4583fac85b776..1863a17e3d5f7 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -26,7 +26,7 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. // "pythons": ["2.7", "3.4"], - "pythons": ["3.6"], + "pythons": ["3.8"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index a6552aa096a22..cc996f4077cd9 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -21,6 +21,7 @@ dependencies: - lxml - matplotlib>=3.3.0 - moto + - flask - nomkl - numexpr - numpy=1.16.* diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml index e8ffd3d74ca5e..d17a8a2b0ed9b 100644 --- a/ci/deps/azure-37-slow.yaml +++ b/ci/deps/azure-37-slow.yaml @@ -27,9 +27,11 @@ dependencies: - python-dateutil - pytz - s3fs>=0.4.0 + - moto>=1.3.14 - scipy - sqlalchemy - xlrd - xlsxwriter - xlwt - moto + - flask diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index c466a5929ea29..bb40127b672d3 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -6,14 +6,15 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0 # https://github.com/pandas-dev/pandas/issues/35620 + - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-asyncio + - pytest-asyncio>=0.12.0 - hypothesis>=3.58.0 - pytest-azurepipelines # pandas dependencies - beautifulsoup4 + - flask - html5lib - ipython - jinja2 @@ -32,6 +33,7 @@ dependencies: - xlrd - xlsxwriter - xlwt + - moto - pyarrow>=0.15 - pip - pip: diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index f4c238ab8b173..1d15ca41c0f8e 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -8,20 +8,21 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21,<2.0.0 # GH 35737 + - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines # pandas dependencies - beautifulsoup4 - bottleneck - - fsspec>=0.7.4 + - fsspec>=0.8.0 - gcsfs>=0.6.0 - html5lib - jinja2 - lxml - matplotlib=2.2.* - - moto + - moto>=1.3.14 + - flask - numexpr - numpy=1.16.* - openpyxl @@ -29,7 +30,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs>=0.4.0 + - s3fs>=0.4.2 - scipy - sqlalchemy - xlrd diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 1f383164b5328..23bede5eb26f1 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21,<2.0.0 # GH 35737 + - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines @@ -16,7 +16,10 @@ dependencies: - blosc - bottleneck - fastparquet>=0.3.2 + - flask + - fsspec>=0.8.0 - matplotlib=3.1.3 + - moto>=1.3.14 - numba - 
numexpr - numpy=1.18.* @@ -26,6 +29,7 @@ dependencies: - pytables - python-dateutil - pytz + - s3fs>=0.4.0 - scipy - xlrd - xlsxwriter diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index 5cb53489be225..ea29cbef1272b 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -17,5 +17,6 @@ dependencies: - python-dateutil - pytz - pip + - flask - pip: - moto diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml index edc11bdf4ab35..33ee6dfffb1a3 100644 --- a/ci/deps/travis-37-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -23,7 +23,8 @@ dependencies: - geopandas - html5lib - matplotlib - - moto + - moto>=1.3.14 + - flask - nomkl - numexpr - numpy=1.16.* diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index 4427c1d940bf2..306f74a0101e3 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -21,13 +21,14 @@ dependencies: - jinja2 - lxml=4.3.0 - matplotlib=3.0.* - - moto - nomkl - numexpr - numpy - openpyxl - pandas-gbq=0.12.0 + - pyarrow>=0.17 - psycopg2=2.7 + - pyarrow>=0.15.0 # GH #35813 - pymysql=0.7.11 - pytables - python-dateutil diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index e896233aac63c..26d6c2910a7cc 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -20,8 +20,8 @@ dependencies: - pyarrow - pytz - s3fs>=0.4.0 + - moto>=1.3.14 + - flask - tabulate - pyreadstat - pip - - pip: - - moto diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 4ffd1d586a99a..e5c6f77eea3ef 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -204,6 +204,7 @@ You will need `Build Tools for Visual Studio 2017 You DO NOT need to install Visual Studio 2019. You only need "Build Tools for Visual Studio 2019" found by scrolling down to "All downloads" -> "Tools for Visual Studio 2019". + In the installer, select the "C++ build tools" workload. **Mac OS** diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index d331491d02883..efee21b5889ed 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -53,6 +53,32 @@ need to implement certain operations expected by pandas users (for example the algorithm used in, ``Series.str.upper``). That work may be done outside of pandas. +Consistent missing value handling +--------------------------------- + +Currently, pandas handles missing data differently for different data types. We +use different types to indicate that a value is missing (``np.nan`` for +floating-point data, ``np.nan`` or ``None`` for object-dtype data -- typically +strings or booleans -- with missing values, and ``pd.NaT`` for datetimelike +data). Integer data cannot store missing data or are cast to float. In addition, +pandas 1.0 introduced a new missing value sentinel, ``pd.NA``, which is being +used for the experimental nullable integer, boolean, and string data types. + +These different missing values have different behaviors in user-facing +operations. Specifically, we introduced different semantics for the nullable +data types for certain operations (e.g. propagating in comparison operations +instead of comparing as False). + +Long term, we want to introduce consistent missing data handling for all data +types. This includes consistent behavior in all operations (indexing, arithmetic +operations, comparisons, etc.). We want to eventually make the new semantics the +default. 
+ +This has been discussed at +`github #28095 `__ (and +linked issues), and described in more detail in this +`design doc `__. + Apache Arrow interoperability ----------------------------- diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index d8a40c5406dee..032ba73a7293d 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -9,9 +9,9 @@ Package overview **pandas** is a `Python `__ package providing fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the -fundamental high-level building block for doing practical, **real world** data +fundamental high-level building block for doing practical, **real-world** data analysis in Python. Additionally, it has the broader goal of becoming **the -most powerful and flexible open source data analysis / manipulation tool +most powerful and flexible open source data analysis/manipulation tool available in any language**. It is already well on its way toward this goal. pandas is well suited for many different kinds of data: @@ -21,7 +21,7 @@ pandas is well suited for many different kinds of data: - Ordered and unordered (not necessarily fixed-frequency) time series data. - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels - - Any other form of observational / statistical data sets. The data actually + - Any other form of observational / statistical data sets. The data need not be labeled at all to be placed into a pandas data structure The two primary data structures of pandas, :class:`Series` (1-dimensional) @@ -57,7 +57,7 @@ Here are just a few of the things that pandas does well: Excel files, databases, and saving / loading data from the ultrafast **HDF5 format** - **Time series**-specific functionality: date range generation and frequency - conversion, moving window statistics, date shifting and lagging. + conversion, moving window statistics, date shifting, and lagging. Many of these principles are here to address the shortcomings frequently experienced using other languages / scientific research environments. For data @@ -101,12 +101,12 @@ fashion. Also, we would like sensible default behaviors for the common API functions which take into account the typical orientation of time series and -cross-sectional data sets. When using ndarrays to store 2- and 3-dimensional +cross-sectional data sets. When using the N-dimensional array (ndarrays) to store 2- and 3-dimensional data, a burden is placed on the user to consider the orientation of the data set when writing functions; axes are considered more or less equivalent (except when C- or Fortran-contiguousness matters for performance). In pandas, the axes are intended to lend more semantic meaning to the data; i.e., for a particular -data set there is likely to be a "right" way to orient the data. The goal, +data set, there is likely to be a "right" way to orient the data. The goal, then, is to reduce the amount of mental effort required to code up data transformations in downstream functions. @@ -148,8 +148,8 @@ pandas possible. Thanks to `all of our contributors `. pandas is a `NumFOCUS `__ sponsored project. -This will help ensure the success of development of pandas as a world-class open-source -project, and makes it possible to `donate `__ to the project. 
+This will help ensure the success of the development of pandas as a world-class open-source +project and makes it possible to `donate `__ to the project. Project governance ------------------ diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 4c2d0621c6103..b8940d2efed2f 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -94,4 +94,4 @@ Various tutorials * `Intro to pandas data structures, by Greg Reda `_ * `Pandas and Python: Top 10, by Manish Amde `_ * `Pandas DataFrames Tutorial, by Karlijn Willems `_ -* `A concise tutorial with real life examples `_ +* `A concise tutorial with real life examples `_ diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index e3dfb552651a0..4d9d18e3d204e 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -343,6 +343,7 @@ Sparse-dtype specific methods and attributes are provided under the .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst DataFrame.sparse.from_spmatrix DataFrame.sparse.to_coo diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 3b595ba5ab206..ae3e121ca8212 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -522,6 +522,7 @@ Sparse-dtype specific methods and attributes are provided under the .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst Series.sparse.from_coo Series.sparse.to_coo diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index ca8e9a2f313f6..35e0e0fb86472 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -87,14 +87,15 @@ The :attr:`SparseArray.dtype` property stores two pieces of information sparr.dtype -A :class:`SparseDtype` may be constructed by passing each of these +A :class:`SparseDtype` may be constructed by passing only a dtype .. ipython:: python pd.SparseDtype(np.dtype('datetime64[ns]')) -The default fill value for a given NumPy dtype is the "missing" value for that dtype, -though it may be overridden. +in which case a default fill value will be used (for NumPy dtypes this is often the +"missing" value for that dtype). To override this default an explicit fill value may be +passed instead .. ipython:: python diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index a03ba6c775e68..0bfe9d9b68cdb 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2319,13 +2319,18 @@ you can use the ``tz_convert`` method. Instead, the datetime needs to be localized using the ``localize`` method on the ``pytz`` time zone object. +.. warning:: + + Be aware that for times in the future, correct conversion between time zones + (and UTC) cannot be guaranteed by any time zone library because a timezone's + offset from UTC may be changed by the respective government. + .. warning:: If you are using dates beyond 2038-01-18, due to current deficiencies in the underlying libraries caused by the year 2038 problem, daylight saving time (DST) adjustments to timezone aware dates will not be applied. If and when the underlying libraries are fixed, - the DST transitions will be applied. It should be noted though, that time zone data for far future time zones - are likely to be inaccurate, as they are simple extrapolations of the current set of (regularly revised) rules. + the DST transitions will be applied. 
For example, for two dates that are in British Summer Time (and so would normally be GMT+1), both the following asserts evaluate as true: diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 5bc87bca87211..8ce4b30c717a4 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -668,6 +668,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. plt.figure() .. ipython:: python + :okwarning: series = pd.Series(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], name='series') @@ -742,6 +743,7 @@ If you pass values whose sum total is less than 1.0, matplotlib draws a semicirc plt.figure() .. ipython:: python + :okwarning: series = pd.Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index a280a981c789b..1827d151579a1 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.2 v1.1.1 v1.1.0 diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3cd920158f774..0f0f009307c75 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -540,7 +540,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 .. ipython:: python - df.describe() + df.describe() ``__str__`` methods now call ``__repr__`` rather than vice versa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index ff5bbccf63ffe..77ea67f76f655 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_111: -What's new in 1.1.1 (?) ------------------------ +What's new in 1.1.1 (August 20, 2020) +------------------------------------- These are the changes in pandas 1.1.1. See :ref:`release` for a full changelog including other versions of pandas. @@ -15,20 +15,23 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) +- Fixed regression in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) - Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) - Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) -- Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) -- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`). 
+- Fixed regression in ``.groupby(..).rolling(..)`` where column selection was ignored (:issue:`35486`) +- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) -- Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) +- Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) - Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) -- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. (:issue:`35490`) +- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`) +- Fixed memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) .. --------------------------------------------------------------------------- @@ -37,50 +40,11 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`). -- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`). 
- -Categorical -^^^^^^^^^^^ - -- Bug in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) - - -**Datetimelike** - -- -- - -**Timedelta** - -- Bug in :meth:`to_timedelta` fails when arg is a :class:`Series` with `Int64` dtype containing null values (:issue:`35574`) - - -**Numeric** - -- -- - -**Groupby/resample/rolling** - -- Bug in :class:`pandas.core.groupby.RollingGroupby` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) - -**Plotting** - -- - -**Indexing** - -- Bug in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) - -**DataFrame** -- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) -- - -**Strings** - -- fix memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) - +- Bug in :class:`~pandas.io.formats.style.Styler` whereby ``cell_ids`` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`) +- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`) +- Bug in :meth:`to_timedelta` fails when ``arg`` is a :class:`Series` with ``Int64`` dtype containing null values (:issue:`35574`) +- Bug in ``.groupby(..).rolling(..)`` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) +- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when ``data`` and ``index`` have mismatched lengths (:issue:`33437`) .. --------------------------------------------------------------------------- @@ -89,4 +53,4 @@ Categorical Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.0..v1.1.1|HEAD +.. contributors:: v1.1.0..v1.1.1 diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst new file mode 100644 index 0000000000000..9b1ad658d4666 --- /dev/null +++ b/doc/source/whatsnew/v1.1.2.rst @@ -0,0 +1,43 @@ +.. _whatsnew_112: + +What's new in 1.1.2 (??) +------------------------ + +These are the changes in pandas 1.1.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_112.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) +- Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) +- Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) +- Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) +- + + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_112.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) +- Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) +- Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) +- Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) +- Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_112.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.1.1..v1.1.2|HEAD diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 42f95d88d74ac..1617bf66c4f04 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -24,6 +24,9 @@ of the individual storage backends (detailed from the fsspec docs for `builtin implementations`_ and linked to `external ones`_). See Section :ref:`io.remote`. +:issue:`35655` added fsspec support (including ``storage_options``) +for reading Excel files. + .. _builtin implementations: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations .. _external ones: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations @@ -140,7 +143,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) -- +- Deprecated parameter ``dtype`` in :meth:`Index.copy` on all index classes. Use the :meth:`Index.astype` method instead for changing dtype (:issue:`35853`) - .. --------------------------------------------------------------------------- @@ -151,7 +154,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - .. --------------------------------------------------------------------------- @@ -249,15 +252,18 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) -- -- +- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`) - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) +- Bug in :meth:`DataFrameGroupBy.apply` dropping a :class:`CategoricalIndex` when grouped on (:issue:`35792`) +- Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']]``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values.
(:issue:`9959`) +- Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) - Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) +- Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) - Sparse diff --git a/environment.yml b/environment.yml index 1e51470d43d36..96f2c8d2086c7 100644 --- a/environment.yml +++ b/environment.yml @@ -3,8 +3,7 @@ channels: - conda-forge dependencies: # required - # Pin numpy<1.19 until MPL 3.3.0 is released. - - numpy>=1.16.5,<1.19.0 + - numpy>=1.16.5 - python=3 - python-dateutil>=2.7.3 - pytz @@ -27,8 +26,8 @@ dependencies: # documentation - gitpython # obtain contributors from git for whatsnew - - gitdb2=2.0.6 # GH-32060 - - sphinx<=3.1.1 + - gitdb + - sphinx # documentation (jupyter notebooks) - nbconvert>=5.4.1 @@ -52,6 +51,7 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 + - flask - pytest>=5.0.1 - pytest-cov - pytest-xdist>=1.21 @@ -73,7 +73,7 @@ dependencies: - ipykernel - ipython>=7.11.1 - jinja2 # pandas.Styler - - matplotlib>=2.2.2,<3.3.0 # pandas.plotting, Series.plot, DataFrame.plot + - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 - scipy>=1.2 - numba>=0.46.0 diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index d6659cc1895b1..569562f5b5037 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -80,7 +80,11 @@ cdef class IndexEngine: values = self._get_index_values() self._check_type(val) - loc = _bin_search(values, val) # .searchsorted(val, side='left') + try: + loc = _bin_search(values, val) # .searchsorted(val, side='left') + except TypeError: + # GH#35788 e.g. 
val=None with float64 values + raise KeyError(val) if loc >= len(values): raise KeyError(val) if values[loc] != val: diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 6867e8aba7411..40bd5ad8f5a1f 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -358,6 +358,11 @@ cdef class Interval(IntervalMixin): self_tuple = (self.left, self.right, self.closed) other_tuple = (other.left, other.right, other.closed) return PyObject_RichCompare(self_tuple, other_tuple, op) + elif util.is_array(other): + return np.array( + [PyObject_RichCompare(self, x, op) for x in other], + dtype=bool, + ) return NotImplemented diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 7f0314d737619..161e5f4e54f51 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3705,7 +3705,7 @@ cdef inline void _shift_months(const int64_t[:] dtindex, """See shift_months.__doc__""" cdef: Py_ssize_t i - int months_to_roll, compare_day + int months_to_roll npy_datetimestruct dts for i in range(count): @@ -3715,10 +3715,8 @@ cdef inline void _shift_months(const int64_t[:] dtindex, dt64_to_dtstruct(dtindex[i], &dts) months_to_roll = months - compare_day = get_day_of_month(&dts, day_opt) - months_to_roll = roll_convention(dts.day, months_to_roll, - compare_day) + months_to_roll = _roll_qtrday(&dts, months_to_roll, 0, day_opt) dts.year = year_add_months(dts, months_to_roll) dts.month = month_add_months(dts, months_to_roll) diff --git a/pandas/_testing.py b/pandas/_testing.py index ef6232fa6d575..b402b040d9268 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -939,7 +939,7 @@ def assert_categorical_equal( if check_category_order: assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories") assert_numpy_array_equal( - left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", + left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes" ) else: try: @@ -948,9 +948,7 @@ def assert_categorical_equal( except TypeError: # e.g. '<' not supported between instances of 'int' and 'str' lc, rc = left.categories, right.categories - assert_index_equal( - lc, rc, obj=f"{obj}.categories", - ) + assert_index_equal(lc, rc, obj=f"{obj}.categories") assert_index_equal( left.categories.take(left.codes), right.categories.take(right.codes), @@ -1092,7 +1090,7 @@ def _raise(left, right, err_msg): if err_msg is None: if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shapes are different", left.shape, right.shape, + obj, f"{obj} shapes are different", left.shape, right.shape ) diff = 0 @@ -1559,7 +1557,7 @@ def assert_frame_equal( # shape comparison if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", + obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}" ) if check_like: @@ -2884,7 +2882,7 @@ def convert_rows_list_to_csv_str(rows_list: List[str]): return expected -def external_error_raised(expected_exception: Type[Exception],) -> ContextManager: +def external_error_raised(expected_exception: Type[Exception]) -> ContextManager: """ Helper function to mark pytest.raises that have an external error message. 
diff --git a/pandas/conftest.py b/pandas/conftest.py index 97cc514e31bb3..0878380d00837 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1181,7 +1181,13 @@ def ip(): pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.interactiveshell import InteractiveShell - return InteractiveShell() + # GH#35711 make sure sqlite history file handle is not leaked + from traitlets.config import Config # noqa: F401 isort:skip + + c = Config() + c.HistoryManager.hist_file = ":memory:" + + return InteractiveShell(config=c) @pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 891048ae82dfd..7ca68d8289bd5 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -10,6 +10,7 @@ Callable, DefaultDict, Dict, + Iterable, List, Optional, Sequence, @@ -17,21 +18,19 @@ Union, ) -from pandas._typing import AggFuncType, Label +from pandas._typing import AggFuncType, FrameOrSeries, Label from pandas.core.dtypes.common import is_dict_like, is_list_like from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index -from pandas.core.series import FrameOrSeriesUnion, Series +from pandas.core.series import Series def reconstruct_func( - func: Optional[AggFuncType], **kwargs, -) -> Tuple[ - bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]], -]: + func: Optional[AggFuncType], **kwargs +) -> Tuple[bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. @@ -278,12 +277,13 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: def relabel_result( - result: FrameOrSeriesUnion, + result: FrameOrSeries, func: Dict[str, List[Union[Callable, str]]], - columns: Tuple, - order: List[int], + columns: Iterable[Label], + order: Iterable[int], ) -> Dict[Label, Series]: - """Internal function to reorder result if relabelling is True for + """ + Internal function to reorder result if relabelling is True for dataframe.agg, and return the reordered result in dict. Parameters: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index befde7c355818..6d6bb21165814 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -10,7 +10,7 @@ import numpy as np from pandas._libs import Timestamp, algos, hashtable as htable, iNaT, lib -from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj +from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj, FrameOrSeriesUnion from pandas.util._decorators import doc from pandas.core.dtypes.cast import ( @@ -58,7 +58,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import Series + from pandas import DataFrame, Series _shared_docs: Dict[str, str] = {} @@ -462,7 +462,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: def _factorize_array( - values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None, + values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None ) -> Tuple[np.ndarray, np.ndarray]: """ Factorize an array-like to codes and uniques. 
@@ -1101,6 +1101,9 @@ def __init__(self, obj, n: int, keep: str): if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') + def compute(self, method: str) -> FrameOrSeriesUnion: + raise NotImplementedError + def nlargest(self): return self.compute("nlargest") @@ -1133,7 +1136,7 @@ class SelectNSeries(SelectN): nordered : Series """ - def compute(self, method): + def compute(self, method: str) -> "Series": n = self.n dtype = self.obj.dtype @@ -1207,7 +1210,7 @@ def __init__(self, obj, n: int, keep: str, columns): columns = list(columns) self.columns = columns - def compute(self, method): + def compute(self, method: str) -> "DataFrame": from pandas import Int64Index diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6d44cf917a07a..99a9e1377563c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -340,7 +340,10 @@ def wrap_results_for_axis( if self.result_type == "reduce": # e.g. test_apply_dict GH#8735 - return self.obj._constructor_sliced(results) + res = self.obj._constructor_sliced(results) + res.index = res_index + return res + elif self.result_type is None and all( isinstance(x, dict) for x in results.values() ): diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 832d09b062265..2976747d66dfa 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -40,7 +40,7 @@ def take( fill_value = self._validate_fill_value(fill_value) new_data = take( - self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value, + self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value ) return self._from_backing_data(new_data) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d85647edc3b81..8193d65b3b30c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1167,6 +1167,10 @@ class ExtensionOpsMixin: with NumPy arrays. 
""" + @classmethod + def _create_arithmetic_method(cls, op): + raise AbstractMethodError(cls) + @classmethod def _add_arithmetic_ops(cls): cls.__add__ = cls._create_arithmetic_method(operator.add) @@ -1186,6 +1190,10 @@ def _add_arithmetic_ops(cls): cls.__divmod__ = cls._create_arithmetic_method(divmod) cls.__rdivmod__ = cls._create_arithmetic_method(ops.rdivmod) + @classmethod + def _create_comparison_method(cls, op): + raise AbstractMethodError(cls) + @classmethod def _add_comparison_ops(cls): cls.__eq__ = cls._create_comparison_method(operator.eq) @@ -1195,6 +1203,10 @@ def _add_comparison_ops(cls): cls.__le__ = cls._create_comparison_method(operator.le) cls.__ge__ = cls._create_comparison_method(operator.ge) + @classmethod + def _create_logical_method(cls, op): + raise AbstractMethodError(cls) + @classmethod def _add_logical_ops(cls): cls.__and__ = cls._create_logical_method(operator.and_) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a28b341669918..27b1afdb438cb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1505,7 +1505,7 @@ def argsort(self, ascending=True, kind="quicksort", **kwargs): return super().argsort(ascending=ascending, kind=kind, **kwargs) def sort_values( - self, inplace: bool = False, ascending: bool = True, na_position: str = "last", + self, inplace: bool = False, ascending: bool = True, na_position: str = "last" ): """ Sort the Categorical by category value returning a new diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 57df067c7b16e..d83ff91a1315f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -138,7 +138,7 @@ def __from_arrow__( return IntegerArray._concat_same_type(results) -def integer_array(values, dtype=None, copy: bool = False,) -> "IntegerArray": +def integer_array(values, dtype=None, copy: bool = False) -> "IntegerArray": """ Infer and return an integer array of the values. @@ -182,7 +182,7 @@ def safe_cast(values, dtype, copy: bool): def coerce_to_array( - values, dtype, mask=None, copy: bool = False, + values, dtype, mask=None, copy: bool = False ) -> Tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 235840d6d201e..1237dea5c1a64 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -126,7 +126,7 @@ def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT: return type(self)(~self._data, self._mask) def to_numpy( - self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default, + self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default ) -> np.ndarray: """ Convert to a NumPy Array. 
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 05f901518d82f..23a4a70734c81 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -280,7 +280,7 @@ def isna(self) -> np.ndarray: return isna(self._ndarray) def fillna( - self, value=None, method: Optional[str] = None, limit: Optional[int] = None, + self, value=None, method: Optional[str] = None, limit: Optional[int] = None ) -> "PandasArray": # TODO(_values_for_fillna): remove this value, method = validate_fillna_kwargs(value, method) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index ddaf6d39f1837..cc39ffb5d1203 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -634,7 +634,7 @@ def _sub_period_array(self, other): return new_values def _addsub_int_array( - self, other: np.ndarray, op: Callable[[Any, Any], Any], + self, other: np.ndarray, op: Callable[[Any, Any], Any] ) -> "PeriodArray": """ Add or subtract array of integers; equivalent to applying diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d8db196e4b92f..1531f7b292365 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -234,7 +234,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` is not a ``SparseDtype`` and `data` is a ``SparseArray``. - kind : {'int', 'block'}, default 'int' + kind : {'integer', 'block'}, default 'integer' The type of storage for sparse locations. * 'block': Stores a `block` and `block_length` for each diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 05a5538a88772..a9c0cb0571446 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -6,6 +6,7 @@ """ import operator +from typing import List, Set import warnings import numpy as np @@ -21,7 +22,7 @@ import numexpr as ne _TEST_MODE = None -_TEST_RESULT = None +_TEST_RESULT: List[bool] = list() _USE_NUMEXPR = _NUMEXPR_INSTALLED _evaluate = None _where = None @@ -75,7 +76,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # required min elements (otherwise we are adding overhead) if np.prod(a.shape) > _MIN_ELEMENTS: # check for dtype compatibility - dtypes = set() + dtypes: Set[str] = set() for o in [a, b]: # Series implements dtypes, check for dimension count as well if hasattr(o, "dtypes") and o.ndim > 1: @@ -247,25 +248,28 @@ def where(cond, a, b, use_numexpr=True): return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) -def set_test_mode(v=True): +def set_test_mode(v: bool = True) -> None: """ - Keeps track of whether numexpr was used. Stores an additional ``True`` - for every successful use of evaluate with numexpr since the last - ``get_test_result`` + Keeps track of whether numexpr was used. + + Stores an additional ``True`` for every successful use of evaluate with + numexpr since the last ``get_test_result``. """ global _TEST_MODE, _TEST_RESULT _TEST_MODE = v _TEST_RESULT = [] -def _store_test_result(used_numexpr): +def _store_test_result(used_numexpr: bool) -> None: global _TEST_RESULT if used_numexpr: _TEST_RESULT.append(used_numexpr) -def get_test_result(): - """get test result and reset test_results""" +def get_test_result() -> List[bool]: + """ + Get test result and reset test_results. 
+ """ global _TEST_RESULT res = _TEST_RESULT _TEST_RESULT = [] diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index bc9ff7c44b689..e55df1e1d8155 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -481,13 +481,21 @@ def stringify(value): self.lhs.update(v) def _disallow_scalar_only_bool_ops(self): + rhs = self.rhs + lhs = self.lhs + + # GH#24883 unwrap dtype if necessary to ensure we have a type object + rhs_rt = rhs.return_type + rhs_rt = getattr(rhs_rt, "type", rhs_rt) + lhs_rt = lhs.return_type + lhs_rt = getattr(lhs_rt, "type", lhs_rt) if ( - (self.lhs.is_scalar or self.rhs.is_scalar) + (lhs.is_scalar or rhs.is_scalar) and self.op in _bool_ops_dict and ( not ( - issubclass(self.rhs.return_type, (bool, np.bool_)) - and issubclass(self.lhs.return_type, (bool, np.bool_)) + issubclass(rhs_rt, (bool, np.bool_)) + and issubclass(lhs_rt, (bool, np.bool_)) ) ) ): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 47f10f1f65f4a..f145e76046bee 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -35,6 +35,7 @@ is_iterator, is_list_like, is_object_dtype, + is_sparse, is_timedelta64_ns_dtype, ) from pandas.core.dtypes.generic import ( @@ -513,9 +514,7 @@ def sanitize_array( return subarr -def _try_cast( - arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool, -): +def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool): """ Convert input to numpy ndarray and optionally cast to a given dtype. @@ -535,9 +534,10 @@ def _try_cast( if maybe_castable(arr) and not copy and dtype is None: return arr - if isinstance(dtype, ExtensionDtype) and dtype.kind != "M": + if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)): # create an extension array from its dtype - # DatetimeTZ case needs to go through maybe_cast_to_datetime + # DatetimeTZ case needs to go through maybe_cast_to_datetime but + # SparseDtype does not array_type = dtype.construct_array_type()._from_sequence subarr = array_type(arr, dtype=dtype, copy=copy) return subarr diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2697f42eb05a4..e6b4cb598989b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -50,6 +50,7 @@ is_numeric_dtype, is_object_dtype, is_scalar, + is_sparse, is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, @@ -1323,7 +1324,9 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): f"Please pass in '{dtype.name}[ns]' instead." ) - if is_datetime64 and not is_dtype_equal(dtype, DT64NS_DTYPE): + if is_datetime64 and not is_dtype_equal( + getattr(dtype, "subtype", dtype), DT64NS_DTYPE + ): # pandas supports dtype whose granularity is less than [ns] # e.g., [ps], [fs], [as] @@ -1355,7 +1358,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): if is_scalar(value): if value == iNaT or isna(value): value = iNaT - else: + elif not is_sparse(value): value = np.array(value, copy=False) # have a scalar array-like (e.g. NaT) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1587dd8798ec3..312d449e36022 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1014,7 +1014,7 @@ def iterrows(self) -> Iterable[Tuple[Label, Series]]: s = klass(v, index=columns, name=k) yield k, s - def itertuples(self, index=True, name="Pandas"): + def itertuples(self, index: bool = True, name: Optional[str] = "Pandas"): """ Iterate over DataFrame rows as namedtuples. 
@@ -1088,7 +1088,11 @@ def itertuples(self, index=True, name="Pandas"): arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) if name is not None: - itertuple = collections.namedtuple(name, fields, rename=True) + # https://github.com/python/mypy/issues/9046 + # error: namedtuple() expects a string literal as the first argument + itertuple = collections.namedtuple( # type: ignore[misc] + name, fields, rename=True + ) return map(itertuple._make, zip(*arrays)) # fallback to regular tuples @@ -2600,7 +2604,7 @@ def to_html( 1 column_2 1000000 non-null object 2 column_3 1000000 non-null object dtypes: object(3) - memory usage: 188.8 MB""" + memory usage: 165.9 MB""" ), see_also_sub=( """ @@ -4591,7 +4595,7 @@ def set_index( frame = self.copy() arrays = [] - names = [] + names: List[Label] = [] if append: names = list(self.index.names) if isinstance(self.index, MultiIndex): @@ -6609,6 +6613,8 @@ def groupby( duplicate values for one index/column pair. DataFrame.unstack : Pivot based on the index values instead of a column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. Notes ----- @@ -6763,6 +6769,10 @@ def pivot(self, index=None, columns=None, values=None) -> "DataFrame": -------- DataFrame.pivot : Pivot without aggregation that can handle non-numeric data. + DataFrame.melt: Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. Examples -------- @@ -7370,6 +7380,15 @@ def _gotitem( min 1.0 2.0 sum 12.0 NaN + Aggregate different functions over the columns and rename the index of the resulting + DataFrame. + + >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) + A B C + x 7.0 NaN NaN + y NaN 2.0 NaN + z NaN NaN 6.0 + Aggregate over the columns. >>> df.agg("mean", axis="columns") @@ -7409,6 +7428,12 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if relabeling: # This is to keep the order to columns occurrence unchanged, and also # keep the order of new columns occurrence unchanged + + # For the return values of reconstruct_func, if relabeling is + # False, columns and order will be None. 
+ assert columns is not None + assert order is not None + result_in_dict = relabel_result(result, func, columns, order) result = DataFrame(result_in_dict, index=columns) @@ -8641,13 +8666,11 @@ def blk_func(values): return op(values, axis=1, skipna=skipna, **kwds) # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager._reduce + # simple case where we can use BlockManager.reduce res = df._mgr.reduce(blk_func) - assert isinstance(res, dict) - if len(res): - assert len(res) == max(list(res.keys())) + 1, res.keys() - out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) - out.index = df.columns + out = df._constructor(res,).iloc[0].rename(None) + if out_dtype is not None: + out = out.astype(out_dtype) if axis == 0 and is_object_dtype(out.dtype): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out @@ -9302,7 +9325,6 @@ def _AXIS_NAMES(self) -> Dict[int, str]: DataFrame._add_numeric_operations() -DataFrame._add_series_or_dataframe_operations() ops.add_flex_arithmetic_methods(DataFrame) ops.add_special_arithmetic_methods(DataFrame) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9cbe2f714fd57..3bad2d6dd18b9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,6 @@ import operator import pickle import re -from textwrap import dedent from typing import ( TYPE_CHECKING, Any, @@ -101,17 +100,22 @@ from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME from pandas.core.shared_docs import _shared_docs +from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt from pandas.io.formats.format import DataFrameFormatter, format_percentiles from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: + from pandas._libs.tslibs import BaseOffset + from pandas.core.resample import Resampler from pandas.core.series import Series # noqa: F401 + from pandas.core.window.indexers import BaseIndexer # goal is to be able to define the docs close to function, while still being # able to share +_shared_docs = {**_shared_docs} _shared_doc_kwargs = dict( axes="keywords for axes", klass="Series/DataFrame", @@ -315,17 +319,13 @@ def _data(self): @property def _AXIS_NUMBERS(self) -> Dict[str, int]: """.. deprecated:: 1.1.0""" - warnings.warn( - "_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=3, - ) + warnings.warn("_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=3) return {"index": 0} @property def _AXIS_NAMES(self) -> Dict[int, str]: """.. 
deprecated:: 1.1.0""" - warnings.warn( - "_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=3, - ) + warnings.warn("_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=3) return {0: "index"} def _construct_axes_dict(self, axes=None, **kwargs): @@ -391,7 +391,7 @@ def _get_block_manager_axis(cls, axis: Axis) -> int: return m - axis return axis - def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: + def _get_axis_resolvers(self, axis: str) -> Dict[str, Union["Series", MultiIndex]]: # index or columns axis_index = getattr(self, axis) d = dict() @@ -421,10 +421,10 @@ def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: d[axis] = dindex return d - def _get_index_resolvers(self) -> Dict[str, ABCSeries]: + def _get_index_resolvers(self) -> Dict[str, Union["Series", MultiIndex]]: from pandas.core.computation.parsing import clean_column_name - d: Dict[str, ABCSeries] = {} + d: Dict[str, Union["Series", MultiIndex]] = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) @@ -4707,14 +4707,15 @@ def filter( return self.reindex(**{name: [r for r in items if r in labels]}) elif like: - def f(x): + def f(x) -> bool: + assert like is not None # needed for mypy return like in ensure_str(x) values = labels.map(f) return self.loc(axis=axis)[values] elif regex: - def f(x): + def f(x) -> bool: return matcher.search(ensure_str(x)) is not None matcher = re.compile(regex) @@ -5128,50 +5129,8 @@ def pipe(self, func, *args, **kwargs): ... .pipe(g, arg1=a) ... .pipe((func, 'arg2'), arg1=a, arg3=c) ... ) # doctest: +SKIP - """ - return com.pipe(self, func, *args, **kwargs) - - _shared_docs["aggregate"] = dedent( """ - Aggregate using one or more operations over the specified axis. - {versionadded} - Parameters - ---------- - func : function, str, list or dict - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of axis labels -> functions, function names or list of such. - {axis} - *args - Positional arguments to pass to `func`. - **kwargs - Keyword arguments to pass to `func`. - - Returns - ------- - scalar, Series or DataFrame - - The return can be: - - * scalar : when Series.agg is called with single function - * Series : when DataFrame.agg is called with a single function - * DataFrame : when DataFrame.agg is called with several functions - - Return scalar, Series or DataFrame. - {see_also} - Notes - ----- - `agg` is an alias for `aggregate`. Use the alias. - - A passed user-defined-function will be passed a Series for evaluation. 
- {examples}""" - ) + return com.pipe(self, func, *args, **kwargs) # ---------------------------------------------------------------------- # Attribute access @@ -5627,7 +5586,7 @@ def astype( else: # else, only a single dtype is given - new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) + new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) return self._constructor(new_data).__finalize__(self, method="astype") # GH 33113: handle empty frame or series @@ -6517,7 +6476,7 @@ def replace( 3 b 4 b dtype: object - """ + """ if not ( is_scalar(to_replace) or is_re_compilable(to_replace) @@ -6557,7 +6516,10 @@ def replace( regex = True items = list(to_replace.items()) - keys, values = zip(*items) if items else ([], []) + if items: + keys, values = zip(*items) + else: + keys, values = ([], []) are_mappings = [is_dict_like(v) for v in values] @@ -6893,7 +6855,7 @@ def interpolate( obj = self.T if should_transpose else self if obj.empty: - return self + return self.copy() if method not in fillna_methods: axis = self._info_axis_number @@ -7449,77 +7411,6 @@ def clip( return result - _shared_docs[ - "groupby" - ] = """ - Group %(klass)s using a mapper or by a Series of columns. - - A groupby operation involves some combination of splitting the - object, applying a function, and combining the results. This can be - used to group large amounts of data and compute operations on these - groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. - If ``by`` is a function, it's called on each value of the object's - index. If a dict or Series is passed, the Series or dict VALUES - will be used to determine the groups (the Series' values are first - aligned; see ``.align()`` method). If an ndarray is passed, the - values are used as-is determine the groups. A label or list of - labels may be passed to group by the columns in ``self``. Notice - that a tuple is interpreted as a (single) key. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Split along rows (0) or columns (1). - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as the - index. Only relevant for DataFrame input. as_index=False is - effectively "SQL-style" grouped output. - sort : bool, default True - Sort group keys. Get better performance by turning this off. - Note this does not influence the order of observations within each - group. Groupby preserves the order of rows within each group. - group_keys : bool, default True - When calling apply, add group keys to index to identify pieces. - squeeze : bool, default False - Reduce the dimensionality of the return type if possible, - otherwise return a consistent type. - - .. deprecated:: 1.1.0 - - observed : bool, default False - This only applies if any of the groupers are Categoricals. - If True: only show observed values for categorical groupers. - If False: show all values for categorical groupers. - - .. versionadded:: 0.23.0 - dropna : bool, default True - If True, and if group keys contain NA values, NA values together - with row/column will be dropped. - If False, NA values will also be treated as the key in groups - - .. versionadded:: 1.1.0 - - Returns - ------- - %(klass)sGroupBy - Returns a groupby object that contains information about the groups. 
- - See Also - -------- - resample : Convenience method for frequency conversion and resampling - of time series. - - Notes - ----- - See the `user guide - `_ for more. - """ - def asfreq( self: FrameOrSeries, freq, @@ -7769,7 +7660,7 @@ def between_time( raise TypeError("Index must be DatetimeIndex") indexer = index.indexer_between_time( - start_time, end_time, include_start=include_start, include_end=include_end, + start_time, end_time, include_start=include_start, include_end=include_end ) return self._take_with_is_copy(indexer, axis=axis) @@ -8428,35 +8319,6 @@ def ranker(data): return ranker(data) - _shared_docs[ - "compare" - ] = """ - Compare to another %(klass)s and show the differences. - - .. versionadded:: 1.1.0 - - Parameters - ---------- - other : %(klass)s - Object to compare with. - - align_axis : {0 or 'index', 1 or 'columns'}, default 1 - Determine which axis to align the comparison on. - - * 0, or 'index' : Resulting differences are stacked vertically - with rows drawn alternately from self and other. - * 1, or 'columns' : Resulting differences are aligned horizontally - with columns drawn alternately from self and other. - - keep_shape : bool, default False - If true, all rows and columns are kept. - Otherwise, only the ones with different values are kept. - - keep_equal : bool, default False - If true, the result keeps values that are equal. - Otherwise, equal values are shown as NaNs. - """ - @Appender(_shared_docs["compare"] % _shared_doc_kwargs) def compare( self, @@ -8936,7 +8798,7 @@ def _where( self._check_inplace_setting(other) new_data = self._mgr.putmask( - mask=cond, new=other, align=align, axis=block_axis, + mask=cond, new=other, align=align, axis=block_axis ) result = self._constructor(new_data) return self._update_inplace(result) @@ -10586,45 +10448,21 @@ def mad(self, axis=None, skipna=None, level=None): examples=_min_examples, ) - @classmethod - def _add_series_or_dataframe_operations(cls): - """ - Add the series or dataframe only operations to the cls; evaluate - the doc strings again. 
- """ - from pandas.core.window import ( - Expanding, - ExponentialMovingWindow, - Rolling, - Window, - ) - - @doc(Rolling) - def rolling( - self, - window, - min_periods=None, - center=False, - win_type=None, - on=None, - axis=0, - closed=None, - ): - axis = self._get_axis_number(axis) - - if win_type is not None: - return Window( - self, - window=window, - min_periods=min_periods, - center=center, - win_type=win_type, - on=on, - axis=axis, - closed=closed, - ) + @doc(Rolling) + def rolling( + self, + window: "Union[int, timedelta, BaseOffset, BaseIndexer]", + min_periods: Optional[int] = None, + center: bool_t = False, + win_type: Optional[str] = None, + on: Optional[str] = None, + axis: Axis = 0, + closed: Optional[str] = None, + ): + axis = self._get_axis_number(axis) - return Rolling( + if win_type is not None: + return Window( self, window=window, min_periods=min_periods, @@ -10635,53 +10473,59 @@ def rolling( closed=closed, ) - cls.rolling = rolling - - @doc(Expanding) - def expanding(self, min_periods=1, center=None, axis=0): - axis = self._get_axis_number(axis) - if center is not None: - warnings.warn( - "The `center` argument on `expanding` " - "will be removed in the future", - FutureWarning, - stacklevel=2, - ) - else: - center = False + return Rolling( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) - return Expanding(self, min_periods=min_periods, center=center, axis=axis) + @doc(Expanding) + def expanding( + self, min_periods: int = 1, center: Optional[bool_t] = None, axis: Axis = 0 + ) -> Expanding: + axis = self._get_axis_number(axis) + if center is not None: + warnings.warn( + "The `center` argument on `expanding` will be removed in the future", + FutureWarning, + stacklevel=2, + ) + else: + center = False - cls.expanding = expanding + return Expanding(self, min_periods=min_periods, center=center, axis=axis) - @doc(ExponentialMovingWindow) - def ewm( + @doc(ExponentialMovingWindow) + def ewm( + self, + com: Optional[float] = None, + span: Optional[float] = None, + halflife: Optional[Union[float, TimedeltaConvertibleTypes]] = None, + alpha: Optional[float] = None, + min_periods: int = 0, + adjust: bool_t = True, + ignore_na: bool_t = False, + axis: Axis = 0, + times: Optional[Union[str, np.ndarray, FrameOrSeries]] = None, + ) -> ExponentialMovingWindow: + axis = self._get_axis_number(axis) + return ExponentialMovingWindow( self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, - times=None, - ): - axis = self._get_axis_number(axis) - return ExponentialMovingWindow( - self, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na, - axis=axis, - times=times, - ) - - cls.ewm = ewm + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + times=times, + ) @doc(klass=_shared_doc_kwargs["klass"], axis="") def transform(self, func, *args, **kwargs): @@ -10700,7 +10544,7 @@ def transform(self, func, *args, **kwargs): - function - string function name - - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` + - list of functions and/or function names, e.g. ``[np.exp, 'sqrt']`` - dict of axis labels -> functions, function names or list of such. 
{axis} *args diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index e71b2f94c8014..999873e7b81e4 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -4,17 +4,22 @@ SeriesGroupBy and the DataFrameGroupBy objects. """ import collections +from typing import List from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.core.base import PandasObject + OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) -class GroupByMixin: +class GroupByMixin(PandasObject): """ Provide the groupby facilities to the mixed object. """ + _attributes: List[str] + def _gotitem(self, key, ndim, subset=None): """ Sub-classes to define. Return a sliced object. @@ -22,7 +27,7 @@ def _gotitem(self, key, ndim, subset=None): Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index db734bb2f0c07..4d5acf527a867 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,3 +1,5 @@ +from typing import Optional, Tuple + import numpy as np from pandas.core.algorithms import unique1d @@ -6,9 +8,12 @@ CategoricalDtype, recode_for_categories, ) +from pandas.core.indexes.api import CategoricalIndex -def recode_for_groupby(c: Categorical, sort: bool, observed: bool): +def recode_for_groupby( + c: Categorical, sort: bool, observed: bool +) -> Tuple[Categorical, Optional[Categorical]]: """ Code the categories to ensure we can groupby for categoricals. @@ -73,7 +78,9 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool): return c.reorder_categories(cat.categories), None -def recode_from_groupby(c: Categorical, sort: bool, ci): +def recode_from_groupby( + c: Categorical, sort: bool, ci: CategoricalIndex +) -> CategoricalIndex: """ Reverse the codes_to_groupby to account for sort / observed. 
@@ -91,7 +98,8 @@ def recode_from_groupby(c: Categorical, sort: bool, ci): """ # we re-order to the original category orderings if sort: - return ci.set_categories(c.categories) + return ci.set_categories(c.categories) # type: ignore [attr-defined] # we are not sorting, so add unobserved to the end - return ci.add_categories(c.categories[~c.categories.isin(ci.categories)]) + new_cats = c.categories[~c.categories.isin(ci.categories)] + return ci.add_categories(new_cats) # type: ignore [attr-defined] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1f0cdbd07560f..7b45a114e548b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -9,7 +9,6 @@ import copy from functools import partial from textwrap import dedent -import typing from typing import ( TYPE_CHECKING, Any, @@ -21,8 +20,8 @@ Mapping, Optional, Sequence, - Tuple, Type, + TypeVar, Union, cast, ) @@ -31,7 +30,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -60,6 +59,7 @@ validate_func_kwargs, ) import pandas.core.algorithms as algorithms +from pandas.core.arrays import ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -70,24 +70,21 @@ GroupBy, _agg_template, _apply_docs, + _group_selection_context, _transform_template, get_groupby, ) +from pandas.core.groupby.numba_ import generate_numba_func, split_for_numba from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series -from pandas.core.util.numba_ import ( - NUMBA_FUNC_CACHE, - generate_numba_func, - maybe_use_numba, - split_for_numba, -) +from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba from pandas.plotting import boxplot_frame_groupby if TYPE_CHECKING: - from pandas.core.internals import Block + from pandas.core.internals import Block # noqa:F401 NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) @@ -96,7 +93,7 @@ # TODO: validate types on ScalarResult and move to _typing # Blocked from using by https://github.com/python/mypy/issues/1484 # See note at _mangle_lambda_list -ScalarResult = typing.TypeVar("ScalarResult") +ScalarResult = TypeVar("ScalarResult") def generate_property(name: str, klass: Type[FrameOrSeries]): @@ -225,11 +222,21 @@ def _selection_name(self): def apply(self, func, *args, **kwargs): return super().apply(func, *args, **kwargs) - @doc( - _agg_template, examples=_agg_examples_doc, klass="Series", - ) + @doc(_agg_template, examples=_agg_examples_doc, klass="Series") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + if maybe_use_numba(engine): + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." 
+ ) + with _group_selection_context(self): + data = self._selected_obj + result, index = self._aggregate_with_numba( + data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + return self.obj._constructor(result.ravel(), index=index, name=data.name) + relabeling = func is None columns = None if relabeling: @@ -252,16 +259,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return getattr(self, cyfunc)() if self.grouper.nkeys > 1: - return self._python_agg_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._python_agg_general(func, *args, **kwargs) try: - return self._python_agg_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._python_agg_general(func, *args, **kwargs) except (ValueError, KeyError): - # Do not catch Numba errors here, we want to raise and not fall back. # TODO: KeyError is raised in _python_agg_general, # see see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) @@ -605,8 +607,8 @@ def filter(self, func, dropna=True, *args, **kwargs): wrapper = lambda x: func(x, *args, **kwargs) # Interpret np.nan as False. - def true_and_notna(x, *args, **kwargs) -> bool: - b = wrapper(x, *args, **kwargs) + def true_and_notna(x) -> bool: + b = wrapper(x) return b and notna(b) try: @@ -932,17 +934,22 @@ class DataFrameGroupBy(GroupBy[DataFrame]): See :ref:`groupby.aggregate.named` for more.""" ) - @doc( - _agg_template, examples=_agg_examples_doc, klass="DataFrame", - ) + @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): - relabeling, func, columns, order = reconstruct_func(func, **kwargs) - if maybe_use_numba(engine): - return self._python_agg_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." 
+ ) + with _group_selection_context(self): + data = self._selected_obj + result, index = self._aggregate_with_numba( + data, func, *args, engine_kwargs=engine_kwargs, **kwargs ) + return self.obj._constructor(result, index=index, columns=data.columns) + + relabeling, func, columns, order = reconstruct_func(func, **kwargs) result, how = self._aggregate(func, *args, **kwargs) if how is None: @@ -1014,16 +1021,14 @@ def _iterate_slices(self) -> Iterable[Series]: def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ) -> DataFrame: - agg_blocks, agg_items = self._cython_agg_blocks( + agg_mgr = self._cython_agg_blocks( how, alt=alt, numeric_only=numeric_only, min_count=min_count ) - return self._wrap_agged_blocks(agg_blocks, items=agg_items) + return self._wrap_agged_blocks(agg_mgr.blocks, items=agg_mgr.items) def _cython_agg_blocks( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 - ) -> "Tuple[List[Block], Index]": - # TODO: the actual managing of mgr_locs is a PITA - # here, it should happen via BlockManager.combine + ) -> BlockManager: data: BlockManager = self._get_data_to_aggregate() @@ -1031,45 +1036,44 @@ def _cython_agg_blocks( data = data.get_numeric_data(copy=False) agg_blocks: List["Block"] = [] - deleted_items: List[np.ndarray] = [] no_result = object() - def cast_result_block(result, block: "Block", how: str) -> "Block": - # see if we can cast the block to the desired dtype + def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: + # see if we can cast the values to the desired dtype # this may not be the original dtype assert not isinstance(result, DataFrame) assert result is not no_result - dtype = maybe_cast_result_dtype(block.dtype, how) + dtype = maybe_cast_result_dtype(values.dtype, how) result = maybe_downcast_numeric(result, dtype) - if block.is_extension and isinstance(result, np.ndarray): - # e.g. block.values was an IntegerArray - # (1, N) case can occur if block.values was Categorical + if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray): + # e.g. 
values was an IntegerArray + # (1, N) case can occur if values was Categorical # and result is ndarray[object] # TODO(EA2D): special casing not needed with 2D EAs assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible - result = type(block.values)._from_sequence( - result.ravel(), dtype=block.values.dtype + result = type(values)._from_sequence( + result.ravel(), dtype=values.dtype ) except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - agg_block: "Block" = block.make_block(result) - return agg_block + elif isinstance(result, np.ndarray) and result.ndim == 1: + # We went through a SeriesGroupByPath and need to reshape + result = result.reshape(1, -1) + + return result - def blk_func(block: "Block") -> List["Block"]: - new_blocks: List["Block"] = [] + def blk_func(bvalues: ArrayLike) -> ArrayLike: - result = no_result - locs = block.mgr_locs.as_array try: result, _ = self.grouper.aggregate( - block.values, how, axis=1, min_count=min_count + bvalues, how, axis=1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False @@ -1082,58 +1086,46 @@ def blk_func(block: "Block") -> List["Block"]: assert how == "ohlc" raise + obj: Union[Series, DataFrame] # call our grouper again with only this block - obj = self.obj[data.items[locs]] - if obj.shape[1] == 1: - # Avoid call to self.values that can occur in DataFrame - # reductions; see GH#28949 - obj = obj.iloc[:, 0] + if isinstance(bvalues, ExtensionArray): + # TODO(EA2D): special case not needed with 2D EAs + obj = Series(bvalues) + else: + obj = DataFrame(bvalues.T) + if obj.shape[1] == 1: + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + obj = obj.iloc[:, 0] # Create SeriesGroupBy with observed=True so that it does # not try to add missing categories if grouping over multiple # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) - try: - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) - except TypeError: - # we may have an exception in trying to aggregate - # continue and exclude the block - raise - else: - result = cast(DataFrame, result) - # unwrap DataFrame to get array - if len(result._mgr.blocks) != 1: - # We've split an object block! Everything we've assumed - # about a single block input returning a single block output - # is a lie. To keep the code-path for the typical non-split case - # clean, we choose to clean up this mess later on. - assert len(locs) == result.shape[1] - for i, loc in enumerate(locs): - agg_block = result.iloc[:, [i]]._mgr.blocks[0] - agg_block.mgr_locs = [loc] - new_blocks.append(agg_block) - else: - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - agg_block = cast_result_block(result, block, how) - new_blocks = [agg_block] - else: - agg_block = cast_result_block(result, block, how) - new_blocks = [agg_block] - return new_blocks + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + + assert isinstance(result, (Series, DataFrame)) # for mypy + # In the case of object dtype block, it may have been split + # in the operation. We un-split here. 
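For illustration, a rough sketch of the dispatch used by the reworked blk_func above when the cython path is unavailable: extension arrays are wrapped in a Series, 2D ndarray values in a DataFrame (squeezed to a Series when they hold a single column). The helper name is hypothetical, not part of pandas:

    import numpy as np
    import pandas as pd

    def wrap_for_python_agg(values):
        # 1D extension arrays (IntegerArray, Categorical, ...) go through Series
        if isinstance(values, pd.api.extensions.ExtensionArray):
            return pd.Series(values)
        # 2D ndarray block values go through DataFrame
        obj = pd.DataFrame(values.T)
        if obj.shape[1] == 1:
            # avoid DataFrame reductions for a single column (see GH#28949)
            obj = obj.iloc[:, 0]
        return obj

    print(type(wrap_for_python_agg(pd.array([1, 2, None], dtype="Int64"))))
    print(type(wrap_for_python_agg(np.arange(6).reshape(2, 3))))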
+ result = result._consolidate() + assert isinstance(result, (Series, DataFrame)) # for mypy + assert len(result._mgr.blocks) == 1 + + # unwrap DataFrame to get array + result = result._mgr.blocks[0].values + + res_values = cast_agg_result(result, bvalues, how) + return res_values - skipped: List[int] = [] for i, block in enumerate(data.blocks): try: - nbs = blk_func(block) + nbs = block.apply(blk_func) except (NotImplementedError, TypeError): # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block # NotImplementedError -> "ohlc" with wrong dtype - skipped.append(i) - deleted_items.append(block.mgr_locs.as_array) + pass else: agg_blocks.extend(nbs) @@ -1142,9 +1134,8 @@ def blk_func(block: "Block") -> List["Block"]: # reset the locs in the blocks to correspond to our # current ordering - agg_items = data.reset_dropped_locs(agg_blocks, skipped) - - return agg_blocks, agg_items + new_mgr = data._combine(agg_blocks) + return new_mgr def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: @@ -1203,57 +1194,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return self.obj._constructor(index=keys) - key_names = self.grouper.names - # GH12824 first_not_none = next(com.not_none(*values), None) if first_not_none is None: - # GH9684. If all values are None, then this will throw an error. - # We'd prefer it return an empty dataframe. + # GH9684 - All values are None, return an empty frame. return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - if len(self.grouper.groupings) > 1: - key_index = self.grouper.result_index - - else: - ping = self.grouper.groupings[0] - if len(keys) == ping.ngroups: - key_index = ping.group_index - key_index.name = key_names[0] - - key_lookup = Index(keys) - indexer = key_lookup.get_indexer(key_index) + key_index = self.grouper.result_index if self.as_index else None - # reorder the values - values = [values[i] for i in indexer] - - # update due to the potential reorder - first_not_none = next(com.not_none(*values), None) - else: - - key_index = Index(keys, name=key_names[0]) - - # don't use the key indexer - if not self.as_index: - key_index = None - - # make Nones an empty object - if first_not_none is None: - return self.obj._constructor() - elif isinstance(first_not_none, NDFrame): + if isinstance(first_not_none, Series): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() - if isinstance(first_not_none, Series): - backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object - ) - else: - backup = first_not_none._constructor(**kwargs) + backup = create_series_with_explicit_dtype( + dtype_if_empty=object, **kwargs + ) values = [x if (x is not None) else backup for x in values] @@ -1262,7 +1221,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = all_indexes_same([x.index for x in values]) + all_indexed_same = all_indexes_same((x.index for x in values)) singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 @@ -1294,7 +1253,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # GH 8467 return 
self._concat_objects(keys, values, not_indexed_same=True) - if self.axis == 0 and isinstance(v, ABCSeries): # GH6124 if the list of Series have a consistent name, # then propagate that name to the result. index = v.index.copy() @@ -1307,34 +1265,27 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(names) == 1: index.name = list(names)[0] - # normally use vstack as its faster than concat - # and if we have mi-columns - if ( - isinstance(v.index, MultiIndex) - or key_index is None - or isinstance(key_index, MultiIndex) - ): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = self.obj._constructor( - stacked_values, index=key_index, columns=index - ) - else: - # GH5788 instead of stacking; concat gets the - # dtypes correct - from pandas.core.reshape.concat import concat - - result = concat( - values, - keys=key_index, - names=key_index.names, - axis=self.axis, - ).unstack() - result.columns = index - elif isinstance(v, ABCSeries): + # Combine values + # vstack+constructor is faster than concat and handles MI-columns stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = v.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] + else: + index = v.index + columns = key_index + stacked_values = stacked_values.T + result = self.obj._constructor( - stacked_values.T, index=v.index, columns=key_index + stacked_values, index=index, columns=columns ) + elif not self.as_index: # We add grouping column below, so create a frame here result = DataFrame( @@ -1641,7 +1592,7 @@ def _gotitem(self, key, ndim: int, subset=None): Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on @@ -1652,22 +1603,37 @@ def _gotitem(self, key, ndim: int, subset=None): return DataFrameGroupBy( subset, self.grouper, - selection=key, + axis=self.axis, + level=self.level, grouper=self.grouper, exclusions=self.exclusions, + selection=key, as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, observed=self.observed, + mutated=self.mutated, + dropna=self.dropna, ) elif ndim == 1: if subset is None: subset = self.obj[key] return SeriesGroupBy( - subset, selection=key, grouper=self.grouper, observed=self.observed + subset, + level=self.level, + grouper=self.grouper, + selection=key, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, + observed=self.observed, + dropna=self.dropna, ) raise AssertionError("invalid ndim for _gotitem") - def _wrap_frame_output(self, result, obj) -> DataFrame: + def _wrap_frame_output(self, result, obj: DataFrame) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: @@ -1684,20 +1650,14 @@ def _get_data_to_aggregate(self) -> BlockManager: else: return obj._mgr - def _insert_inaxis_grouper_inplace(self, result): + def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: # zip in reverse so we can always insert at loc 0 - izip = zip( - *map( - reversed, - ( - self.grouper.names, - self.grouper.get_group_levels(), - [grp.in_axis for grp in self.grouper.groupings], - ), - ) - ) columns = result.columns - for name, lev, in_axis in izip: + for name, lev, in_axis in zip( + reversed(self.grouper.names), + reversed(self.grouper.get_group_levels()), + reversed([grp.in_axis for grp in 
self.grouper.groupings]), + ): # GH #28549 # When using .apply(-), name will be in columns already if in_axis and name not in columns: @@ -1762,7 +1722,7 @@ def _wrap_transformed_output( return result - def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: + def _wrap_agged_blocks(self, blocks: Sequence["Block"], items: Index) -> DataFrame: if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, axes=[items, index]) @@ -1789,7 +1749,7 @@ def _iterate_column_groupbys(self): exclusions=self.exclusions, ) - def _apply_to_column_groupbys(self, func): + def _apply_to_column_groupbys(self, func) -> DataFrame: from pandas.core.reshape.concat import concat return concat( @@ -1798,7 +1758,7 @@ def _apply_to_column_groupbys(self, func): axis=1, ) - def count(self): + def count(self) -> DataFrame: """ Compute count of group, excluding missing values. @@ -1828,7 +1788,7 @@ def count(self): return self._reindex_output(result, fill_value=0) - def nunique(self, dropna: bool = True): + def nunique(self, dropna: bool = True) -> DataFrame: """ Return DataFrame with counts of unique elements in each position. @@ -1894,6 +1854,7 @@ def nunique(self, dropna: bool = True): ], axis=1, ) + results = cast(DataFrame, results) if axis_number == 1: results = results.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0047877ef78ee..651af2d314251 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -34,7 +34,7 @@ class providing the base-class of operations. from pandas._config.config import option_context -from pandas._libs import Timestamp +from pandas._libs import Timestamp, lib import pandas._libs.groupby as libgroupby from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Scalar from pandas.compat.numpy import function as nv @@ -61,11 +61,11 @@ class providing the base-class of operations. import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base, ops +from pandas.core.groupby import base, numba_, ops from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter -from pandas.core.util.numba_ import maybe_use_numba +from pandas.core.util.numba_ import NUMBA_FUNC_CACHE _common_see_also = """ See Also @@ -384,7 +384,8 @@ class providing the base-class of operations. - dict of axis labels -> functions, function names or list of such. Can also accept a Numba JIT function with - ``engine='numba'`` specified. + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. If the ``'numba'`` engine is chosen, the function must be a user defined function with ``values`` and ``index`` as the @@ -458,7 +459,7 @@ def f(self): @contextmanager -def _group_selection_context(groupby): +def _group_selection_context(groupby: "_GroupBy"): """ Set / reset the _group_selection_context. 
""" @@ -488,7 +489,7 @@ def __init__( keys: Optional[_KeysArgType] = None, axis: int = 0, level=None, - grouper: "Optional[ops.BaseGrouper]" = None, + grouper: Optional["ops.BaseGrouper"] = None, exclusions=None, selection=None, as_index: bool = True, @@ -733,7 +734,7 @@ def pipe(self, func, *args, **kwargs): plot = property(GroupByPlot) - def _make_wrapper(self, name): + def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist with _group_selection_context(self): @@ -1053,12 +1054,43 @@ def _cython_agg_general( return self._wrap_aggregated_output(output, index=self.grouper.result_index) - def _python_agg_general( - self, func, *args, engine="cython", engine_kwargs=None, **kwargs - ): + def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): + """ + Perform groupby aggregation routine with the numba engine. + + This routine mimics the data splitting routine of the DataSplitter class + to generate the indices of each group in the sorted data and then passes the + data and indices into a Numba jitted function. + """ + group_keys = self.grouper._get_group_keys() + labels, _, n_groups = self.grouper.group_info + sorted_index = get_group_index_sorter(labels, n_groups) + sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False) + sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() + starts, ends = lib.generate_slices(sorted_labels, n_groups) + cache_key = (func, "groupby_agg") + if cache_key in NUMBA_FUNC_CACHE: + # Return an already compiled version of roll_apply if available + numba_agg_func = NUMBA_FUNC_CACHE[cache_key] + else: + numba_agg_func = numba_.generate_numba_agg_func( + tuple(args), kwargs, func, engine_kwargs + ) + result = numba_agg_func( + sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) + ) + if cache_key not in NUMBA_FUNC_CACHE: + NUMBA_FUNC_CACHE[cache_key] = numba_agg_func + + if self.grouper.nkeys > 1: + index = MultiIndex.from_tuples(group_keys, names=self.grouper.names) + else: + index = Index(group_keys, name=self.grouper.names[0]) + return result, index + + def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) - if engine != "numba": - f = lambda x: func(x, *args, **kwargs) + f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict output: Dict[base.OutputKey, np.ndarray] = {} @@ -1069,21 +1101,11 @@ def _python_agg_general( # agg_series below assumes ngroups > 0 continue - if maybe_use_numba(engine): - result, counts = self.grouper.agg_series( - obj, - func, - *args, - engine=engine, - engine_kwargs=engine_kwargs, - **kwargs, - ) - else: - try: - # if this function is invalid for this dtype, we will ignore it. - result, counts = self.grouper.agg_series(obj, f) - except TypeError: - continue + try: + # if this function is invalid for this dtype, we will ignore it. + result, counts = self.grouper.agg_series(obj, f) + except TypeError: + continue assert result is not None key = base.OutputKey(label=name, position=idx) @@ -1573,8 +1595,7 @@ def max(self, numeric_only: bool = False, min_count: int = -1): def first(self, numeric_only: bool = False, min_count: int = -1): def first_compat(obj: FrameOrSeries, axis: int = 0): def first(x: Series): - """Helper function for first item that isn't NA. 
- """ + """Helper function for first item that isn't NA.""" x = x.array[notna(x.array)] if len(x) == 0: return np.nan @@ -1598,8 +1619,7 @@ def first(x: Series): def last(self, numeric_only: bool = False, min_count: int = -1): def last_compat(obj: FrameOrSeries, axis: int = 0): def last(x: Series): - """Helper function for last item that isn't NA. - """ + """Helper function for last item that isn't NA.""" x = x.array[notna(x.array)] if len(x) == 0: return np.nan diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8239a792c65dd..18970ea0544e4 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -568,7 +568,9 @@ def codes(self) -> np.ndarray: @cache_readonly def result_index(self) -> Index: if self.all_grouper is not None: - return recode_from_groupby(self.all_grouper, self.sort, self.group_index) + group_idx = self.group_index + assert isinstance(group_idx, CategoricalIndex) # set in __init__ + return recode_from_groupby(self.all_grouper, self.sort, group_idx) return self.group_index @property @@ -607,7 +609,7 @@ def get_grouper( mutated: bool = False, validate: bool = True, dropna: bool = True, -) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": +) -> Tuple["ops.BaseGrouper", List[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py new file mode 100644 index 0000000000000..aebe60f797fcd --- /dev/null +++ b/pandas/core/groupby/numba_.py @@ -0,0 +1,172 @@ +"""Common utilities for Numba operations with groupby ops""" +import inspect +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np + +from pandas._typing import FrameOrSeries, Scalar +from pandas.compat._optional import import_optional_dependency + +from pandas.core.util.numba_ import ( + NUMBA_FUNC_CACHE, + NumbaUtilError, + check_kwargs_and_nopython, + get_jit_arguments, + jit_user_function, +) + + +def split_for_numba(arg: FrameOrSeries) -> Tuple[np.ndarray, np.ndarray]: + """ + Split pandas object into its components as numpy arrays for numba functions. + + Parameters + ---------- + arg : Series or DataFrame + + Returns + ------- + (ndarray, ndarray) + values, index + """ + return arg.to_numpy(), arg.index.to_numpy() + + +def validate_udf(func: Callable) -> None: + """ + Validate user defined function for ops when using Numba with groupby ops. + + The first signature arguments should include: + + def f(values, index, ...): + ... + + Parameters + ---------- + func : function, default False + user defined function + + Returns + ------- + None + + Raises + ------ + NumbaUtilError + """ + udf_signature = list(inspect.signature(func).parameters.keys()) + expected_args = ["values", "index"] + min_number_args = len(expected_args) + if ( + len(udf_signature) < min_number_args + or udf_signature[:min_number_args] != expected_args + ): + raise NumbaUtilError( + f"The first {min_number_args} arguments to {func.__name__} must be " + f"{expected_args}" + ) + + +def generate_numba_func( + func: Callable, + engine_kwargs: Optional[Dict[str, bool]], + kwargs: dict, + cache_key_str: str, +) -> Tuple[Callable, Tuple[Callable, str]]: + """ + Return a JITed function and cache key for the NUMBA_FUNC_CACHE + + This _may_ be specific to groupby (as it's only used there currently). 
+ + Parameters + ---------- + func : function + user defined function + engine_kwargs : dict or None + numba.jit arguments + kwargs : dict + kwargs for func + cache_key_str : str + string representing the second part of the cache key tuple + + Returns + ------- + (JITed function, cache key) + + Raises + ------ + NumbaUtilError + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + check_kwargs_and_nopython(kwargs, nopython) + validate_udf(func) + cache_key = (func, cache_key_str) + numba_func = NUMBA_FUNC_CACHE.get( + cache_key, jit_user_function(func, nopython, nogil, parallel) + ) + return numba_func, cache_key + + +def generate_numba_agg_func( + args: Tuple, + kwargs: Dict[str, Any], + func: Callable[..., Scalar], + engine_kwargs: Optional[Dict[str, bool]], +) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: + """ + Generate a numba jitted agg function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a groupby agg function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. + + Parameters + ---------- + args : tuple + *args to be passed into the function + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + + Returns + ------- + Numba function + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + + check_kwargs_and_nopython(kwargs, nopython) + + validate_udf(func) + + numba_func = jit_user_function(func, nopython, nogil, parallel) + + numba = import_optional_dependency("numba") + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def group_apply( + values: np.ndarray, + index: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + num_groups: int, + num_columns: int, + ) -> np.ndarray: + result = np.empty((num_groups, num_columns)) + for i in loop_range(num_groups): + group_index = index[begin[i] : end[i]] + for j in loop_range(num_columns): + group = values[begin[i] : end[i], j] + result[i, j] = numba_func(group, group_index, *args) + return result + + return group_apply diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 64eb413fe78fa..4dd5b7f30e7f0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -55,12 +55,6 @@ get_group_index_sorter, get_indexer_dict, ) -from pandas.core.util.numba_ import ( - NUMBA_FUNC_CACHE, - generate_numba_func, - maybe_use_numba, - split_for_numba, -) class BaseGrouper: @@ -88,7 +82,7 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: "Sequence[grouper.Grouping]", + groupings: Sequence["grouper.Grouping"], sort: bool = True, group_keys: bool = True, mutated: bool = False, @@ -589,7 +583,7 @@ def transform(self, values, how: str, axis: int = 0, **kwargs): return self._cython_operation("transform", values, how, axis, **kwargs) def _aggregate( - self, result, counts, values, comp_ids, agg_func, min_count: int = -1, + self, result, counts, values, comp_ids, agg_func, min_count: int = -1 ): if agg_func is libgroupby.group_nth: # different signature from the others @@ -609,22 +603,10 @@ def _transform( return result - def agg_series( - self, - obj: Series, - func: F, - *args, - engine: str = "cython", - engine_kwargs=None, - **kwargs, - ): + def agg_series(self, 
obj: Series, func: F, *args, **kwargs): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 - if maybe_use_numba(engine): - return self._aggregate_series_pure_python( - obj, func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) if len(obj) == 0: # SeriesGrouper would raise if we were to call _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) @@ -669,21 +651,7 @@ def _aggregate_series_fast(self, obj: Series, func: F): result, counts = grouper.get_result() return result, counts - def _aggregate_series_pure_python( - self, - obj: Series, - func: F, - *args, - engine: str = "cython", - engine_kwargs=None, - **kwargs, - ): - - if maybe_use_numba(engine): - numba_func, cache_key = generate_numba_func( - func, engine_kwargs, kwargs, "groupby_agg" - ) - + def _aggregate_series_pure_python(self, obj: Series, func: F, *args, **kwargs): group_index, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) @@ -692,13 +660,7 @@ def _aggregate_series_pure_python( splitter = get_splitter(obj, group_index, ngroups, axis=0) for label, group in splitter: - if maybe_use_numba(engine): - values, index = split_for_numba(group) - res = numba_func(values, index, *args) - if cache_key not in NUMBA_FUNC_CACHE: - NUMBA_FUNC_CACHE[cache_key] = numba_func - else: - res = func(group, *args, **kwargs) + res = func(group, *args, **kwargs) if result is None: if isinstance(res, (Series, Index, np.ndarray)): @@ -875,15 +837,7 @@ def groupings(self) -> "List[grouper.Grouping]": for lvl, name in zip(self.levels, self.names) ] - def agg_series( - self, - obj: Series, - func: F, - *args, - engine: str = "cython", - engine_kwargs=None, - **kwargs, - ): + def agg_series(self, obj: Series, func: F, *args, **kwargs): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 30cc8cf480dcf..d352b001f5d2a 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -297,15 +297,16 @@ def all_indexes_same(indexes): Parameters ---------- - indexes : list of Index objects + indexes : iterable of Index objects Returns ------- bool True if all indexes contain the same elements, False otherwise. """ - first = indexes[0] - for index in indexes[1:]: + itr = iter(indexes) + first = next(itr) + for index in itr: if not first.equals(index): return False return True diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 623ce68201492..a07c3328def54 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -800,6 +800,9 @@ def copy(self, name=None, deep=False, dtype=None, names=None): deep : bool, default False dtype : numpy dtype or pandas type, optional Set dtype for new object. + + .. deprecated:: 1.2.0 + use ``astype`` method instead. names : list-like, optional Kept for compatibility with MultiIndex. Should not be used. @@ -820,6 +823,12 @@ def copy(self, name=None, deep=False, dtype=None, names=None): new_index = self._shallow_copy(name=name) if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. 
Use the astype method instead.", + FutureWarning, + stacklevel=2, + ) new_index = new_index.astype(dtype) return new_index @@ -924,7 +933,9 @@ def format( return self._format_with_header(header, na_rep=na_rep) - def _format_with_header(self, header, na_rep="NaN") -> List[str_t]: + def _format_with_header( + self, header: List[str_t], na_rep: str_t = "NaN" + ) -> List[str_t]: from pandas.io.formats.format import format_array values = self._values @@ -3530,10 +3541,7 @@ def _join_multi(self, other, how, return_indexers=True): if not overlap: raise ValueError("cannot join with no overlapping index names") - self_is_mi = isinstance(self, ABCMultiIndex) - other_is_mi = isinstance(other, ABCMultiIndex) - - if self_is_mi and other_is_mi: + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): # Drop the non-matching levels from left and right respectively ldrop_names = list(self_names - overlap) @@ -3579,7 +3587,7 @@ def _join_multi(self, other, how, return_indexers=True): # Case where only one index is multi # make the indices into mi's that match flip_order = False - if self_is_mi: + if isinstance(self, MultiIndex): self, other = other, self flip_order = True # flip if join method is right or left diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 4990e6a8e20e9..cbb30763797d1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -347,7 +347,7 @@ def _format_attrs(self): attrs.append(("length", len(self))) return attrs - def _format_with_header(self, header, na_rep="NaN") -> List[str]: + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: from pandas.io.formats.printing import pprint_thing result = [ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 6d9d75a69e91d..e7e93068d9175 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -1,7 +1,7 @@ """ Base and utility classes for tseries type pandas objects. """ -from datetime import datetime +from datetime import datetime, tzinfo from typing import Any, List, Optional, TypeVar, Union, cast import numpy as np @@ -81,9 +81,7 @@ def wrapper(left, right): DatetimeLikeArrayMixin, cache=True, ) -@inherit_names( - ["mean", "asi8", "freq", "freqstr", "_box_func"], DatetimeLikeArrayMixin, -) +@inherit_names(["mean", "asi8", "freq", "freqstr", "_box_func"], DatetimeLikeArrayMixin) class DatetimeIndexOpsMixin(ExtensionIndex): """ Common ops mixin to support a unified interface datetimelike Index. 
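For illustration, the migration implied by the deprecation above (a sketch, assuming pandas >= 1.2 behaviour): stop passing ``dtype`` to ``copy`` and cast explicitly instead:

    import pandas as pd

    idx = pd.Index([1, 2, 3])

    # deprecated: idx.copy(dtype="float64") now emits a FutureWarning
    new_idx = idx.copy().astype("float64")   # preferred: copy, then astype
    print(new_idx)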
@@ -354,15 +352,20 @@ def format( """ header = [] if name: - fmt_name = ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) - header.append(fmt_name) + header.append( + ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) if formatter is not None: return header + list(self.map(formatter)) return self._format_with_header(header, na_rep=na_rep, date_format=date_format) - def _format_with_header(self, header, na_rep="NaT", date_format=None) -> List[str]: + def _format_with_header( + self, header: List[str], na_rep: str = "NaT", date_format: Optional[str] = None + ) -> List[str]: return header + list( self._format_native_types(na_rep=na_rep, date_format=date_format) ) @@ -627,6 +630,8 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): but not PeriodIndex """ + tz: Optional[tzinfo] + # Compat for frequency inference, see GH#23789 _is_monotonic_increasing = Index.is_monotonic_increasing _is_monotonic_decreasing = Index.is_monotonic_decreasing @@ -704,16 +709,16 @@ def intersection(self, other, sort=False): if result.freq is None: # TODO: no tests rely on this; needed? result = result._with_freq("infer") - assert result.name == res_name + result.name = res_name return result elif not self._can_fast_intersect(other): result = Index.intersection(self, other, sort=sort) - assert result.name == res_name # We need to invalidate the freq because Index.intersection # uses _shallow_copy on a view of self._data, which will preserve # self.freq if we're not careful. result = result._with_freq(None)._with_freq("infer") + result.name = res_name return result # to make our life easier, "sort" the two ranges diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index f71fd0d406c54..e66f513e347a9 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -75,7 +75,7 @@ def _new_DatetimeIndex(cls, d): + [ method for method in DatetimeArray._datetimelike_methods - if method not in ("tz_localize",) + if method not in ("tz_localize", "tz_convert") ], DatetimeArray, wrap=True, @@ -228,6 +228,11 @@ class DatetimeIndex(DatetimeTimedeltaMixin): # -------------------------------------------------------------------- # methods that dispatch to array and wrap result in DatetimeIndex + @doc(DatetimeArray.tz_convert) + def tz_convert(self, tz) -> "DatetimeIndex": + arr = self._data.tz_convert(tz) + return type(self)._simple_new(arr, name=self.name) + @doc(DatetimeArray.tz_localize) def tz_localize( self, tz, ambiguous="raise", nonexistent="raise" diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index e8d0a44324cc5..5d309ef7cd515 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -182,10 +182,10 @@ def func(intvidx_self, other, sort=False): ) @inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True) @inherit_names( - ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray, + ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray ) @inherit_names( - ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True, + ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True ) class IntervalIndex(IntervalMixin, ExtensionIndex): _typ = "intervalindex" @@ -948,7 +948,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): # Rendering Methods # __repr__ associated methods are based on MultiIndex - def 
_format_with_header(self, header, na_rep="NaN") -> List[str]: + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: return header + list(self._format_native_types(na_rep=na_rep)) def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ffbd03d0c3ba7..b29c27982f087 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1030,7 +1030,6 @@ def _shallow_copy( name=lib.no_default, levels=None, codes=None, - dtype=None, sortorder=None, names=lib.no_default, _set_identity: bool = True, @@ -1041,7 +1040,7 @@ def _shallow_copy( names = name if name is not lib.no_default else self.names if values is not None: - assert levels is None and codes is None and dtype is None + assert levels is None and codes is None return MultiIndex.from_tuples(values, sortorder=sortorder, names=names) levels = levels if levels is not None else self.levels @@ -1050,7 +1049,6 @@ def _shallow_copy( result = MultiIndex( levels=levels, codes=codes, - dtype=dtype, sortorder=sortorder, names=names, verify_integrity=False, @@ -1092,6 +1090,8 @@ def copy( ---------- names : sequence, optional dtype : numpy dtype or pandas type, optional + + .. deprecated:: 1.2.0 levels : sequence, optional codes : sequence, optional deep : bool, default False @@ -1117,15 +1117,24 @@ def copy( if codes is None: codes = deepcopy(self.codes) - return self._shallow_copy( + new_index = self._shallow_copy( levels=levels, codes=codes, names=names, - dtype=dtype, sortorder=self.sortorder, _set_identity=_set_identity, ) + if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. Use the astype method instead.", + FutureWarning, + stacklevel=2, + ) + new_index = new_index.astype(dtype) + return new_index + def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ return self.values diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 731907993d08f..cd3f1f51a86d2 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -45,6 +45,8 @@ class NumericIndex(Index): This is an abstract class. 
""" + _default_dtype: np.dtype + _is_numeric_dtype = True def __new__(cls, data=None, dtype=None, copy=False, name=None): @@ -436,7 +438,7 @@ def isin(self, values, level=None): def _is_compatible_with_other(self, other) -> bool: return super()._is_compatible_with_other(other) or all( isinstance( - obj, (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex), + obj, (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex) ) for obj in [self, other] ) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index c65c3d5ff3d9c..f1457a9aac62b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any +from typing import Any, List import warnings import numpy as np @@ -82,7 +82,7 @@ class RangeIndex(Int64Index): # Constructors def __new__( - cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, + cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None ): cls._validate_dtype(dtype) @@ -187,6 +187,15 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: + if not len(self._range): + return header + first_val_str = str(self._range[0]) + last_val_str = str(self._range[-1]) + max_length = max(len(first_val_str), len(last_val_str)) + + return header + [f"{x:<{max_length}}" for x in self._range] + # -------------------------------------------------------------------- _deprecation_message = ( "RangeIndex.{} is deprecated and will be " @@ -390,10 +399,17 @@ def _shallow_copy(self, values=None, name: Label = no_default): @doc(Int64Index.copy) def copy(self, name=None, deep=False, dtype=None, names=None): - self._validate_dtype(dtype) - name = self._validate_names(name=name, names=names, deep=deep)[0] new_index = self._shallow_copy(name=name) + + if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. 
Use the astype method instead.", + FutureWarning, + stacklevel=2, + ) + new_index = new_index.astype(dtype) return new_index def _minmax(self, meth: str): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f3286b3c20965..1b42df1b0147c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -346,6 +346,21 @@ def apply(self, func, **kwargs) -> List["Block"]: return self._split_op_result(result) + def reduce(self, func) -> List["Block"]: + # We will apply the function and reshape the result into a single-row + # Block with the same mgr_locs; squeezing will be done at a higher level + assert self.ndim == 2 + + result = func(self.values) + if np.ndim(result) == 0: + # TODO(EA2D): special case not needed with 2D EAs + res_values = np.array([[result]]) + else: + res_values = result.reshape(-1, 1) + + nb = self.make_block(res_values) + return [nb] + def _split_op_result(self, result) -> List["Block"]: # See also: split_and_operate if is_extension_array_dtype(result) and result.ndim > 1: @@ -709,7 +724,7 @@ def replace( # _can_hold_element checks have reduced this back to the # scalar case and we can avoid a costly object cast return self.replace( - to_replace[0], value, inplace=inplace, regex=regex, convert=convert, + to_replace[0], value, inplace=inplace, regex=regex, convert=convert ) # GH 22083, TypeError or ValueError occurred within error handling @@ -890,7 +905,7 @@ def setitem(self, indexer, value): return block def putmask( - self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, + self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False ) -> List["Block"]: """ putmask the data to the block; it is possible that we may create a @@ -1277,7 +1292,7 @@ def shift(self, periods: int, axis: int = 0, fill_value=None): return [self.make_block(new_values)] def where( - self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 ) -> List["Block"]: """ evaluate the block; return result block(s) from the result @@ -1351,7 +1366,7 @@ def where_func(cond, values, other): # we are explicitly ignoring errors block = self.coerce_to_target_dtype(other) blocks = block.where( - orig_other, cond, errors=errors, try_cast=try_cast, axis=axis, + orig_other, cond, errors=errors, try_cast=try_cast, axis=axis ) return self._maybe_downcast(blocks, "infer") @@ -1367,7 +1382,7 @@ def where_func(cond, values, other): cond = cond.swapaxes(axis, 0) mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool) - result_blocks = [] + result_blocks: List["Block"] = [] for m in [mask, ~mask]: if m.any(): taken = result.take(m.nonzero()[0], axis=axis) @@ -1590,7 +1605,7 @@ def set(self, locs, values): self.values = values def putmask( - self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, + self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False ) -> List["Block"]: """ See Block.putmask.__doc__ @@ -1801,7 +1816,7 @@ def diff(self, n: int, axis: int = 1) -> List["Block"]: return super().diff(n, axis) def shift( - self, periods: int, axis: int = 0, fill_value: Any = None, + self, periods: int, axis: int = 0, fill_value: Any = None ) -> List["ExtensionBlock"]: """ Shift the block by `periods`. 
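For illustration, a small sketch of the shape handling in the new ``Block.reduce`` above: the reduction result is reshaped to 2D so it can be re-wrapped in a single-row Block, with a scalar becoming a (1, 1) array and a 1D per-column result an (n, 1) array. The helper name and arrays are illustrative only:

    import numpy as np

    def reduce_block_values(values, func):
        result = func(values)
        if np.ndim(result) == 0:
            return np.array([[result]])   # scalar result -> shape (1, 1)
        return result.reshape(-1, 1)      # one value per column -> shape (n, 1)

    print(reduce_block_values(np.arange(6).reshape(2, 3), lambda v: v.sum(axis=1)).shape)
    print(reduce_block_values(np.arange(3), np.sum).shape)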
@@ -1818,7 +1833,7 @@ def shift( ] def where( - self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 ) -> List["Block"]: cond = _extract_bool_array(cond) @@ -1930,7 +1945,7 @@ def _can_hold_element(self, element: Any) -> bool: ) def to_native_types( - self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs, + self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs ): """ convert to our native types format """ values = self.values @@ -2354,7 +2369,7 @@ def replace(self, to_replace, value, inplace=False, regex=False, convert=True): if not np.can_cast(to_replace_values, bool): return self return super().replace( - to_replace, value, inplace=inplace, regex=regex, convert=convert, + to_replace, value, inplace=inplace, regex=regex, convert=convert ) @@ -2438,18 +2453,18 @@ def replace(self, to_replace, value, inplace=False, regex=False, convert=True): if not either_list and is_re(to_replace): return self._replace_single( - to_replace, value, inplace=inplace, regex=True, convert=convert, + to_replace, value, inplace=inplace, regex=True, convert=convert ) elif not (either_list or regex): return super().replace( - to_replace, value, inplace=inplace, regex=regex, convert=convert, + to_replace, value, inplace=inplace, regex=regex, convert=convert ) elif both_lists: for to_rep, v in zip(to_replace, value): result_blocks = [] for b in blocks: result = b._replace_single( - to_rep, v, inplace=inplace, regex=regex, convert=convert, + to_rep, v, inplace=inplace, regex=regex, convert=convert ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks @@ -2460,18 +2475,18 @@ def replace(self, to_replace, value, inplace=False, regex=False, convert=True): result_blocks = [] for b in blocks: result = b._replace_single( - to_rep, value, inplace=inplace, regex=regex, convert=convert, + to_rep, value, inplace=inplace, regex=regex, convert=convert ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks return self._replace_single( - to_replace, value, inplace=inplace, convert=convert, regex=regex, + to_replace, value, inplace=inplace, convert=convert, regex=regex ) def _replace_single( - self, to_replace, value, inplace=False, regex=False, convert=True, mask=None, + self, to_replace, value, inplace=False, regex=False, convert=True, mask=None ): """ Replace elements by the given value. diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 2c0d4931a7bf2..88839d2211f81 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,10 +1,11 @@ from collections import defaultdict import copy -from typing import List +from typing import Dict, List import numpy as np from pandas._libs import NaT, internals as libinternals +from pandas._typing import DtypeObj from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote @@ -29,7 +30,7 @@ def concatenate_block_managers( - mgrs_indexers, axes, concat_axis: int, copy: bool, + mgrs_indexers, axes, concat_axis: int, copy: bool ) -> BlockManager: """ Concatenate block managers into one. 
@@ -76,7 +77,7 @@ def concatenate_block_managers( b = make_block(values, placement=placement, ndim=blk.ndim) else: b = make_block( - _concatenate_join_units(join_units, concat_axis, copy=copy,), + _concatenate_join_units(join_units, concat_axis, copy=copy), placement=placement, ) blocks.append(b) @@ -100,10 +101,10 @@ def _get_mgr_concatenation_plan(mgr, indexers): """ # Calculate post-reindex shape , save for item axis which will be separate # for each block anyway. - mgr_shape = list(mgr.shape) + mgr_shape_list = list(mgr.shape) for ax, indexer in indexers.items(): - mgr_shape[ax] = len(indexer) - mgr_shape = tuple(mgr_shape) + mgr_shape_list[ax] = len(indexer) + mgr_shape = tuple(mgr_shape_list) if 0 in indexers: ax0_indexer = indexers.pop(0) @@ -126,9 +127,9 @@ def _get_mgr_concatenation_plan(mgr, indexers): join_unit_indexers = indexers.copy() - shape = list(mgr_shape) - shape[0] = len(placements) - shape = tuple(shape) + shape_list = list(mgr_shape) + shape_list[0] = len(placements) + shape = tuple(shape_list) if blkno == -1: unit = JoinUnit(None, shape) @@ -339,7 +340,7 @@ def _concatenate_join_units(join_units, concat_axis, copy): # 2D to put it a non-EA Block concat_values = np.atleast_2d(concat_values) else: - concat_values = concat_compat(to_concat, axis=concat_axis,) + concat_values = concat_compat(to_concat, axis=concat_axis) return concat_values @@ -374,8 +375,8 @@ def _get_empty_dtype_and_na(join_units): else: dtypes[i] = unit.dtype - upcast_classes = defaultdict(list) - null_upcast_classes = defaultdict(list) + upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f05d4cf1c4be6..00321b76cb6bf 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -330,31 +330,18 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self, func): + def reduce(self: T, func) -> T: # If 2D, we assume that we're operating column-wise - if self.ndim == 1: - # we'll be returning a scalar - blk = self.blocks[0] - return func(blk.values) + assert self.ndim == 2 - res = {} + res_blocks: List[Block] = [] for blk in self.blocks: - bres = func(blk.values) + nbs = blk.reduce(func) + res_blocks.extend(nbs) - if np.ndim(bres) == 0: - # EA - assert blk.shape[0] == 1 - new_res = zip(blk.mgr_locs.as_array, [bres]) - else: - assert bres.ndim == 1, bres.shape - assert blk.shape[0] == len(bres), (blk.shape, bres.shape) - new_res = zip(blk.mgr_locs.as_array, bres) - - nr = dict(new_res) - assert not any(key in res for key in nr) - res.update(nr) - - return res + index = Index([0]) # placeholder + new_mgr = BlockManager.from_blocks(res_blocks, [self.items, index]) + return new_mgr def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager": """ @@ -504,7 +491,7 @@ def get_axe(block, qs, axes): values = values.take(indexer) return SingleBlockManager( - make_block(values, ndim=1, placement=np.arange(len(values))), axes[0], + make_block(values, ndim=1, placement=np.arange(len(values))), axes[0] ) def isna(self, func) -> "BlockManager": @@ -532,9 +519,7 @@ def where( def setitem(self, indexer, value) -> "BlockManager": return self.apply("setitem", indexer=indexer, value=value) - def putmask( - self, mask, new, align: bool = True, axis: int = 0, - ): + def putmask(self, mask, new, align: bool = True, axis: 
int = 0): transpose = self.ndim == 2 if align: @@ -743,7 +728,7 @@ def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) - new_blocks = [] + new_blocks: List[Block] = [] for b in blocks: b = b.copy(deep=copy) b.mgr_locs = inv_indexer[b.mgr_locs.indexer] @@ -909,12 +894,7 @@ def to_dict(self, copy: bool = True): Returns ------- values : a dict of dtype -> BlockManager - - Notes - ----- - This consolidates based on str(dtype) """ - self._consolidate_inplace() bd: Dict[str, List[Block]] = {} for b in self.blocks: @@ -1045,6 +1025,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items """ + value = extract_array(value, extract_numpy=True) # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical if self._blklocs is None and self.ndim > 1: @@ -1504,38 +1485,6 @@ def unstack(self, unstacker, fill_value) -> "BlockManager": bm = BlockManager(new_blocks, [new_columns, new_index]) return bm - def reset_dropped_locs(self, blocks: List[Block], skipped: List[int]) -> Index: - """ - Decrement the mgr_locs of the given blocks with `skipped` removed. - - Notes - ----- - Alters each block's mgr_locs inplace. - """ - ncols = len(self) - - new_locs = [blk.mgr_locs.as_array for blk in blocks] - indexer = np.concatenate(new_locs) - - new_items = self.items.take(np.sort(indexer)) - - if skipped: - # we need to adjust the indexer to account for the - # items we have removed - deleted_items = [self.blocks[i].mgr_locs.as_array for i in skipped] - deleted = np.concatenate(deleted_items) - ai = np.arange(ncols) - mask = np.zeros(ncols) - mask[deleted] = 1 - indexer = (ai - mask.cumsum())[indexer] - - offset = 0 - for blk in blocks: - loc = len(blk.mgr_locs) - blk.mgr_locs = indexer[offset : (offset + loc)] - offset += loc - return new_items - class SingleBlockManager(BlockManager): """ manage a single block with """ @@ -1973,7 +1922,7 @@ def _compare_or_regex_search( """ def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern], + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] ): """ Raises an error if the two arrays (a,b) cannot be compared. 
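
Note: the BlockManager.reduce rewrite above can be read as follows: each homogeneous block is reduced on its own, a scalar-per-column result is reshaped into a single row, and the collected one-row blocks form a placeholder-indexed manager that the caller squeezes back to a Series at a higher level. Below is a minimal NumPy-only sketch of that reshaping step, assuming a block laid out as (n_columns, n_rows); it is illustrative only, not the pandas internals API, and the helper name is hypothetical.

    import numpy as np

    def reduce_block(values: np.ndarray, func) -> np.ndarray:
        # Mirror the Block.reduce idea: keep the result 2D, one value per column.
        result = func(values)
        if np.ndim(result) == 0:
            return np.array([[result]])      # scalar result -> 1x1 "row"
        return result.reshape(-1, 1)         # one value per column, single row

    block = np.arange(6).reshape(2, 3)                      # two columns, three rows
    print(reduce_block(block, lambda v: v.sum(axis=1)))     # [[ 3] [12]]
    print(reduce_block(block, lambda v: v.sum()))           # [[15]]
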
diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index ae4892c720d5b..05f5f9a00ae1b 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -11,7 +11,7 @@ BlockPairInfo = namedtuple( - "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"], + "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"] ) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e7e28798d84a2..e3f16a3ef4f90 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1329,7 +1329,7 @@ def _zero_out_fperr(arg): @disallow("M8", "m8") def nancorr( - a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None, + a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None ): """ a, b: ndarrays diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 4ace873f029ae..99c2fefc97ae7 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -31,7 +31,7 @@ def _make_flex_doc(op_name, typ): base_doc = _flex_doc_SERIES if op_desc["reverse"]: base_doc += _see_also_reverse_SERIES.format( - reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"], + reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"] ) doc_no_examples = base_doc.format( desc=op_desc["desc"], diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e82a1d4d2cda8..fc54128ae5aa6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -795,7 +795,7 @@ def interpolate( """ Interpolate values according to different methods. """ - result = self._upsample(None) + result = self._upsample("asfreq") return result.interpolate( method=method, axis=axis, diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 9e8fb643791f2..299b68c6e71e0 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -500,7 +500,7 @@ def get_result(self): mgrs_indexers.append((obj._mgr, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy, + mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy ) if not self.copy: new_data._consolidate_inplace() diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 1ba6854a79265..8724f7674f0c8 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -144,14 +144,43 @@ def melt( @deprecate_kwarg(old_arg_name="label", new_arg_name=None) def lreshape(data: "DataFrame", groups, dropna: bool = True, label=None) -> "DataFrame": """ - Reshape long-format data to wide. Generalized inverse of DataFrame.pivot + Reshape wide-format data to long. Generalized inverse of DataFrame.pivot. + + Accepts a dictionary, ``groups``, in which each key is a new column name + and each value is a list of old column names that will be "melted" under + the new column name as part of the reshape. Parameters ---------- data : DataFrame + The wide-format DataFrame. groups : dict - {new_name : list_of_columns} - dropna : boolean, default True + {new_name : list_of_columns}. + dropna : bool, default True + Do not include columns whose entries are all NaN. + label : None + Not used. + + .. deprecated:: 1.0.0 + + Returns + ------- + DataFrame + Reshaped DataFrame. + + See Also + -------- + melt : Unpivot a DataFrame from wide to long format, optionally leaving + identifiers set. + pivot : Create a spreadsheet-style pivot table as a DataFrame. 
+ DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. Examples -------- @@ -169,10 +198,6 @@ def lreshape(data: "DataFrame", groups, dropna: bool = True, label=None) -> "Dat 1 Yankees 2007 573 2 Red Sox 2008 545 3 Yankees 2008 526 - - Returns - ------- - reshaped : DataFrame """ if isinstance(groups, dict): keys = list(groups.keys()) @@ -262,6 +287,18 @@ def wide_to_long( A DataFrame that contains each stub name as a variable, with new index (i, j). + See Also + -------- + melt : Unpivot a DataFrame from wide to long format, optionally leaving + identifiers set. + pivot : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + Notes ----- All extra variables are left untouched. This simply uses diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ea5916eff3afa..969ac56e41860 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -239,7 +239,7 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name, + table, data, values, rows, cols, aggfunc, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -308,7 +308,7 @@ def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): def _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All", + table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All" ): if len(cols) > 0: # need to "interleave" the margins @@ -670,12 +670,11 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): # keep index and column of pivoted table table_index = table.index table_columns = table.columns + last_ind_or_col = table.iloc[-1, :].name - # check if margin name is in (for MI cases) or equal to last + # check if margin name is not in (for MI cases) and not equal to last # index/column and save the column and index margin - if (margins_name not in table.iloc[-1, :].name) | ( - margins_name != table.iloc[:, -1].name - ): + if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col): raise ValueError(f"{margins_name} not in pivoted DataFrame") column_margin = table.iloc[:-1, -1] index_margin = table.iloc[-1, :-1] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 391313fbb5283..e81dd8f0c735c 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -81,9 +81,7 @@ class _Unstacker: unstacked : DataFrame """ - def __init__( - self, index: MultiIndex, level=-1, constructor=None, - ): + def __init__(self, index: MultiIndex, level=-1, constructor=None): if constructor is None: constructor = DataFrame @@ -422,7 +420,7 @@ def unstack(obj, level, fill_value=None): if is_extension_array_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker( - obj.index, level=level, 
constructor=obj._constructor_expanddim, + obj.index, level=level, constructor=obj._constructor_expanddim ) return unstacker.get_result( obj.values, value_columns=None, fill_value=fill_value @@ -436,7 +434,7 @@ def _unstack_frame(obj, level, fill_value=None): return obj._constructor(mgr) else: return _Unstacker( - obj.index, level=level, constructor=obj._constructor, + obj.index, level=level, constructor=obj._constructor ).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value) diff --git a/pandas/core/series.py b/pandas/core/series.py index e8bf87a39b572..a8a2d300fa168 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -962,12 +962,12 @@ def _get_values_tuple(self, key): # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) return self._constructor(self._values[indexer], index=new_index).__finalize__( - self, + self ) def _get_values(self, indexer): try: - return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self,) + return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self) except ValueError: # mpl compat if we look up e.g. ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack @@ -1800,7 +1800,9 @@ def count(self, level=None): def mode(self, dropna=True) -> "Series": """ - Return the mode(s) of the dataset. + Return the mode(s) of the Series. + + The mode is the value that appears most often. There can be multiple modes. Always returns Series even if only one value is returned. @@ -4637,7 +4639,7 @@ def memory_usage(self, index=True, deep=False): >>> s.memory_usage() 144 >>> s.memory_usage(deep=True) - 260 + 244 """ v = super().memory_usage(deep=deep) if index: @@ -4998,7 +5000,6 @@ def to_period(self, freq=None, copy=True) -> "Series": Series._add_numeric_operations() -Series._add_series_or_dataframe_operations() # Add arithmetic! ops.add_flex_arithmetic_methods(Series) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index b81942f062b19..0aaccb47efc44 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -2,117 +2,258 @@ _shared_docs: Dict[str, str] = dict() +_shared_docs[ + "aggregate" +] = """\ +Aggregate using one or more operations over the specified axis. +{versionadded} +Parameters +---------- +func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. +{axis} +*args + Positional arguments to pass to `func`. +**kwargs + Keyword arguments to pass to `func`. + +Returns +------- +scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + + Return scalar, Series or DataFrame. +{see_also} +Notes +----- +`agg` is an alias for `aggregate`. Use the alias. + +A passed user-defined-function will be passed a Series for evaluation. +{examples}""" + +_shared_docs[ + "compare" +] = """\ +Compare to another %(klass)s and show the differences. + +.. versionadded:: 1.1.0 + +Parameters +---------- +other : %(klass)s + Object to compare with. 
+ +align_axis : {0 or 'index', 1 or 'columns'}, default 1 + Determine which axis to align the comparison on. + + * 0, or 'index' : Resulting differences are stacked vertically + with rows drawn alternately from self and other. + * 1, or 'columns' : Resulting differences are aligned horizontally + with columns drawn alternately from self and other. + +keep_shape : bool, default False + If true, all rows and columns are kept. + Otherwise, only the ones with different values are kept. + +keep_equal : bool, default False + If true, the result keeps values that are equal. + Otherwise, equal values are shown as NaNs. +""" + +_shared_docs[ + "groupby" +] = """\ +Group %(klass)s using a mapper or by a Series of columns. + +A groupby operation involves some combination of splitting the +object, applying a function, and combining the results. This can be +used to group large amounts of data and compute operations on these +groups. + +Parameters +---------- +by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. + If ``by`` is a function, it's called on each value of the object's + index. If a dict or Series is passed, the Series or dict VALUES + will be used to determine the groups (the Series' values are first + aligned; see ``.align()`` method). If an ndarray is passed, the + values are used as-is determine the groups. A label or list of + labels may be passed to group by the columns in ``self``. Notice + that a tuple is interpreted as a (single) key. +axis : {0 or 'index', 1 or 'columns'}, default 0 + Split along rows (0) or columns (1). +level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. +as_index : bool, default True + For aggregated output, return object with group labels as the + index. Only relevant for DataFrame input. as_index=False is + effectively "SQL-style" grouped output. +sort : bool, default True + Sort group keys. Get better performance by turning this off. + Note this does not influence the order of observations within each + group. Groupby preserves the order of rows within each group. +group_keys : bool, default True + When calling apply, add group keys to index to identify pieces. +squeeze : bool, default False + Reduce the dimensionality of the return type if possible, + otherwise return a consistent type. + + .. deprecated:: 1.1.0 + +observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionadded:: 0.23.0 +dropna : bool, default True + If True, and if group keys contain NA values, NA values together + with row/column will be dropped. + If False, NA values will also be treated as the key in groups + + .. versionadded:: 1.1.0 + +Returns +------- +%(klass)sGroupBy + Returns a groupby object that contains information about the groups. + +See Also +-------- +resample : Convenience method for frequency conversion and resampling + of time series. + +Notes +----- +See the `user guide +`_ for more. +""" _shared_docs[ "melt" -] = """ - Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. 
- - This function is useful to massage a DataFrame into a format where one - or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to - the row axis, leaving just two non-identifier columns, 'variable' and - 'value'. - %(versionadded)s - Parameters - ---------- - id_vars : tuple, list, or ndarray, optional - Column(s) to use as identifier variables. - value_vars : tuple, list, or ndarray, optional - Column(s) to unpivot. If not specified, uses all columns that - are not set as `id_vars`. - var_name : scalar - Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. - value_name : scalar, default 'value' - Name to use for the 'value' column. - col_level : int or str, optional - If columns are a MultiIndex then use this level to melt. - ignore_index : bool, default True - If True, original index is ignored. If False, the original index is retained. - Index labels will be repeated as necessary. - - .. versionadded:: 1.1.0 - - Returns - ------- - DataFrame - Unpivoted DataFrame. - - See Also - -------- - %(other)s : Identical method. - pivot_table : Create a spreadsheet-style pivot table as a DataFrame. - DataFrame.pivot : Return reshaped DataFrame organized - by given index / column values. - DataFrame.explode : Explode a DataFrame from list-like - columns to long format. - - Examples - -------- - >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, - ... 'B': {0: 1, 1: 3, 2: 5}, - ... 'C': {0: 2, 1: 4, 2: 6}}) - >>> df - A B C - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> %(caller)sid_vars=['A'], value_vars=['B']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 3 a C 2 - 4 b C 4 - 5 c C 6 - - The names of 'variable' and 'value' columns can be customized: - - >>> %(caller)sid_vars=['A'], value_vars=['B'], - ... var_name='myVarname', value_name='myValname') - A myVarname myValname - 0 a B 1 - 1 b B 3 - 2 c B 5 - - Original index values can be kept around: - - >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 0 a C 2 - 1 b C 4 - 2 c C 6 - - If you have multi-index columns: - - >>> df.columns = [list('ABC'), list('DEF')] - >>> df - A B C - D E F - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) - (A, D) variable_0 variable_1 value - 0 a B E 1 - 1 b B E 3 - 2 c B E 5 - """ +] = """\ +Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + +This function is useful to massage a DataFrame into a format where one +or more columns are identifier variables (`id_vars`), while all other +columns, considered measured variables (`value_vars`), are "unpivoted" to +the row axis, leaving just two non-identifier columns, 'variable' and +'value'. +%(versionadded)s +Parameters +---------- +id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. +value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. +var_name : scalar + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. +value_name : scalar, default 'value' + Name to use for the 'value' column. 
+col_level : int or str, optional + If columns are a MultiIndex then use this level to melt. +ignore_index : bool, default True + If True, original index is ignored. If False, the original index is retained. + Index labels will be repeated as necessary. + + .. versionadded:: 1.1.0 + +Returns +------- +DataFrame + Unpivoted DataFrame. + +See Also +-------- +%(other)s : Identical method. +pivot_table : Create a spreadsheet-style pivot table as a DataFrame. +DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. +DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + +Examples +-------- +>>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, +... 'B': {0: 1, 1: 3, 2: 5}, +... 'C': {0: 2, 1: 4, 2: 6}}) +>>> df + A B C +0 a 1 2 +1 b 3 4 +2 c 5 6 + +>>> %(caller)sid_vars=['A'], value_vars=['B']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 + +>>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 +3 a C 2 +4 b C 4 +5 c C 6 + +The names of 'variable' and 'value' columns can be customized: + +>>> %(caller)sid_vars=['A'], value_vars=['B'], +... var_name='myVarname', value_name='myValname') + A myVarname myValname +0 a B 1 +1 b B 3 +2 c B 5 + +Original index values can be kept around: + +>>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 +0 a C 2 +1 b C 4 +2 c C 6 + +If you have multi-index columns: + +>>> df.columns = [list('ABC'), list('DEF')] +>>> df + A B C + D E F +0 a 1 2 +1 b 3 4 +2 c 5 6 + +>>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 + +>>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) + (A, D) variable_0 variable_1 value +0 a B E 1 +1 b B E 3 +2 c B E 5 +""" diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3c1fe6bacefcf..8fcc5f74ea897 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -307,9 +307,7 @@ def _convert_listlike_datetimes( if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) if tz == "utc": - # error: Item "DatetimeIndex" of "Union[DatetimeArray, DatetimeIndex]" has - # no attribute "tz_convert" - arg = arg.tz_convert(None).tz_localize(tz) # type: ignore[union-attr] + arg = arg.tz_convert(None).tz_localize(tz) return arg elif is_datetime64_ns_dtype(arg_dtype): diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 41548931f17f8..cff4695603d06 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -40,13 +40,13 @@ def to_numeric(arg, errors="raise", downcast=None): - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. 
- downcast : {'int', 'signed', 'unsigned', 'float'}, default None + downcast : {'integer', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - - 'int' or 'signed': smallest signed int dtype (min.: np.int8) + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index c9b7943478cdd..b951cd4f0cc2a 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -1,12 +1,10 @@ """Common utilities for Numba operations""" from distutils.version import LooseVersion -import inspect import types from typing import Callable, Dict, Optional, Tuple import numpy as np -from pandas._typing import FrameOrSeries from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError @@ -129,94 +127,3 @@ def impl(data, *_args): return impl return numba_func - - -def split_for_numba(arg: FrameOrSeries) -> Tuple[np.ndarray, np.ndarray]: - """ - Split pandas object into its components as numpy arrays for numba functions. - - Parameters - ---------- - arg : Series or DataFrame - - Returns - ------- - (ndarray, ndarray) - values, index - """ - return arg.to_numpy(), arg.index.to_numpy() - - -def validate_udf(func: Callable) -> None: - """ - Validate user defined function for ops when using Numba. - - The first signature arguments should include: - - def f(values, index, ...): - ... - - Parameters - ---------- - func : function, default False - user defined function - - Returns - ------- - None - - Raises - ------ - NumbaUtilError - """ - udf_signature = list(inspect.signature(func).parameters.keys()) - expected_args = ["values", "index"] - min_number_args = len(expected_args) - if ( - len(udf_signature) < min_number_args - or udf_signature[:min_number_args] != expected_args - ): - raise NumbaUtilError( - f"The first {min_number_args} arguments to {func.__name__} must be " - f"{expected_args}" - ) - - -def generate_numba_func( - func: Callable, - engine_kwargs: Optional[Dict[str, bool]], - kwargs: dict, - cache_key_str: str, -) -> Tuple[Callable, Tuple[Callable, str]]: - """ - Return a JITed function and cache key for the NUMBA_FUNC_CACHE - - This _may_ be specific to groupby (as it's only used there currently). 
- - Parameters - ---------- - func : function - user defined function - engine_kwargs : dict or None - numba.jit arguments - kwargs : dict - kwargs for func - cache_key_str : str - string representing the second part of the cache key tuple - - Returns - ------- - (JITed function, cache key) - - Raises - ------ - NumbaUtilError - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - check_kwargs_and_nopython(kwargs, nopython) - validate_udf(func) - cache_key = (func, cache_key_str) - numba_func = NUMBA_FUNC_CACHE.get( - cache_key, jit_user_function(func, nopython, nogil, parallel) - ) - return numba_func, cache_key diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 51a067427e867..2f3058db4493b 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -7,9 +7,9 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.indexes.api import MultiIndex +from pandas.core.shared_docs import _shared_docs _shared_docs = dict(**_shared_docs) _doc_template = """ diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index c57c434dd3040..1913b51a68c15 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -362,7 +362,7 @@ def var(self, bias: bool = False, *args, **kwargs): def f(arg): return window_aggregations.ewmcov( - arg, arg, self.com, self.adjust, self.ignore_na, self.min_periods, bias, + arg, arg, self.com, self.adjust, self.ignore_na, self.min_periods, bias ) return self._apply(f) @@ -458,7 +458,7 @@ def _get_corr(X, Y): def _cov(x, y): return window_aggregations.ewmcov( - x, y, self.com, self.adjust, self.ignore_na, self.min_periods, 1, + x, y, self.com, self.adjust, self.ignore_na, self.min_periods, 1 ) x_values = X._prep_values() diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 7cbe34cdebf9f..a21521f4ce8bb 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -7,6 +7,8 @@ from pandas._libs.window.indexers import calculate_variable_window_bounds from pandas.util._decorators import Appender +from pandas.core.dtypes.common import ensure_platform_int + from pandas.tseries.offsets import Nano get_window_bounds_doc = """ @@ -38,7 +40,7 @@ class BaseIndexer: """Base class for window bounds calculations.""" def __init__( - self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, + self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs ): """ Parameters @@ -103,7 +105,7 @@ def get_window_bounds( ) -> Tuple[np.ndarray, np.ndarray]: return calculate_variable_window_bounds( - num_values, self.window_size, min_periods, center, closed, self.index_array, + num_values, self.window_size, min_periods, center, closed, self.index_array ) @@ -296,9 +298,9 @@ def get_window_bounds( start_arrays = [] end_arrays = [] window_indicies_start = 0 - for key, indicies in self.groupby_indicies.items(): + for key, indices in self.groupby_indicies.items(): if self.index_array is not None: - index_array = self.index_array.take(indicies) + index_array = self.index_array.take(ensure_platform_int(indices)) else: index_array = self.index_array indexer = self.rolling_indexer( @@ -307,22 +309,22 @@ def get_window_bounds( **self.indexer_kwargs, ) start, end = indexer.get_window_bounds( - len(indicies), min_periods, center, closed + len(indices), min_periods, center, closed ) start = start.astype(np.int64) end = 
end.astype(np.int64) # Cannot use groupby_indicies as they might not be monotonic with the object # we're rolling over window_indicies = np.arange( - window_indicies_start, window_indicies_start + len(indicies), + window_indicies_start, window_indicies_start + len(indices) ) - window_indicies_start += len(indicies) + window_indicies_start += len(indices) # Extend as we'll be slicing window like [start, end) window_indicies = np.append( window_indicies, [window_indicies[-1] + 1] ).astype(np.int64) - start_arrays.append(window_indicies.take(start)) - end_arrays.append(window_indicies.take(end)) + start_arrays.append(window_indicies.take(ensure_platform_int(start))) + end_arrays.append(window_indicies.take(ensure_platform_int(end))) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) # GH 35552: Need to adjust start and end based on the nans appended to values diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 5d35ec7457ab0..aec294c3c84c2 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -57,7 +57,7 @@ def generate_numba_apply_func( @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_apply( - values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int ) -> np.ndarray: result = np.empty(len(begin)) for i in loop_range(len(result)): diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 966773b7c6982..a3f60c0bc5098 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -6,13 +6,23 @@ from functools import partial import inspect from textwrap import dedent -from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union +from typing import ( + TYPE_CHECKING, + Callable, + Dict, + List, + Optional, + Set, + Tuple, + Type, + Union, +) import numpy as np from pandas._libs.tslibs import BaseOffset, to_offset import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import ArrayLike, Axis, FrameOrSeries, Scalar +from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -34,11 +44,12 @@ ABCSeries, ABCTimedeltaIndex, ) +from pandas.core.dtypes.missing import notna from pandas.core.base import DataError, PandasObject, SelectionMixin, ShallowMixin import pandas.core.common as com from pandas.core.construction import extract_array -from pandas.core.indexes.api import Index, MultiIndex, ensure_index +from pandas.core.indexes.api import Index, MultiIndex from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba from pandas.core.window.common import ( WindowGroupByMixin, @@ -55,6 +66,10 @@ ) from pandas.core.window.numba_ import generate_numba_apply_func +if TYPE_CHECKING: + from pandas import DataFrame, Series + from pandas.core.internals import Block # noqa:F401 + def calculate_center_offset(window) -> int: """ @@ -219,7 +234,7 @@ def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: f"get_window_bounds" ) - def _create_blocks(self, obj: FrameOrSeries): + def _create_blocks(self, obj: FrameOrSeriesUnion): """ Split data into blocks & return conformed data. 
""" @@ -363,77 +378,40 @@ def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: return values - def _wrap_result(self, result, block=None, obj=None): - """ - Wrap a single result. - """ - if obj is None: - obj = self._selected_obj - index = obj.index - - if isinstance(result, np.ndarray): - - if result.ndim == 1: - from pandas import Series - - return Series(result, index, name=obj.name) - - return type(obj)(result, index=index, columns=block.columns) - return result - - def _wrap_results(self, results, blocks, obj, exclude=None) -> FrameOrSeries: + def _wrap_result(self, result: np.ndarray) -> "Series": """ - Wrap the results. - - Parameters - ---------- - results : list of ndarrays - blocks : list of blocks - obj : conformed data (may be resampled) - exclude: list of columns to exclude, default to None + Wrap a single 1D result. """ - from pandas import Series, concat + obj = self._selected_obj - final = [] - for result, block in zip(results, blocks): + return obj._constructor(result, obj.index, name=obj.name) - result = self._wrap_result(result, block=block, obj=obj) - if result.ndim == 1: - return result - final.append(result) + def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"): + # if we have an 'on' column we want to put it back into + # the results in the same location + from pandas import Series - # if we have an 'on' column - # we want to put it back into the results - # in the same location - columns = self._selected_obj.columns if self.on is not None and not self._on.equals(obj.index): - name = self._on.name - final.append(Series(self._on, index=obj.index, name=name)) - - if self._selection is not None: - - selection = ensure_index(self._selection) - - # need to reorder to include original location of - # the on column (if its not already there) - if name not in selection: - columns = self.obj.columns - indexer = columns.get_indexer(selection.tolist() + [name]) - columns = columns.take(sorted(indexer)) - - # exclude nuisance columns so that they are not reindexed - if exclude is not None and exclude: - columns = [c for c in columns if c not in exclude] - - if not columns: - raise DataError("No numeric types to aggregate") - - if not len(final): - return obj.astype("float64") - return concat(final, axis=1).reindex(columns=columns, copy=False) + extra_col = Series(self._on, index=obj.index, name=name) + if name in result.columns: + # TODO: sure we want to overwrite results? + result[name] = extra_col + elif name in result.index.names: + pass + elif name in self._selected_obj.columns: + # insert in the same location as we had in _selected_obj + old_cols = self._selected_obj.columns + new_cols = result.columns + old_loc = old_cols.get_loc(name) + overlap = new_cols.intersection(old_cols[:old_loc]) + new_loc = len(overlap) + result.insert(new_loc, name, extra_col) + else: + # insert at the end + result[name] = extra_col - def _center_window(self, result, window) -> np.ndarray: + def _center_window(self, result: np.ndarray, window) -> np.ndarray: """ Center the result in the window. 
""" @@ -487,37 +465,62 @@ def _get_window_indexer(self, window: int) -> BaseIndexer: return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) return FixedWindowIndexer(window_size=window) + def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series": + """ + Series version of _apply_blockwise + """ + _, obj = self._create_blocks(self._selected_obj) + + try: + values = self._prep_values(obj.values) + except (TypeError, NotImplementedError) as err: + raise DataError("No numeric types to aggregate") from err + + result = homogeneous_func(values) + return obj._constructor(result, index=obj.index, name=obj.name) + def _apply_blockwise( self, homogeneous_func: Callable[..., ArrayLike] - ) -> FrameOrSeries: + ) -> FrameOrSeriesUnion: """ Apply the given function to the DataFrame broken down into homogeneous sub-frames. """ + if self._selected_obj.ndim == 1: + return self._apply_series(homogeneous_func) + # This isn't quite blockwise, since `blocks` is actually a collection # of homogenenous DataFrames. - blocks, obj = self._create_blocks(self._selected_obj) + _, obj = self._create_blocks(self._selected_obj) + mgr = obj._mgr + + def hfunc(bvalues: ArrayLike) -> ArrayLike: + # TODO(EA2D): getattr unnecessary with 2D EAs + values = self._prep_values(getattr(bvalues, "T", bvalues)) + res_values = homogeneous_func(values) + return getattr(res_values, "T", res_values) skipped: List[int] = [] - results: List[ArrayLike] = [] - exclude: List[Scalar] = [] - for i, b in enumerate(blocks): + res_blocks: List["Block"] = [] + for i, blk in enumerate(mgr.blocks): try: - values = self._prep_values(b.values) + nbs = blk.apply(hfunc) - except (TypeError, NotImplementedError) as err: - if isinstance(obj, ABCDataFrame): - skipped.append(i) - exclude.extend(b.columns) - continue - else: - raise DataError("No numeric types to aggregate") from err + except (TypeError, NotImplementedError): + skipped.append(i) + continue - result = homogeneous_func(values) - results.append(result) + res_blocks.extend(nbs) - block_list = [blk for i, blk in enumerate(blocks) if i not in skipped] - return self._wrap_results(results, block_list, obj, exclude) + if not len(res_blocks) and skipped: + raise DataError("No numeric types to aggregate") + elif not len(res_blocks): + return obj.astype("float64") + + new_mgr = mgr._combine(res_blocks) + out = obj._constructor(new_mgr) + self._insert_on_column(out, obj) + return out def _apply( self, @@ -1278,21 +1281,29 @@ def count(self): # implementations shouldn't end up here assert not isinstance(self.window, BaseIndexer) - blocks, obj = self._create_blocks(self._selected_obj) - results = [] - for b in blocks: - result = b.notna().astype(int) + _, obj = self._create_blocks(self._selected_obj) + + def hfunc(values: np.ndarray) -> np.ndarray: + result = notna(values) + result = result.astype(int) + frame = type(obj)(result.T) result = self._constructor( - result, + frame, window=self._get_window(), min_periods=self.min_periods or 0, center=self.center, axis=self.axis, closed=self.closed, ).sum() - results.append(result) + return result.values.T - return self._wrap_results(results, blocks, obj) + new_mgr = obj._mgr.apply(hfunc) + out = obj._constructor(new_mgr) + if obj.ndim == 1: + out.name = obj.name + else: + self._insert_on_column(out, obj) + return out _shared_docs["apply"] = dedent( r""" @@ -2070,7 +2081,7 @@ def count(self): @Substitution(name="rolling") @Appender(_shared_docs["apply"]) def apply( - self, func, raw=False, engine=None, 
engine_kwargs=None, args=None, kwargs=None, + self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None ): return super().apply( func, @@ -2236,7 +2247,7 @@ def _apply( def _constructor(self): return Rolling - def _create_blocks(self, obj: FrameOrSeries): + def _create_blocks(self, obj: FrameOrSeriesUnion): """ Split data into blocks & return conformed data. """ @@ -2277,6 +2288,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: if isinstance(self.window, BaseIndexer): rolling_indexer = type(self.window) indexer_kwargs = self.window.__dict__ + assert isinstance(indexer_kwargs, dict) # for mypy # We'll be using the index of each group later indexer_kwargs.pop("index_array", None) elif self.is_freq_type: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index b1bbda4a4b7e0..ead36c95556b1 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,11 +3,12 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Union +from typing import Any, Mapping, Union from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES +from pandas._typing import StorageOptions from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -199,6 +200,15 @@ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. +storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. + + .. 
versionadded:: 1.2.0 Returns ------- @@ -298,10 +308,11 @@ def read_excel( skipfooter=0, convert_float=True, mangle_dupe_cols=True, + storage_options: StorageOptions = None, ): if not isinstance(io, ExcelFile): - io = ExcelFile(io, engine=engine) + io = ExcelFile(io, storage_options=storage_options, engine=engine) elif engine and engine != io.engine: raise ValueError( "Engine should not be specified when passing " @@ -336,12 +347,14 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer, storage_options=storage_options + ) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer @@ -640,7 +653,6 @@ def __new__(cls, path, engine=None, **kwargs): return object.__new__(cls) # declare external properties you can count on - book = None curr_sheet = None path = None @@ -837,14 +849,16 @@ class ExcelFile: from pandas.io.excel._pyxlsb import _PyxlsbReader from pandas.io.excel._xlrd import _XlrdReader - _engines = { + _engines: Mapping[str, Any] = { "xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader, "pyxlsb": _PyxlsbReader, } - def __init__(self, path_or_buffer, engine=None): + def __init__( + self, path_or_buffer, engine=None, storage_options: StorageOptions = None + ): if engine is None: engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): @@ -858,13 +872,14 @@ def __init__(self, path_or_buffer, engine=None): raise ValueError(f"Unknown engine: {engine}") self.engine = engine + self.storage_options = storage_options # Could be a str, ExcelFile, Book, etc. self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io) + self._reader = self._engines[engine](self._io, storage_options=storage_options) def __fspath__(self): return self._io diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 44abaf5d3b3c9..6cbca59aed97e 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -16,13 +16,19 @@ class _ODFReader(_BaseExcelReader): Parameters ---------- - filepath_or_buffer: string, path to be parsed or + filepath_or_buffer : string, path to be parsed or an open readable stream. 
+ storage_options : dict, optional + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ - def __init__(self, filepath_or_buffer: FilePathOrBuffer): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ): import_optional_dependency("odf") - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 0131240f99cf6..f39391ae1fe7f 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -25,7 +25,7 @@ def __init__( super().__init__(path, mode=mode, **engine_kwargs) - self.book: OpenDocumentSpreadsheet = OpenDocumentSpreadsheet() + self.book = OpenDocumentSpreadsheet() self._style_dict: Dict[str, str] = {} def save(self) -> None: @@ -42,7 +42,7 @@ def write_cells( sheet_name: Optional[str] = None, startrow: int = 0, startcol: int = 0, - freeze_panes: Optional[List] = None, + freeze_panes: Optional[Tuple[int, int]] = None, ) -> None: """ Write the frame cells using odf @@ -215,14 +215,17 @@ def _process_style(self, style: Dict[str, Any]) -> str: self.book.styles.addElement(odf_style) return name - def _create_freeze_panes(self, sheet_name: str, freeze_panes: List[int]) -> None: - """Create freeze panes in the sheet + def _create_freeze_panes( + self, sheet_name: str, freeze_panes: Tuple[int, int] + ) -> None: + """ + Create freeze panes in the sheet. Parameters ---------- sheet_name : str Name of the spreadsheet - freeze_panes : list + freeze_panes : tuple of (int, int) Freeze pane location x and y """ from odf.config import ( diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 03a30cbd62f9a..c2730536af8a3 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ExcelWriter, _BaseExcelReader @@ -467,7 +467,11 @@ def write_cells( class _OpenpyxlReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ) -> None: """ Reader using openpyxl engine. @@ -475,9 +479,11 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. 
+ storage_options : dict, optional + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 0d96c8c4acdb8..c15a52abe4d53 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,25 +1,31 @@ from typing import List -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _PyxlsbReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ): """ Reader using pyxlsb engine. Parameters ---------- - filepath_or_buffer: str, path object, or Workbook + filepath_or_buffer : str, path object, or Workbook Object to be parsed. + storage_options : dict, optional + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer # And set the result to the book-attribute - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index af82c15fd6b66..a7fb519af61c6 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -2,13 +2,14 @@ import numpy as np +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _XlrdReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): """ Reader using xlrd engine. @@ -16,10 +17,12 @@ def __init__(self, filepath_or_buffer): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. + storage_options : dict, optional + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" import_optional_dependency("xlrd", extra=err_msg) - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 85a1bb031f457..bdbb006ae93dc 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -1,3 +1,5 @@ +from typing import Dict, List, Tuple + import pandas._libs.json as json from pandas.io.excel._base import ExcelWriter @@ -8,7 +10,7 @@ class _XlsxStyler: # Map from openpyxl-oriented styles to flatter xlsxwriter representation # Ordering necessary for both determinism and because some are keyed by # prefixes of others. - STYLE_MAPPING = { + STYLE_MAPPING: Dict[str, List[Tuple[Tuple[str, ...], str]]] = { "font": [ (("name",), "font_name"), (("sz",), "font_size"), @@ -170,7 +172,7 @@ def __init__( **engine_kwargs, ): # Use the xlsxwriter module as the Excel writer. 
- import xlsxwriter + from xlsxwriter import Workbook if mode == "a": raise ValueError("Append mode is not supported with xlsxwriter!") @@ -184,7 +186,7 @@ def __init__( **engine_kwargs, ) - self.book = xlsxwriter.Workbook(path, **engine_kwargs) + self.book = Workbook(path, **engine_kwargs) def save(self): """ diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 2c664e73b9463..fb606b5ec8aef 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,5 +1,6 @@ """ feather-format compat """ +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas import DataFrame, Int64Index, RangeIndex @@ -7,7 +8,7 @@ from pandas.io.common import get_filepath_or_buffer -def to_feather(df: DataFrame, path, storage_options=None, **kwargs): +def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -15,14 +16,13 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): ---------- df : DataFrame path : string file path, or file-like object - storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc., if using a URL that will be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -77,7 +77,9 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True, storage_options=None): +def read_feather( + path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None +): """ Load a feather-format object from the file path. @@ -103,6 +105,15 @@ def read_feather(path, columns=None, use_threads: bool = True, storage_options=N Whether to parallelize reading using multiple threads. .. versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. + + .. 
versionadded:: 1.2.0 Returns ------- diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index b40d2a57b8106..4d6f03489725f 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -20,9 +20,7 @@ def expand(self, prop, value: str): try: mapping = self.SIDE_SHORTHANDS[len(tokens)] except KeyError: - warnings.warn( - f'Could not expand "{prop}: {value}"', CSSWarning, - ) + warnings.warn(f'Could not expand "{prop}: {value}"', CSSWarning) return for key, idx in zip(self.SIDES, mapping): yield prop_fmt.format(key), tokens[idx] @@ -117,10 +115,7 @@ def __call__(self, declarations_str, inherited=None): props[prop] = self.size_to_pt( props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS ) - for prop in [ - f"margin-{side}", - f"padding-{side}", - ]: + for prop in [f"margin-{side}", f"padding-{side}"]: if prop in props: # TODO: support % props[prop] = self.size_to_pt( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 81990b3d505e1..461ef6823918e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -80,7 +80,7 @@ FloatFormatType = Union[str, Callable, "EngFormatter"] ColspaceType = Mapping[Label, Union[str, int]] ColspaceArgType = Union[ - str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]], + str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]] ] common_docstring = """ @@ -741,7 +741,7 @@ def _to_str_columns(self) -> List[List[str]]: for i, c in enumerate(frame): fmt_values = self._format_col(i) fmt_values = _make_fixed_width( - fmt_values, self.justify, minimum=col_space.get(c, 0), adj=self.adj, + fmt_values, self.justify, minimum=col_space.get(c, 0), adj=self.adj ) stringified.append(fmt_values) else: @@ -1069,7 +1069,7 @@ def _get_formatted_index(self, frame: "DataFrame") -> List[str]: fmt_index = [ tuple( _make_fixed_width( - list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj, + list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj ) ) for x in fmt_index diff --git a/pandas/io/orc.py b/pandas/io/orc.py index ea79efd0579e5..b556732e4d116 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -12,7 +12,7 @@ def read_orc( - path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs, + path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs ) -> "DataFrame": """ Load an ORC object from the file path, returning a DataFrame. 
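
Note: the storage_options keyword threaded through the Excel (and, just above, Feather) readers is passed straight to fsspec for remote URLs. A usage sketch, assuming pandas >= 1.2 with an Excel engine and s3fs installed; the bucket, file names, and credentials here are hypothetical.

    import pandas as pd

    # Extra options go to fsspec untouched; combining them with a local path
    # or an open file object raises an error instead.
    opts = {"key": "ACCESS_KEY", "secret": "SECRET_KEY"}
    excel_df = pd.read_excel("s3://my-bucket/report.xlsx", storage_options=opts)
    feather_df = pd.read_feather("s3://my-bucket/data.feather", storage_options=opts)
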
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5d49757ce7d58..983aa56324083 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,7 +20,7 @@ import pandas._libs.parsers as parsers from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing -from pandas._typing import FilePathOrBuffer, Union +from pandas._typing import FilePathOrBuffer, StorageOptions, Union from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -596,7 +596,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, - storage_options=None, + storage_options: StorageOptions = None, ): # gh-23761 # diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 53ef97bbe9a72..b33daf39de37c 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -294,7 +294,7 @@ def maybe_color_bp(bp, **kwds): def plot_group(keys, values, ax): keys = [pprint_thing(x) for x in keys] - values = [np.asarray(remove_na_arraylike(v)) for v in values] + values = [np.asarray(remove_na_arraylike(v), dtype=object) for v in values] bp = ax.boxplot(values, **kwds) if fontsize is not None: ax.tick_params(axis="both", labelsize=fontsize) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 8f2080658e63e..3db7c38eced65 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -1,7 +1,8 @@ import contextlib import datetime as pydt -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo import functools +from typing import Any, List, Optional, Tuple from dateutil.relativedelta import relativedelta import matplotlib.dates as dates @@ -143,7 +144,7 @@ def convert(value, unit, axis): return value @staticmethod - def axisinfo(unit, axis): + def axisinfo(unit, axis) -> Optional[units.AxisInfo]: if unit != "time": return None @@ -152,7 +153,7 @@ def axisinfo(unit, axis): return units.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") @staticmethod - def default_units(x, axis): + def default_units(x, axis) -> str: return "time" @@ -293,7 +294,7 @@ def try_parse(values): return values @staticmethod - def axisinfo(unit, axis): + def axisinfo(unit: Optional[tzinfo], axis) -> units.AxisInfo: """ Return the :class:`~matplotlib.units.AxisInfo` for *unit*. @@ -421,7 +422,7 @@ def autoscale(self): return self.nonsingular(vmin, vmax) -def _from_ordinal(x, tz=None): +def _from_ordinal(x, tz: Optional[tzinfo] = None) -> datetime: ix = int(x) dt = datetime.fromordinal(ix) remainder = float(x) - ix @@ -450,7 +451,7 @@ def _from_ordinal(x, tz=None): # ------------------------------------------------------------------------- -def _get_default_annual_spacing(nyears): +def _get_default_annual_spacing(nyears) -> Tuple[int, int]: """ Returns a default spacing between consecutive ticks for annual data. """ @@ -472,7 +473,7 @@ def _get_default_annual_spacing(nyears): return (min_spacing, maj_spacing) -def period_break(dates, period): +def period_break(dates: PeriodIndex, period: str) -> np.ndarray: """ Returns the indices where the given period changes. @@ -488,7 +489,7 @@ def period_break(dates, period): return np.nonzero(current - previous)[0] -def has_level_label(label_flags, vmin): +def has_level_label(label_flags: np.ndarray, vmin: float) -> bool: """ Returns true if the ``label_flags`` indicate there is at least one label for this level. 
@@ -983,18 +984,24 @@ class TimeSeries_DateFormatter(Formatter): ---------- freq : {int, string} Valid frequency specifier. - minor_locator : {False, True} + minor_locator : bool, default False Whether the current formatter should apply to minor ticks (True) or major ticks (False). - dynamic_mode : {True, False} + dynamic_mode : bool, default True Whether the formatter works in dynamic mode or not. """ - def __init__(self, freq, minor_locator=False, dynamic_mode=True, plot_obj=None): + def __init__( + self, + freq, + minor_locator: bool = False, + dynamic_mode: bool = True, + plot_obj=None, + ): freq = to_offset(freq) self.format = None self.freq = freq - self.locs = [] + self.locs: List[Any] = [] # unused, for matplotlib compat self.formatdict = None self.isminor = minor_locator self.isdynamic = dynamic_mode diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index b490e07e43753..4d23a5e5fc249 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,5 +1,5 @@ import re -from typing import List, Optional +from typing import TYPE_CHECKING, List, Optional import warnings from matplotlib.artist import Artist @@ -43,6 +43,9 @@ table, ) +if TYPE_CHECKING: + from matplotlib.axes import Axes + class MPLPlot: """ @@ -1147,7 +1150,7 @@ def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): return lines @classmethod - def _ts_plot(cls, ax, x, data, style=None, **kwds): + def _ts_plot(cls, ax: "Axes", x, data, style=None, **kwds): from pandas.plotting._matplotlib.timeseries import ( _decorate_axes, _maybe_resample, diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index eef4276f0ed09..fd89a093d25a4 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -24,14 +24,15 @@ from pandas.tseries.frequencies import get_period_alias, is_subperiod, is_superperiod if TYPE_CHECKING: - from pandas import Index, Series # noqa:F401 + from matplotlib.axes import Axes + from pandas import Index, Series # noqa:F401 # --------------------------------------------------------------------- # Plotting functions and monkey patches -def _maybe_resample(series: "Series", ax, kwargs): +def _maybe_resample(series: "Series", ax: "Axes", kwargs): # resample against axes freq if necessary freq, ax_freq = _get_freq(ax, series) @@ -62,19 +63,19 @@ def _maybe_resample(series: "Series", ax, kwargs): return freq, series -def _is_sub(f1, f2): +def _is_sub(f1: str, f2: str) -> bool: return (f1.startswith("W") and is_subperiod("D", f2)) or ( f2.startswith("W") and is_subperiod(f1, "D") ) -def _is_sup(f1, f2): +def _is_sup(f1: str, f2: str) -> bool: return (f1.startswith("W") and is_superperiod("D", f2)) or ( f2.startswith("W") and is_superperiod(f1, "D") ) -def _upsample_others(ax, freq, kwargs): +def _upsample_others(ax: "Axes", freq, kwargs): legend = ax.get_legend() lines, labels = _replot_ax(ax, freq, kwargs) _replot_ax(ax, freq, kwargs) @@ -97,7 +98,7 @@ def _upsample_others(ax, freq, kwargs): ax.legend(lines, labels, loc="best", title=title) -def _replot_ax(ax, freq, kwargs): +def _replot_ax(ax: "Axes", freq, kwargs): data = getattr(ax, "_plot_data", None) # clear current axes and data @@ -127,7 +128,7 @@ def _replot_ax(ax, freq, kwargs): return lines, labels -def _decorate_axes(ax, freq, kwargs): +def _decorate_axes(ax: "Axes", freq, kwargs): """Initialize axes for time-series plotting""" if not hasattr(ax, "_plot_data"): 
ax._plot_data = [] @@ -143,7 +144,7 @@ def _decorate_axes(ax, freq, kwargs): ax.date_axis_info = None -def _get_ax_freq(ax): +def _get_ax_freq(ax: "Axes"): """ Get the freq attribute of the ax object if set. Also checks shared axes (eg when using secondary yaxis, sharex=True @@ -174,7 +175,7 @@ def _get_period_alias(freq) -> Optional[str]: return freq -def _get_freq(ax, series: "Series"): +def _get_freq(ax: "Axes", series: "Series"): # get frequency from data freq = getattr(series.index, "freq", None) if freq is None: @@ -192,7 +193,7 @@ def _get_freq(ax, series: "Series"): return freq, ax_freq -def _use_dynamic_x(ax, data: "FrameOrSeriesUnion") -> bool: +def _use_dynamic_x(ax: "Axes", data: FrameOrSeriesUnion) -> bool: freq = _get_index_freq(data.index) ax_freq = _get_ax_freq(ax) @@ -234,7 +235,7 @@ def _get_index_freq(index: "Index") -> Optional[BaseOffset]: return freq -def _maybe_convert_index(ax, data): +def _maybe_convert_index(ax: "Axes", data): # tsplot converts automatically, but don't want to convert index # over and over for DataFrames if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)): @@ -264,7 +265,7 @@ def _maybe_convert_index(ax, data): # Do we need the rest for convenience? -def _format_coord(freq, t, y): +def _format_coord(freq, t, y) -> str: time_period = Period(ordinal=int(t), freq=freq) return f"t = {time_period} y = {y:8f}" diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index caf2f27de9276..4d643ffb734e4 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -1,18 +1,27 @@ # being a bit too dynamic from math import ceil +from typing import TYPE_CHECKING, Iterable, List, Sequence, Tuple, Union import warnings import matplotlib.table import matplotlib.ticker as ticker import numpy as np +from pandas._typing import FrameOrSeries + from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.plotting._matplotlib import compat +if TYPE_CHECKING: + from matplotlib.axes import Axes + from matplotlib.axis import Axis + from matplotlib.lines import Line2D # noqa:F401 + from matplotlib.table import Table + -def format_date_labels(ax, rot): +def format_date_labels(ax: "Axes", rot): # mini version of autofmt_xdate for label in ax.get_xticklabels(): label.set_ha("right") @@ -21,7 +30,7 @@ def format_date_labels(ax, rot): fig.subplots_adjust(bottom=0.2) -def table(ax, data, rowLabels=None, colLabels=None, **kwargs): +def table(ax, data: FrameOrSeries, rowLabels=None, colLabels=None, **kwargs) -> "Table": if isinstance(data, ABCSeries): data = data.to_frame() elif isinstance(data, ABCDataFrame): @@ -43,7 +52,7 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): return table -def _get_layout(nplots, layout=None, layout_type="box"): +def _get_layout(nplots: int, layout=None, layout_type: str = "box") -> Tuple[int, int]: if layout is not None: if not isinstance(layout, (tuple, list)) or len(layout) != 2: raise ValueError("Layout must be a tuple of (rows, columns)") @@ -92,14 +101,14 @@ def _get_layout(nplots, layout=None, layout_type="box"): def _subplots( - naxes=None, - sharex=False, - sharey=False, - squeeze=True, + naxes: int, + sharex: bool = False, + sharey: bool = False, + squeeze: bool = True, subplot_kw=None, ax=None, layout=None, - layout_type="box", + layout_type: str = "box", **fig_kw, ): """ @@ -272,7 +281,7 @@ def _subplots( return fig, axes -def _remove_labels_from_axis(axis): +def 
_remove_labels_from_axis(axis: "Axis"): for t in axis.get_majorticklabels(): t.set_visible(False) @@ -288,7 +297,15 @@ def _remove_labels_from_axis(axis): axis.get_label().set_visible(False) -def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): +def _handle_shared_axes( + axarr: Iterable["Axes"], + nplots: int, + naxes: int, + nrows: int, + ncols: int, + sharex: bool, + sharey: bool, +): if nplots > 1: if compat._mpl_ge_3_2_0(): row_num = lambda x: x.get_subplotspec().rowspan.start @@ -334,7 +351,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): _remove_labels_from_axis(ax.yaxis) -def _flatten(axes): +def _flatten(axes: Union["Axes", Sequence["Axes"]]) -> Sequence["Axes"]: if not is_list_like(axes): return np.array([axes]) elif isinstance(axes, (np.ndarray, ABCIndexClass)): @@ -342,7 +359,13 @@ def _flatten(axes): return np.array(axes) -def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): +def _set_ticks_props( + axes: Union["Axes", Sequence["Axes"]], + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, +): import matplotlib.pyplot as plt for ax in _flatten(axes): @@ -357,7 +380,7 @@ def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=Non return axes -def _get_all_lines(ax): +def _get_all_lines(ax: "Axes") -> List["Line2D"]: lines = ax.get_lines() if hasattr(ax, "right_ax"): @@ -369,7 +392,7 @@ def _get_all_lines(ax): return lines -def _get_xlim(lines): +def _get_xlim(lines: Iterable["Line2D"]) -> Tuple[float, float]: left, right = np.inf, -np.inf for l in lines: x = l.get_xdata(orig=False) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index b9219f9f833de..bbfaacae1b444 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -348,6 +348,12 @@ def test_fillna_frame(self, data_missing): # Non-scalar "scalar" values. 
super().test_fillna_frame(data_missing) + @pytest.mark.skip("Invalid test") + def test_fillna_fill_other(self, data): + # inplace update doesn't work correctly with patched extension arrays + # extract_array returns PandasArray, while dtype is a numpy dtype + super().test_fillna_fill_other(data_missing) + class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): @pytest.mark.skip("Incorrect parent test") diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 538978358c8e7..5a1e448beb40f 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1541,3 +1541,12 @@ def func(row): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df, result) + + +def test_apply_empty_list_reduce(): + # GH#35683 get columns correct + df = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], columns=["a", "b"]) + + result = df.apply(lambda x: [], result_type="reduce") + expected = pd.Series({"a": [], "b": []}, dtype=object) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 3c9d79397e4bd..6b86a13fcf1b9 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -38,6 +38,7 @@ def test_interp_empty(self): # https://github.com/pandas-dev/pandas/issues/35598 df = DataFrame() result = df.interpolate() + assert result is not df expected = df tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 8603bff0587b6..83dfd42ae2a6e 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1581,3 +1581,10 @@ def test_replace_with_compiled_regex(self): result = df.replace({regex: "z"}, regex=True) expected = pd.DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) + + def test_replace_intervals(self): + # https://github.com/pandas-dev/pandas/issues/35931 + df = pd.DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + expected = pd.DataFrame({"a": ["x", "x"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2fb1f7f911a9c..0716cf5e27119 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -6,6 +6,7 @@ import numpy as np import pytest +import pandas.util._test_decorators as td from pandas.util._test_decorators import async_mark, skip_if_no import pandas as pd @@ -521,6 +522,7 @@ def _check_f(base, f): _check_f(d.copy(), f) @async_mark() + @td.check_file_leaks async def test_tab_complete_warning(self, ip): # GH 16409 pytest.importorskip("IPython", minversion="6.0.0") diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c9fec3215d57f..00cfa6265934f 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -626,3 +626,35 @@ def test_add_column_with_pandas_array(self): assert type(df["c"]._mgr.blocks[0]) == ObjectBlock assert type(df2["c"]._mgr.blocks[0]) == ObjectBlock tm.assert_frame_equal(df, df2) + + +def test_to_dict_of_blocks_item_cache(): + # Calling to_dict_of_blocks should not poison item_cache + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 
3], dtype=object)) + mgr = df._mgr + assert len(mgr.blocks) == 3 # i.e. not consolidated + + ser = df["b"] # populations item_cache["b"] + + df._to_dict_of_blocks() + + # Check that the to_dict_of_blocks didnt break link between ser and df + ser.values[0] = "foo" + assert df.loc[0, "b"] == "foo" + + assert df["b"] is ser + + +def test_update_inplace_sets_valid_block_values(): + # https://github.com/pandas-dev/pandas/issues/33457 + df = pd.DataFrame({"a": pd.Series([1, 2, None], dtype="category")}) + + # inplace update of a single column + df["a"].fillna(1, inplace=True) + + # check we havent put a Series into any block.values + assert isinstance(df._mgr.blocks[0].values, pd.Categorical) + + # smoketest for OP bug from GH#35731 + assert df.isnull().sum().sum() == 0 diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 628b955a1de92..56d178daee7fd 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -160,6 +160,13 @@ def test_eval_resolvers_as_list(self): assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] + def test_eval_object_dtype_binop(self): + # GH#24883 + df = pd.DataFrame({"a1": ["Y", "N"]}) + res = df.eval("c = ((a1 == 'Y') & True)") + expected = pd.DataFrame({"a1": ["Y", "N"], "c": [True, False]}) + tm.assert_frame_equal(res, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ce9d4b892d775..8fe450fe6abfc 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1063,6 +1063,85 @@ def test_groupby_get_by_index(): pd.testing.assert_frame_equal(res, expected) +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}), + ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}), + ({"nr": "min"}, {"nr": [1, 5]}), + ], +) +def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): + # test single aggregations on ordered categorical cols GHGH27800 + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + expected_df = pd.DataFrame(data=exp_data, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]), + ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]), + ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]), + ], +) +def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): + # test combined aggregations on ordered categorical cols GH27800 + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": 
"category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + # unpack the grp_col_dict to create the multi-index tuple + # this tuple will be used to create the expected dataframe index + multi_index_list = [] + for k, v in grp_col_dict.items(): + if isinstance(v, list): + for value in v: + multi_index_list.append([k, value]) + else: + multi_index_list.append([k, v]) + multi_index = pd.MultiIndex.from_tuples(tuple(multi_index_list)) + + expected_df = pd.DataFrame(data=exp_data, columns=multi_index, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + def test_nonagg_agg(): # GH 35490 - Single/Multiple agg of non-agg function give same results # TODO: agg should raise for functions that don't aggregate diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 690694b0e66f5..29e65e938f6f9 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -4,7 +4,7 @@ from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td -from pandas import DataFrame, option_context +from pandas import DataFrame, NamedAgg, option_context import pandas._testing as tm from pandas.core.util.numba_ import NUMBA_FUNC_CACHE @@ -128,3 +128,25 @@ def func_1(values, index): with option_context("compute.use_numba", True): result = grouped.agg(func_1, engine=None) tm.assert_frame_equal(expected, result) + + +@td.skip_if_no("numba", "0.46.0") +@pytest.mark.parametrize( + "agg_func", + [ + ["min", "max"], + "min", + {"B": ["min", "max"], "C": "sum"}, + NamedAgg(column="B", aggfunc="min"), + ], +) +def test_multifunc_notimplimented(agg_func): + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + ) + grouped = data.groupby(0) + with pytest.raises(NotImplementedError, match="Numba engine can"): + grouped.agg(agg_func, engine="numba") + + with pytest.raises(NotImplementedError, match="Numba engine can"): + grouped[1].agg(agg_func, engine="numba") diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ee38722ffb8ce..a1dcb28a32c6c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -861,13 +861,14 @@ def test_apply_multi_level_name(category): b = [1, 2] * 5 if category: b = pd.Categorical(b, categories=[1, 2, 3]) + expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B") + else: + expected_index = pd.Index([1, 2], name="B") df = pd.DataFrame( {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) result = df.groupby("B").apply(lambda x: x.sum()) - expected = pd.DataFrame( - {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B") - ) + expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8c51ebf89f5c0..c743058c988b4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2069,3 +2069,45 @@ def test_group_on_two_row_multiindex_returns_one_tuple_key(): assert len(result) == 1 key = (1, 2) assert (result[key] == expected[key]).all() + + 
+@pytest.mark.parametrize( + "klass, attr, value", + [ + (DataFrame, "axis", 1), + (DataFrame, "level", "a"), + (DataFrame, "as_index", False), + (DataFrame, "sort", False), + (DataFrame, "group_keys", False), + (DataFrame, "squeeze", True), + (DataFrame, "observed", True), + (DataFrame, "dropna", False), + pytest.param( + Series, + "axis", + 1, + marks=pytest.mark.xfail( + reason="GH 35443: Attribute currently not passed on to series" + ), + ), + (Series, "level", "a"), + (Series, "as_index", False), + (Series, "sort", False), + (Series, "group_keys", False), + (Series, "squeeze", True), + (Series, "observed", True), + (Series, "dropna", False), + ], +) +@pytest.mark.filterwarnings( + "ignore:The `squeeze` parameter is deprecated:FutureWarning" +) +def test_subsetting_columns_keeps_attrs(klass, attr, value): + # GH 9959 - When subsetting columns, don't drop attributes + df = pd.DataFrame({"a": [1], "b": [2], "c": [3]}) + if attr != "axis": + df = df.set_index("a") + + expected = df.groupby("a", **{attr: value}) + result = expected[["b"]] if klass is DataFrame else expected["b"] + assert getattr(result, attr) == getattr(expected, attr) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 98f7c0eadb4bb..e95e7267f17ec 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1,5 +1,5 @@ import gc -from typing import Optional, Type +from typing import Type import numpy as np import pytest @@ -33,7 +33,7 @@ class Base: """ base class for index sub-class tests """ - _holder: Optional[Type[Index]] = None + _holder: Type[Index] _compat_props = ["shape", "ndim", "size", "nbytes"] def create_index(self) -> Index: @@ -270,7 +270,7 @@ def test_copy_name(self, index): s3 = s1 * s2 assert s3.index.name == "mario" - def test_name2(self, index): + def test_copy_name2(self, index): # gh-35592 if isinstance(index, MultiIndex): return @@ -284,6 +284,11 @@ def test_name2(self, index): with pytest.raises(TypeError, match=msg): index.copy(name=[["mario"]]) + def test_copy_dtype_deprecated(self, index): + # GH35853 + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + index.copy(dtype=object) + def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured # GH12309 @@ -681,6 +686,12 @@ def test_format(self): expected = [str(x) for x in idx] assert idx.format() == expected + def test_format_empty(self): + # GH35712 + empty_idx = self._holder([]) + assert empty_idx.format() == [] + assert empty_idx.format(name=True) == [""] + def test_hasnans_isnans(self, index): # GH 11343, added tests for hasnans / isnans if isinstance(index, MultiIndex): diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 6670b079ddd29..f19e78323ab23 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -470,6 +470,13 @@ def test_intersection_bug(self): tm.assert_index_equal(result, b) assert result.freq == b.freq + def test_intersection_list(self): + # GH#35876 + values = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")] + idx = pd.DatetimeIndex(values, name="a") + res = idx.intersection(values) + tm.assert_index_equal(res, idx) + def test_month_range_union_tz_pytz(self, sort): from pytz import timezone diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 473e370c76f8b..508bd2f566507 100644 --- 
a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -228,6 +228,12 @@ def test_take_fill_value_ints(self, klass): class TestContains: + @pytest.mark.parametrize("klass", [Float64Index, Int64Index, UInt64Index]) + def test_contains_none(self, klass): + # GH#35788 should return False, not raise TypeError + index = klass([0, 1, 2, 3, 4]) + assert None not in index + def test_contains_float64_nans(self): index = Float64Index([1.0, 2.0, np.nan]) assert np.nan in index diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 15a88ab3819ce..085d41aaa5b76 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -536,6 +536,12 @@ def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key): with pytest.raises(KeyError, match=msg): df.loc[key] + def test_format_empty(self): + # GH35712 + empty_idx = self._holder([], freq="A") + assert empty_idx.format() == [] + assert empty_idx.format(name=True) == [""] + def test_maybe_convert_timedelta(): pi = PeriodIndex(["2000", "2001"], freq="D") diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index c4c242746e92c..172cd4a106ac1 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -171,8 +171,14 @@ def test_cache(self): pass assert idx._cache == {} + idx.format() + assert idx._cache == {} + df = pd.DataFrame({"a": range(10)}, index=idx) + str(df) + assert idx._cache == {} + df.loc[50] assert idx._cache == {} @@ -515,3 +521,9 @@ def test_engineless_lookup(self): idx.get_loc("a") assert "_engine" not in idx._cache + + def test_format_empty(self): + # GH35712 + empty_idx = self._holder(0) + assert empty_idx.format() == [] + assert empty_idx.format(name=True) == [""] diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 70eb9e502f78a..aee4b16621b4d 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -62,11 +62,6 @@ def test_new_axis(self, index): assert new_index.ndim == 2 assert isinstance(new_index, np.ndarray) - @pytest.mark.parametrize("index", ["int", "uint", "float"], indirect=True) - def test_copy_and_deepcopy(self, index): - new_copy2 = index.copy(dtype=int) - assert new_copy2.dtype.kind == "i" - def test_constructor_regular(self, index): tm.assert_contains_all(index, index) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 02a173eb4958d..db260b71e7186 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -374,8 +374,7 @@ def test_has_duplicates(self, index): "dtype", ["int64", "uint64", "float64", "category", "datetime64[ns]", "timedelta64[ns]"], ) - @pytest.mark.parametrize("copy", [True, False]) - def test_astype_preserves_name(self, index, dtype, copy): + def test_astype_preserves_name(self, index, dtype): # https://github.com/pandas-dev/pandas/issues/32013 if isinstance(index, MultiIndex): index.names = ["idx" + str(i) for i in range(index.nlevels)] @@ -384,10 +383,7 @@ def test_astype_preserves_name(self, index, dtype, copy): try: # Some of these conversions cannot succeed so we use a try / except - if copy: - result = index.copy(dtype=dtype) - else: - result = index.astype(dtype) + result = index.astype(dtype) except (ValueError, TypeError, NotImplementedError, SystemError): return diff --git 
a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index bfcac5d433d2c..e6f455e60eee3 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -394,7 +394,7 @@ def test_identical(self): same_values_different_type = Index(i, dtype=object) assert not i.identical(same_values_different_type) - i = index.copy(dtype=object) + i = index.astype(dtype=object) i = i.rename("foo") same_values = Index(i, dtype=object) assert same_values.identical(i) @@ -402,7 +402,7 @@ def test_identical(self): assert not i.identical(index) assert Index(same_values, name="foo", dtype=object).identical(i) - assert not index.copy(dtype=object).identical(index.copy(dtype=self._dtype)) + assert not index.astype(dtype=object).identical(index.astype(dtype=self._dtype)) def test_union_noncomparable(self): # corner case, non-Int64Index diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index fcee25c258efa..193baa8c3ed74 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,4 +1,7 @@ import os +import shlex +import subprocess +import time import pytest @@ -31,10 +34,65 @@ def feather_file(datapath): @pytest.fixture -def s3_resource(tips_file, jsonl_file, feather_file): +def s3so(worker_id): + worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw") + return dict(client_kwargs={"endpoint_url": f"http://127.0.0.1:555{worker_id}/"}) + + +@pytest.fixture(scope="session") +def s3_base(worker_id): """ Fixture for mocking S3 interaction. + Sets up moto server in separate process + """ + pytest.importorskip("s3fs") + pytest.importorskip("boto3") + requests = pytest.importorskip("requests") + + with tm.ensure_safe_environment_variables(): + # temporary workaround as moto fails for botocore >= 1.11 otherwise, + # see https://github.com/spulec/moto/issues/1924 & 1952 + os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") + os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") + + pytest.importorskip("moto", minversion="1.3.14") + pytest.importorskip("flask") # server mode needs flask too + + # Launching moto in server mode, i.e., as a separate process + # with an S3 endpoint on localhost + + worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw") + endpoint_port = f"555{worker_id}" + endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" + + # pipe to null to avoid logging in terminal + proc = subprocess.Popen( + shlex.split(f"moto_server s3 -p {endpoint_port}"), stdout=subprocess.DEVNULL + ) + + timeout = 5 + while timeout > 0: + try: + # OK to go once server is accepting connections + r = requests.get(endpoint_uri) + if r.ok: + break + except Exception: + pass + timeout -= 0.1 + time.sleep(0.1) + yield endpoint_uri + + proc.terminate() + proc.wait() + + +@pytest.fixture() +def s3_resource(s3_base, tips_file, jsonl_file, feather_file): + """ + Sets up S3 bucket with contents + The primary bucket name is "pandas-test". The following datasets are loaded. @@ -46,45 +104,58 @@ def s3_resource(tips_file, jsonl_file, feather_file): A private bucket "cant_get_it" is also created. The boto3 s3 resource is yielded by the fixture. 
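Stripped of the pytest plumbing, the ``s3_base``/``s3so`` machinery above amounts to something like the following sketch; the port, bucket name, and dummy credentials are illustrative, and ``moto_server`` needs the server extra of moto (hence the new flask dependency):

    import shlex
    import subprocess
    import time

    import boto3

    # Run moto in server mode: a local HTTP endpoint backed by an in-memory S3.
    proc = subprocess.Popen(
        shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL
    )
    endpoint = "http://127.0.0.1:5555/"
    time.sleep(1)  # the real fixture polls the endpoint until it responds

    # moto accepts any credentials, so dummy values are fine.
    client = boto3.client(
        "s3",
        region_name="us-east-1",
        endpoint_url=endpoint,
        aws_access_key_id="foobar_key",
        aws_secret_access_key="foobar_secret",
    )
    client.create_bucket(Bucket="pandas-test")

    # Tests then reach the same endpoint through fsspec via storage_options.
    s3so = {"client_kwargs": {"endpoint_url": endpoint}}

    proc.terminate()
    proc.wait()
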
""" - s3fs = pytest.importorskip("s3fs") - boto3 = pytest.importorskip("boto3") - - with tm.ensure_safe_environment_variables(): - # temporary workaround as moto fails for botocore >= 1.11 otherwise, - # see https://github.com/spulec/moto/issues/1924 & 1952 - os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") - os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - - moto = pytest.importorskip("moto") - - test_s3_files = [ - ("tips#1.csv", tips_file), - ("tips.csv", tips_file), - ("tips.csv.gz", tips_file + ".gz"), - ("tips.csv.bz2", tips_file + ".bz2"), - ("items.jsonl", jsonl_file), - ("simple_dataset.feather", feather_file), - ] - - def add_tips_files(bucket_name): - for s3_key, file_name in test_s3_files: - with open(file_name, "rb") as f: - conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) - - try: - s3 = moto.mock_s3() - s3.start() - - # see gh-16135 - bucket = "pandas-test" - conn = boto3.resource("s3", region_name="us-east-1") - - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) - - conn.create_bucket(Bucket="cant_get_it", ACL="private") - add_tips_files("cant_get_it") - s3fs.S3FileSystem.clear_instance_cache() - yield conn - finally: - s3.stop() + import boto3 + import s3fs + + test_s3_files = [ + ("tips#1.csv", tips_file), + ("tips.csv", tips_file), + ("tips.csv.gz", tips_file + ".gz"), + ("tips.csv.bz2", tips_file + ".bz2"), + ("items.jsonl", jsonl_file), + ("simple_dataset.feather", feather_file), + ] + + def add_tips_files(bucket_name): + for s3_key, file_name in test_s3_files: + with open(file_name, "rb") as f: + cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f) + + bucket = "pandas-test" + conn = boto3.resource("s3", endpoint_url=s3_base) + cli = boto3.client("s3", endpoint_url=s3_base) + + try: + cli.create_bucket(Bucket=bucket) + except: # noqa + # OK is bucket already exists + pass + try: + cli.create_bucket(Bucket="cant_get_it", ACL="private") + except: # noqa + # OK is bucket already exists + pass + timeout = 2 + while not cli.list_buckets()["Buckets"] and timeout > 0: + time.sleep(0.1) + timeout -= 0.1 + + add_tips_files(bucket) + add_tips_files("cant_get_it") + s3fs.S3FileSystem.clear_instance_cache() + yield conn + + s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base}) + + try: + s3.rm(bucket, recursive=True) + except: # noqa + pass + try: + s3.rm("cant_get_it", recursive=True) + except: # noqa + pass + timeout = 2 + while cli.list_buckets()["Buckets"] and timeout > 0: + time.sleep(0.1) + timeout -= 0.1 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 51fbbf836a03f..431a50477fccc 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -606,13 +606,14 @@ def test_read_from_http_url(self, read_ext): tm.assert_frame_equal(url_table, local_table) @td.skip_if_not_us_locale - def test_read_from_s3_url(self, read_ext, s3_resource): + def test_read_from_s3_url(self, read_ext, s3_resource, s3so): # Bucket "pandas-test" created in tests/io/conftest.py with open("test1" + read_ext, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) url = "s3://pandas-test/test1" + read_ext - url_table = pd.read_excel(url) + + url_table = pd.read_excel(url, storage_options=s3so) local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 84805d06df4a8..1bbfe4d7d74af 100644 --- 
a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -19,6 +19,7 @@ import pytz from pandas.compat import is_platform_32bit, is_platform_windows +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -3338,6 +3339,7 @@ def test_format_percentiles_integer_idx(): assert result == expected +@td.check_file_leaks def test_repr_html_ipython_config(ip): code = textwrap.dedent( """\ diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 182c21ed1d416..c0e3220454bf1 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -34,7 +34,7 @@ def test_read_zipped_json(datapath): @td.skip_if_not_us_locale -def test_with_s3_url(compression, s3_resource): +def test_with_s3_url(compression, s3_resource, s3so): # Bucket "pandas-test" created in tests/io/conftest.py df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') @@ -44,7 +44,9 @@ def test_with_s3_url(compression, s3_resource): with open(path, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) - roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression) + roundtripped_df = pd.read_json( + "s3://pandas-test/test-1", compression=compression, storage_options=s3so, + ) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1280d0fd434d5..2022abbaee323 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1213,10 +1213,12 @@ def test_read_inline_jsonl(self): tm.assert_frame_equal(result, expected) @td.skip_if_not_us_locale - def test_read_s3_jsonl(self, s3_resource): + def test_read_s3_jsonl(self, s3_resource, s3so): # GH17200 - result = read_json("s3n://pandas-test/items.jsonl", lines=True) + result = read_json( + "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so + ) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -1700,13 +1702,15 @@ def test_json_multiindex(self, dataframe, expected): result = series.to_json(orient="index") assert result == expected - def test_to_s3(self, s3_resource): + def test_to_s3(self, s3_resource, s3so): import time # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) - df.to_json(f"s3://{mock_bucket_name}/{target_file}") + df.to_json( + f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so, + ) timeout = 5 while True: if target_file in ( diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index b30a7b1ef34de..b8b03cbd14a1d 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -71,50 +71,62 @@ def tips_df(datapath): @td.skip_if_not_us_locale() class TestS3: @td.skip_if_no("s3fs") - def test_parse_public_s3_bucket(self, tips_df): + def test_parse_public_s3_bucket(self, tips_df, s3so): # more of an integration test due to the not-public contents portion # can probably mock this though. 
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp) + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + compression=comp, + storage_options=s3so, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents - df = read_csv("s3://cant_get_it/tips.csv") + df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3n_bucket(self, tips_df): + def test_parse_public_s3n_bucket(self, tips_df, s3so): # Read from AWS s3 as "s3n" URL - df = read_csv("s3n://pandas-test/tips.csv", nrows=10) + df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3a_bucket(self, tips_df): + def test_parse_public_s3a_bucket(self, tips_df, s3so): # Read from AWS s3 as "s3a" URL - df = read_csv("s3a://pandas-test/tips.csv", nrows=10) + df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self, tips_df): + def test_parse_public_s3_bucket_nrows(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp) + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + nrows=10, + compression=comp, + storage_options=s3so, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self, tips_df): + def test_parse_public_s3_bucket_chunked(self, tips_df, s3so): # Read with a chunksize chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df_reader = read_csv( - "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp + "s3://pandas-test/tips.csv" + ext, + chunksize=chunksize, + compression=comp, + storage_options=s3so, ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -126,7 +138,7 @@ def test_parse_public_s3_bucket_chunked(self, tips_df): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self, tips_df): + def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): # Read with a chunksize using the Python parser chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: @@ -135,6 +147,7 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): chunksize=chunksize, compression=comp, engine="python", + storage_options=s3so, ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -145,46 +158,53 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self, tips_df): + def test_parse_public_s3_bucket_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression=comp, + storage_options=s3so, ) assert 
isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self, tips_df): + def test_infer_s3_compression(self, tips_df, s3so): for ext in ["", ".gz", ".bz2"]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer" + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression="infer", + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self, tips_df): + def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( "s3://pandas-test/tips.csv" + ext, engine="python", nrows=10, compression=comp, + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_read_s3_fails(self): + def test_read_s3_fails(self, s3so): with pytest.raises(IOError): - read_csv("s3://nyqpug/asdf.csv") + read_csv("s3://nyqpug/asdf.csv", storage_options=s3so) # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. with pytest.raises(IOError): read_csv("s3://cant_get_it/file.csv") - def test_write_s3_csv_fails(self, tips_df): + def test_write_s3_csv_fails(self, tips_df, s3so): # GH 32486 # Attempting to write to an invalid S3 path should raise import botocore @@ -195,10 +215,12 @@ def test_write_s3_csv_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_csv("s3://an_s3_bucket_data_doesnt_exit/not_real.csv") + tips_df.to_csv( + "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so + ) @td.skip_if_no("pyarrow") - def test_write_s3_parquet_fails(self, tips_df): + def test_write_s3_parquet_fails(self, tips_df, s3so): # GH 27679 # Attempting to write to an invalid S3 path should raise import botocore @@ -209,7 +231,10 @@ def test_write_s3_parquet_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_parquet("s3://an_s3_bucket_data_doesnt_exit/not_real.parquet") + tips_df.to_parquet( + "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", + storage_options=s3so, + ) def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -225,7 +250,7 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): expected = read_csv(tips_file) tm.assert_frame_equal(result, expected) - def test_read_csv_chunked_download(self, s3_resource, caplog): + def test_read_csv_chunked_download(self, s3_resource, caplog, s3so): # 8 MB, S3FS usees 5MB chunks import s3fs @@ -245,18 +270,20 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): s3fs.S3FileSystem.clear_instance_cache() with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv("s3://pandas-test/large-file.csv", nrows=5) + read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so) # log of fetch_range (start, stop) assert (0, 5505024) in (x.args[-2:] for x in caplog.records) - def test_read_s3_with_hash_in_key(self, tips_df): + def test_read_s3_with_hash_in_key(self, tips_df, s3so): # GH 25945 - result = read_csv("s3://pandas-test/tips#1.csv") + result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so) tm.assert_frame_equal(tips_df, 
result) @td.skip_if_no("pyarrow") - def test_read_feather_s3_file_path(self, feather_file): + def test_read_feather_s3_file_path(self, feather_file, s3so): # GH 29055 expected = read_feather(feather_file) - res = read_feather("s3://pandas-test/simple_dataset.feather") + res = read_feather( + "s3://pandas-test/simple_dataset.feather", storage_options=s3so + ) tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 3e89f6ca4ae16..666da677d702e 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -131,27 +131,38 @@ def test_fastparquet_options(fsspectest): @td.skip_if_no("s3fs") -def test_from_s3_csv(s3_resource, tips_file): - tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) +def test_from_s3_csv(s3_resource, tips_file, s3so): + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file) + ) # the following are decompressed by pandas, not fsspec - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file)) - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file)) + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv.gz", storage_options=s3so), + read_csv(tips_file), + ) + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv.bz2", storage_options=s3so), + read_csv(tips_file), + ) @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"]) @td.skip_if_no("s3fs") -def test_s3_protocols(s3_resource, tips_file, protocol): +def test_s3_protocols(s3_resource, tips_file, protocol, s3so): tm.assert_equal( - read_csv("%s://pandas-test/tips.csv" % protocol), read_csv(tips_file) + read_csv("%s://pandas-test/tips.csv" % protocol, storage_options=s3so), + read_csv(tips_file), ) @td.skip_if_no("s3fs") @td.skip_if_no("fastparquet") -def test_s3_parquet(s3_resource): +def test_s3_parquet(s3_resource, s3so): fn = "s3://pandas-test/test.parquet" - df1.to_parquet(fn, index=False, engine="fastparquet", compression=None) - df2 = read_parquet(fn, engine="fastparquet") + df1.to_parquet( + fn, index=False, engine="fastparquet", compression=None, storage_options=s3so + ) + df2 = read_parquet(fn, engine="fastparquet", storage_options=s3so) tm.assert_equal(df1, df2) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 82157f3d722a9..15f9837176315 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -537,9 +537,11 @@ def test_categorical(self, pa): expected = df.astype(object) check_round_trip(df, pa, expected=expected) - def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): + def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): s3fs = pytest.importorskip("s3fs") - s3 = s3fs.S3FileSystem() + if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"): + pytest.skip() + s3 = s3fs.S3FileSystem(**s3so) kw = dict(filesystem=s3) check_round_trip( df_compat, @@ -549,27 +551,51 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): write_kwargs=kw, ) - def test_s3_roundtrip(self, df_compat, s3_resource, pa): + def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so): + if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"): + pytest.skip() # GH #19134 - check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") + s3so = dict(storage_options=s3so) + check_round_trip( + df_compat, + pa, + path="s3://pandas-test/pyarrow.parquet", + read_kwargs=s3so, + write_kwargs=s3so, + ) 
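The two pyarrow round-trips above are the same operation expressed two ways: either an s3fs filesystem is built from ``s3so`` and handed to pyarrow explicitly, or pandas constructs it internally from ``storage_options``. A rough sketch mirroring the two tests, assuming the moto endpoint and "pandas-test" bucket from the fixtures are available:

    import pandas as pd
    import s3fs

    s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}}
    df = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})

    # Explicit filesystem, as in test_s3_roundtrip_explicit_fs.
    fs = s3fs.S3FileSystem(**s3so)
    df.to_parquet("pandas-test/pyarrow.parquet", engine="pyarrow", filesystem=fs)

    # Filesystem created by pandas from storage_options, as in test_s3_roundtrip.
    df.to_parquet(
        "s3://pandas-test/pyarrow.parquet", engine="pyarrow", storage_options=s3so
    )
    back = pd.read_parquet(
        "s3://pandas-test/pyarrow.parquet", engine="pyarrow", storage_options=s3so
    )
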
@td.skip_if_no("s3fs") @pytest.mark.parametrize("partition_col", [["A"], []]) - def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): + def test_s3_roundtrip_for_dir( + self, df_compat, s3_resource, pa, partition_col, s3so + ): # GH #26388 - # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716 - # As per pyarrow partitioned columns become 'categorical' dtypes - # and are added to back of dataframe on read - expected_df = df_compat.copy() - if partition_col: - expected_df[partition_col] = expected_df[partition_col].astype("category") + + # GH #35791 + # read_table uses the new Arrow Datasets API since pyarrow 1.0.0 + # Previous behaviour was pyarrow partitioned columns become 'category' dtypes + # These are added to back of dataframe on read. In new API category dtype is + # only used if partition field is string. + legacy_read_table = LooseVersion(pyarrow.__version__) < LooseVersion("1.0.0") + if partition_col and legacy_read_table: + partition_col_type = "category" + else: + partition_col_type = "int32" + + expected_df[partition_col] = expected_df[partition_col].astype( + partition_col_type + ) + check_round_trip( df_compat, pa, expected=expected_df, path="s3://pandas-test/parquet_dir", - write_kwargs={"partition_cols": partition_col, "compression": None}, + read_kwargs=dict(storage_options=s3so), + write_kwargs=dict( + partition_cols=partition_col, compression=None, storage_options=s3so + ), check_like=True, repeat=1, ) @@ -743,9 +769,15 @@ def test_filter_row_groups(self, fp): result = read_parquet(path, fp, filters=[("a", "==", 0)]) assert len(result) == 1 - def test_s3_roundtrip(self, df_compat, s3_resource, fp): + def test_s3_roundtrip(self, df_compat, s3_resource, fp, s3so): # GH #19134 - check_round_trip(df_compat, fp, path="s3://pandas-test/fastparquet.parquet") + check_round_trip( + df_compat, + fp, + path="s3://pandas-test/fastparquet.parquet", + read_kwargs=dict(storage_options=s3so), + write_kwargs=dict(compression=None, storage_options=s3so), + ) def test_partition_cols_supported(self, fp, df_full): # GH #23283 diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index b36b11582c1ec..73bf7dafac254 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -3,10 +3,11 @@ import numpy as np import pytest +import pandas.util._test_decorators as td from pandas.util._test_decorators import async_mark import pandas as pd -from pandas import DataFrame, Series, Timestamp, compat +from pandas import DataFrame, Series, Timestamp import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -17,6 +18,7 @@ @async_mark() +@td.check_file_leaks async def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter @@ -317,7 +319,6 @@ def test_resample_groupby_with_label(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(not compat.IS64, reason="GH-35148") def test_consistency_with_window(): # consistent return values with window diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 26e429c47b494..f638706207679 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -287,3 +287,65 @@ def test_upsample_sum(method, method_args, expected_values): result = methodcaller(method, **method_args)(resampled) expected = pd.Series(expected_values, 
index=index) tm.assert_series_equal(result, expected) + + +def test_groupby_resample_interpolate(): + # GH 35325 + d = {"price": [10, 11, 9], "volume": [50, 60, 50]} + + df = pd.DataFrame(d) + + df["week_starting"] = pd.date_range("01/01/2018", periods=3, freq="W") + + result = ( + df.set_index("week_starting") + .groupby("volume") + .resample("1D") + .interpolate(method="linear") + ) + expected_ind = pd.MultiIndex.from_tuples( + [ + (50, "2018-01-07"), + (50, pd.Timestamp("2018-01-08")), + (50, pd.Timestamp("2018-01-09")), + (50, pd.Timestamp("2018-01-10")), + (50, pd.Timestamp("2018-01-11")), + (50, pd.Timestamp("2018-01-12")), + (50, pd.Timestamp("2018-01-13")), + (50, pd.Timestamp("2018-01-14")), + (50, pd.Timestamp("2018-01-15")), + (50, pd.Timestamp("2018-01-16")), + (50, pd.Timestamp("2018-01-17")), + (50, pd.Timestamp("2018-01-18")), + (50, pd.Timestamp("2018-01-19")), + (50, pd.Timestamp("2018-01-20")), + (50, pd.Timestamp("2018-01-21")), + (60, pd.Timestamp("2018-01-14")), + ], + names=["volume", "week_starting"], + ) + expected = pd.DataFrame( + data={ + "price": [ + 10.0, + 9.928571428571429, + 9.857142857142858, + 9.785714285714286, + 9.714285714285714, + 9.642857142857142, + 9.571428571428571, + 9.5, + 9.428571428571429, + 9.357142857142858, + 9.285714285714286, + 9.214285714285714, + 9.142857142857142, + 9.071428571428571, + 9.0, + 11.0, + ], + "volume": [50.0] * 15 + [60], + }, + index=expected_ind, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 0fbb60c176b30..3fa85e62d028c 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -150,3 +150,18 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq): tm.assert_index_equal(result.index, expected_index) assert result.index.freq == expected_index.freq assert not np.isnan(result[-1]) + + +def test_resample_with_timedelta_yields_no_empty_groups(): + # GH 10603 + df = pd.DataFrame( + np.random.normal(size=(10000, 4)), + index=pd.timedelta_range(start="0s", periods=10000, freq="3906250n"), + ) + result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x)) + + expected = pd.DataFrame( + [[768.0] * 4] * 12 + [[528.0] * 4], + index=pd.timedelta_range(start="1s", periods=13, freq="3s"), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 8795af2e11122..6f5550a6f8209 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -698,3 +698,48 @@ def test_margin_normalize(self): names=["A", "B"], ) tm.assert_frame_equal(result, expected) + + def test_margin_normalize_multiple_columns(self): + # GH 35144 + # use multiple columns with margins and normalization + df = DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + result = crosstab( + index=df.C, + columns=[df.A, df.B], + margins=True, + margins_name="margin", + normalize=True, + ) + expected = DataFrame( + [ + [0.111111, 0.111111, 0.222222, 0.000000, 0.444444], + [0.111111, 0.111111, 0.111111, 0.222222, 0.555556], + [0.222222, 0.222222, 0.333333, 0.222222, 1.0], + ], + index=["large", "small", 
"margin"], + ) + expected.columns = MultiIndex( + levels=[["bar", "foo", "margin"], ["", "one", "two"]], + codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.index.name = "C" + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/interval/test_arithmetic.py b/pandas/tests/scalar/interval/test_arithmetic.py index 5252f1a4d5a24..b4c2b448e252a 100644 --- a/pandas/tests/scalar/interval/test_arithmetic.py +++ b/pandas/tests/scalar/interval/test_arithmetic.py @@ -45,3 +45,15 @@ def test_numeric_interval_add_timedelta_raises(interval, delta): with pytest.raises((TypeError, ValueError), match=msg): delta + interval + + +@pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta]) +def test_timdelta_add_timestamp_interval(klass): + delta = klass(0) + expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01")) + + result = delta + expected + assert result == expected + + result = expected + delta + assert result == expected diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index a0151bb9ac7bf..8ad9a2c7a9c70 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -2,6 +2,7 @@ import pytest from pandas import Interval, Period, Timedelta, Timestamp +import pandas._testing as tm import pandas.core.common as com @@ -267,3 +268,11 @@ def test_constructor_errors_tz(self, tz_left, tz_right): msg = "left and right must have the same time zone" with pytest.raises(error, match=msg): Interval(left, right) + + def test_equality_comparison_broadcasts_over_array(self): + # https://github.com/pandas-dev/pandas/issues/35931 + interval = Interval(0, 1) + arr = np.array([interval, interval]) + result = interval == arr + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index b174eb0e42776..d81e8a4f82ffb 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -5,6 +5,7 @@ import numpy as np import pytest +import pandas.util._test_decorators as td from pandas.util._test_decorators import async_mark import pandas as pd @@ -486,6 +487,7 @@ def test_empty_method(self): assert not full_series.empty @async_mark() + @td.check_file_leaks async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1dd410ad02ee0..bcf7039ec9039 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1449,3 +1449,18 @@ def test_constructor_datetimelike_scalar_to_string_dtype(self): result = Series("M", index=[1, 2, 3], dtype="string") expected = pd.Series(["M", "M", "M"], index=[1, 2, 3], dtype="string") tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + [np.datetime64("2012-01-01"), np.datetime64("2013-01-01")], + ["2012-01-01", "2013-01-01"], + ], + ) + def test_constructor_sparse_datetime64(self, values): + # https://github.com/pandas-dev/pandas/issues/35762 + dtype = pd.SparseDtype("datetime64[ns]") + result = pd.Series(values, dtype=dtype) + arr = pd.arrays.SparseArray(values, dtype=dtype) + expected = pd.Series(arr) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 
6c6bdb6b1b2bd..67a2dc2303550 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -252,6 +252,19 @@ def test_object_factorize(self, writable): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_numpy_array_equal(uniques, expected_uniques) + def test_datetime64_factorize(self, writable): + # GH35650 Verify whether read-only datetime64 array can be factorized + data = np.array([np.datetime64("2020-01-01T00:00:00.000")]) + data.setflags(write=writable) + expected_codes = np.array([0], dtype=np.int64) + expected_uniques = np.array( + ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]" + ) + + codes, uniques = pd.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index bcfed2d0d3a10..3d45a1f7389b7 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import Series, Timestamp +import pandas._testing as tm from pandas.core import ops import pandas.core.common as com @@ -157,3 +158,12 @@ def test_version_tag(): raise ValueError( "No git tags exist, please sync tags between upstream and your repo" ) + + +@pytest.mark.parametrize( + "obj", [(obj,) for obj in pd.__dict__.values() if callable(obj)] +) +def test_serializable(obj): + # GH 35611 + unpickled = tm.round_trip_pickle(obj) + assert type(obj) == type(unpickled) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 724558bd49ea2..274860b3fdb5c 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1846,7 +1846,7 @@ def test_multilevel_index_loc_order(self, dim, keys, expected): # GH 22797 # Try to respect order of keys given for MultiIndex.loc kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} - df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs,) + df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs) exp_index = MultiIndex.from_arrays(expected) if dim == "index": res = df.loc[keys, :] diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 0d60e6e8a978f..c45e4508c6153 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -285,7 +285,7 @@ def test_nansum(self, skipna): def test_nanmean(self, skipna): self.check_funs( - nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False, + nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False ) def test_nanmean_overflow(self): diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 04e841c05e44a..fe5fc3e21d960 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -25,6 +25,7 @@ # https://github.com/pandas-dev/pandas/issues/35252 "ignore:Distutils:UserWarning" ) +@pytest.mark.filterwarnings("ignore:Setuptools is replacing distutils:UserWarning") def test_show_versions(capsys): # gh-32041 pd.show_versions() diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index a3de8aa69f840..158b994cf03ae 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -95,7 +95,7 @@ def 
test_rolling_apply_consistency( with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning, + "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning ) # test consistency between rolling_xyz() and either (a) # rolling_apply of Series.xyz(), or (b) rolling_apply of @@ -107,7 +107,7 @@ def test_rolling_apply_consistency( functions = no_nan_functions + base_functions for (f, require_min_periods, name) in functions: rolling_f = getattr( - x.rolling(window=window, center=center, min_periods=min_periods), name, + x.rolling(window=window, center=center, min_periods=min_periods), name ) if ( @@ -492,7 +492,7 @@ def test_moment_functions_zero_length_pairwise(): df2["a"] = df2["a"].astype("float64") df1_expected = DataFrame( - index=pd.MultiIndex.from_product([df1.index, df1.columns]), columns=Index([]), + index=pd.MultiIndex.from_product([df1.index, df1.columns]), columns=Index([]) ) df2_expected = DataFrame( index=pd.MultiIndex.from_product( @@ -635,7 +635,7 @@ def test_rolling_consistency(consistency_data, window, min_periods, center): # with empty/0-length Series/DataFrames with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning, + "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning ) # test consistency between different rolling_* moments diff --git a/pandas/tests/window/moments/test_moments_ewm.py b/pandas/tests/window/moments/test_moments_ewm.py index 89d46a8bb6cb5..a83bfabc4a048 100644 --- a/pandas/tests/window/moments/test_moments_ewm.py +++ b/pandas/tests/window/moments/test_moments_ewm.py @@ -73,7 +73,7 @@ def simple_wma(s, w): (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), (s1, False, True, [(1.0 - alpha), np.nan, alpha]), - (s2, True, False, [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan],), + (s2, True, False, [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan]), (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), ( s2, @@ -95,7 +95,7 @@ def simple_wma(s, w): alpha * ((1.0 - alpha) ** 2 + alpha), ], ), - (s3, False, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha],), + (s3, False, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha]), ]: expected = simple_wma(s, Series(w)) result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() diff --git a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index 81f020fe7de23..da256e80dff7e 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -150,14 +150,14 @@ def get_result(obj, window, min_periods=None, center=False): series_xp = ( get_result( - series.reindex(list(series.index) + s), window=25, min_periods=minp, + series.reindex(list(series.index) + s), window=25, min_periods=minp ) .shift(-12) .reindex(series.index) ) frame_xp = ( get_result( - frame.reindex(list(frame.index) + s), window=25, min_periods=minp, + frame.reindex(list(frame.index) + s), window=25, min_periods=minp ) .shift(-12) .reindex(frame.index) @@ -169,14 +169,14 @@ def get_result(obj, window, min_periods=None, center=False): else: series_xp = ( get_result( - series.reindex(list(series.index) + s), window=25, min_periods=0, + series.reindex(list(series.index) + s), window=25, min_periods=0 ) .shift(-12) .reindex(series.index) ) 
frame_xp = ( get_result( - frame.reindex(list(frame.index) + s), window=25, min_periods=0, + frame.reindex(list(frame.index) + s), window=25, min_periods=0 ) .shift(-12) .reindex(frame.index) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 28e27791cad35..2c3d8b4608806 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -6,7 +6,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Index, Series, Timestamp, compat, concat +from pandas import DataFrame, Index, Series, Timestamp, concat import pandas._testing as tm from pandas.core.base import SpecificationError @@ -277,7 +277,7 @@ def test_preserve_metadata(): @pytest.mark.parametrize( "func,window_size,expected_vals", [ - pytest.param( + ( "rolling", 2, [ @@ -289,7 +289,6 @@ def test_preserve_metadata(): [35.0, 40.0, 60.0, 40.0], [60.0, 80.0, 85.0, 80], ], - marks=pytest.mark.xfail(not compat.IS64, reason="GH-35294"), ), ( "expanding", diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 2aaf6af103e98..bc38634da8941 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -4,7 +4,7 @@ from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, compat, date_range +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range import pandas._testing as tm @@ -142,7 +142,6 @@ def test_invalid_kwargs_nopython(): @pytest.mark.parametrize("args_kwargs", [[None, {"par": 10}], [(10,), None]]) -@pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_apply_args_kwargs(args_kwargs): # GH 33433 def foo(x, par): diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 2300d8dd5529b..ab73e075eed04 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -88,8 +88,8 @@ def get_window_bounds(self, num_values, min_periods, center, closed): @pytest.mark.parametrize( "func,np_func,expected,np_kwargs", [ - ("count", len, [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, np.nan], {},), - ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {},), + ("count", len, [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, np.nan], {}), + ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {}), ( "max", np.max, @@ -204,7 +204,7 @@ def test_rolling_forward_skewness(constructor): @pytest.mark.parametrize( "func,expected", [ - ("cov", [2.0, 2.0, 2.0, 97.0, 2.0, -93.0, 2.0, 2.0, np.nan, np.nan],), + ("cov", [2.0, 2.0, 2.0, 97.0, 2.0, -93.0, 2.0, 2.0, np.nan, np.nan]), ( "corr", [ diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index a9590c7e1233a..170bf100b3891 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Series, compat +from pandas import DataFrame, Series import pandas._testing as tm from pandas.core.groupby.groupby import get_groupby @@ -23,7 +23,6 @@ def test_mutated(self): g = get_groupby(self.frame, by="A", mutated=True) assert g.mutated - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_getitem(self): g = self.frame.groupby("A") g_mutated = get_groupby(self.frame, by="A", mutated=True) @@ -56,7 +55,6 @@ def test_getitem_multiple(self): result = r.B.count() 
tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling(self): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -74,7 +72,6 @@ def test_rolling(self): @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] ) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_quantile(self, interpolation): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -105,7 +102,6 @@ def func(x): expected = g.apply(func) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_apply(self, raw): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -115,7 +111,6 @@ def test_rolling_apply(self, raw): expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_apply_mutability(self): # GH 14013 df = pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) @@ -197,7 +192,6 @@ def test_expanding_apply(self, raw): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("expected_value,raw_value", [[1.0, True], [0.0, False]]) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling(self, expected_value, raw_value): # GH 31754 diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index e82d4b8cbf770..7425cc5df4c2f 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -195,7 +195,7 @@ def test_cov_mulittindex(self): columns = MultiIndex.from_product([list("ab"), list("xy"), list("AB")]) index = range(3) - df = DataFrame(np.arange(24).reshape(3, 8), index=index, columns=columns,) + df = DataFrame(np.arange(24).reshape(3, 8), index=index, columns=columns) result = df.ewm(alpha=0.1).cov() diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index bea239a245a4f..67b20fd2d6daa 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -7,7 +7,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Series, compat, date_range +from pandas import DataFrame, Series, date_range import pandas._testing as tm from pandas.core.window import Rolling @@ -73,7 +73,7 @@ def test_constructor_with_timedelta_window(window): # GH 15440 n = 10 df = DataFrame( - {"value": np.arange(n)}, index=pd.date_range("2015-12-24", periods=n, freq="D"), + {"value": np.arange(n)}, index=pd.date_range("2015-12-24", periods=n, freq="D") ) expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) @@ -92,7 +92,7 @@ def test_constructor_timedelta_window_and_minperiods(window, raw): # GH 15305 n = 10 df = DataFrame( - {"value": np.arange(n)}, index=pd.date_range("2017-08-08", periods=n, freq="D"), + {"value": np.arange(n)}, index=pd.date_range("2017-08-08", periods=n, freq="D") ) expected = DataFrame( {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, @@ -150,11 +150,10 @@ def test_closed_one_entry(func): @pytest.mark.parametrize("func", ["min", "max"]) -@pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_closed_one_entry_groupby(func): # GH24718 ser = pd.DataFrame( - data={"A": [1, 1, 2], "B": [3, 2, 1]}, index=pd.date_range("2000", periods=3), + data={"A": [1, 1, 2], "B": [3, 2, 1]}, index=pd.date_range("2000", periods=3) ) result = getattr( ser.groupby("A", sort=False)["B"].rolling("10D", 
closed="left"), func @@ -183,7 +182,7 @@ def test_closed_one_entry_groupby(func): def test_closed_min_max_datetime(input_dtype, func, closed, expected): # see gh-21704 ser = pd.Series( - data=np.arange(10).astype(input_dtype), index=pd.date_range("2000", periods=10), + data=np.arange(10).astype(input_dtype), index=pd.date_range("2000", periods=10) ) result = getattr(ser.rolling("3D", closed=closed), func)() @@ -683,7 +682,6 @@ def test_iter_rolling_datetime(expected, expected_index, window): ), ], ) -@pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_positional_argument(grouping, _index, raw): # GH 34605 diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 90f919d5565b0..8aa4d7103e48a 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -7,7 +7,6 @@ MultiIndex, Series, Timestamp, - compat, date_range, to_datetime, ) @@ -657,7 +656,6 @@ def agg_by_day(x): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_monotonic(self): # GH 15130 @@ -687,7 +685,6 @@ def test_groupby_monotonic(self): result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_non_monotonic(self): # GH 13966 (similar to #15130, closed by #15175) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index f80ff1a53cd69..8ef6dac2862db 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -548,7 +548,7 @@ def is_superperiod(source, target) -> bool: def _maybe_coerce_freq(code) -> str: - """ we might need to coerce a code to a rule_code + """we might need to coerce a code to a rule_code and uppercase it Parameters diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 8ab37f787bd10..d8a3040919e7b 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -12,7 +12,7 @@ from pandas.tseries.offsets import Day, Easter -def next_monday(dt): +def next_monday(dt: datetime) -> datetime: """ If holiday falls on Saturday, use following Monday instead; if holiday falls on Sunday, use Monday instead @@ -24,7 +24,7 @@ def next_monday(dt): return dt -def next_monday_or_tuesday(dt): +def next_monday_or_tuesday(dt: datetime) -> datetime: """ For second holiday of two adjacent ones! If holiday falls on Saturday, use following Monday instead; @@ -39,7 +39,7 @@ def next_monday_or_tuesday(dt): return dt -def previous_friday(dt): +def previous_friday(dt: datetime) -> datetime: """ If holiday falls on Saturday or Sunday, use previous Friday instead. """ @@ -50,7 +50,7 @@ def previous_friday(dt): return dt -def sunday_to_monday(dt): +def sunday_to_monday(dt: datetime) -> datetime: """ If holiday falls on Sunday, use day thereafter (Monday) instead. """ @@ -59,7 +59,7 @@ def sunday_to_monday(dt): return dt -def weekend_to_monday(dt): +def weekend_to_monday(dt: datetime) -> datetime: """ If holiday falls on Sunday or Saturday, use day thereafter (Monday) instead. @@ -72,7 +72,7 @@ def weekend_to_monday(dt): return dt -def nearest_workday(dt): +def nearest_workday(dt: datetime) -> datetime: """ If holiday falls on Saturday, use day before (Friday) instead; if holiday falls on Sunday, use day thereafter (Monday) instead. 
@@ -84,7 +84,7 @@ def nearest_workday(dt): return dt -def next_workday(dt): +def next_workday(dt: datetime) -> datetime: """ returns next weekday used for observances """ @@ -95,7 +95,7 @@ def next_workday(dt): return dt -def previous_workday(dt): +def previous_workday(dt: datetime) -> datetime: """ returns previous weekday used for observances """ @@ -106,14 +106,14 @@ def previous_workday(dt): return dt -def before_nearest_workday(dt): +def before_nearest_workday(dt: datetime) -> datetime: """ returns previous workday after nearest workday """ return previous_workday(nearest_workday(dt)) -def after_nearest_workday(dt): +def after_nearest_workday(dt: datetime) -> datetime: """ returns next workday after nearest workday needed for Boxing day or multiple holidays in a series @@ -428,9 +428,11 @@ def holidays(self, start=None, end=None, return_name=False): # If we don't have a cache or the dates are outside the prior cache, we # get them again if self._cache is None or start < self._cache[0] or end > self._cache[1]: - holidays = [rule.dates(start, end, return_name=True) for rule in self.rules] - if holidays: - holidays = concat(holidays) + pre_holidays = [ + rule.dates(start, end, return_name=True) for rule in self.rules + ] + if pre_holidays: + holidays = concat(pre_holidays) else: holidays = Series(index=DatetimeIndex([]), dtype=object) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 0dad8c7397e37..ca7b99492bbf7 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -186,10 +186,10 @@ def skip_if_no(package: str, min_version: Optional[str] = None): is_platform_windows(), reason="not used on win32" ) skip_if_has_locale = pytest.mark.skipif( - _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", + _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}" ) skip_if_not_us_locale = pytest.mark.skipif( - _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", + _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}" ) skip_if_no_scipy = pytest.mark.skipif( _skip_if_no_scipy(), reason="Missing SciPy requirement" diff --git a/requirements-dev.txt b/requirements-dev.txt index 66e72641cd5bb..1fca25c9fecd9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # This file is auto-generated from environment.yml, do not modify. # See that file for comments about the need/usage of each dependency. 
-numpy>=1.16.5,<1.19.0 +numpy>=1.16.5 python-dateutil>=2.7.3 pytz asv @@ -15,8 +15,8 @@ isort>=5.2.1 mypy==0.730 pycodestyle gitpython -gitdb2==2.0.6 -sphinx<=3.1.1 +gitdb +sphinx nbconvert>=5.4.1 nbsphinx pandoc @@ -32,6 +32,7 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto +flask pytest>=5.0.1 pytest-cov pytest-xdist>=1.21 @@ -47,7 +48,7 @@ bottleneck>=1.2.1 ipykernel ipython>=7.11.1 jinja2 -matplotlib>=2.2.2,<3.3.0 +matplotlib>=2.2.2 numexpr>=2.6.8 scipy>=1.2 numba>=0.46.0 diff --git a/setup.cfg b/setup.cfg index e4c0b3dcf37ef..c10624d60aaff 100644 --- a/setup.cfg +++ b/setup.cfg @@ -157,9 +157,6 @@ check_untyped_defs=False [mypy-pandas.core.computation.scope] check_untyped_defs=False -[mypy-pandas.core.dtypes.cast] -check_untyped_defs=False - [mypy-pandas.core.frame] check_untyped_defs=False @@ -187,9 +184,6 @@ check_untyped_defs=False [mypy-pandas.core.internals.blocks] check_untyped_defs=False -[mypy-pandas.core.internals.concat] -check_untyped_defs=False - [mypy-pandas.core.internals.construction] check_untyped_defs=False @@ -276,6 +270,3 @@ check_untyped_defs=False [mypy-pandas.plotting._matplotlib.misc] check_untyped_defs=False - -[mypy-pandas.tseries.holiday] -check_untyped_defs=False diff --git a/web/pandas/about/team.md b/web/pandas/about/team.md index 8eb2edebec817..39f63202e1986 100644 --- a/web/pandas/about/team.md +++ b/web/pandas/about/team.md @@ -2,7 +2,7 @@ ## Contributors -_pandas_ is made with love by more than [1,500 volunteer contributors](https://github.com/pandas-dev/pandas/graphs/contributors). +_pandas_ is made with love by more than [2,000 volunteer contributors](https://github.com/pandas-dev/pandas/graphs/contributors). If you want to support pandas development, you can find information in the [donations page](../donate.html). @@ -42,7 +42,7 @@ If you want to support pandas development, you can find information in the [dona > or anyone willing to increase the diversity of our team. > We have identified visible gaps and obstacles in sustaining diversity and inclusion in the open-source communities and we are proactive in increasing > the diversity of our team. -> We have a [code of conduct]({base_url}/community/coc.html) to ensure a friendly and welcoming environment. +> We have a [code of conduct](../community/coc.html) to ensure a friendly and welcoming environment. > Please send an email to [pandas-code-of-conduct-committee](mailto:pandas-coc@googlegroups.com), if you think we can do a > better job at achieving this goal. diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 23575cc123050..9a178d26659c3 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -79,6 +79,13 @@ maintainers: - datapythonista - simonjayhawkins - topper-123 + - alimcmaster1 + - bashtage + - charlesdong1991 + - Dr-Irv + - dsaxton + - MarcoGorelli + - rhshadrach emeritus: - Wouter Overmeire - Skipper Seabold