Skip to content

Commit 3fcdfce

Browse files
committed
Merge remote-tracking branch 'upstream/master' into datetime-type-inference
2 parents 0712b60 + 0db2286 commit 3fcdfce

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+2093
-1237
lines changed

asv_bench/benchmarks/arithmetic.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def time_series_op_with_fill_value_no_nas(self):
6767
self.ser.add(self.ser, fill_value=4)
6868

6969

70-
class MixedFrameWithSeriesAxis0:
70+
class MixedFrameWithSeriesAxis:
7171
params = [
7272
[
7373
"eq",
@@ -78,7 +78,7 @@ class MixedFrameWithSeriesAxis0:
7878
"gt",
7979
"add",
8080
"sub",
81-
"div",
81+
"truediv",
8282
"floordiv",
8383
"mul",
8484
"pow",
@@ -87,15 +87,19 @@ class MixedFrameWithSeriesAxis0:
8787
param_names = ["opname"]
8888

8989
def setup(self, opname):
90-
arr = np.arange(10 ** 6).reshape(100, -1)
90+
arr = np.arange(10 ** 6).reshape(1000, -1)
9191
df = DataFrame(arr)
9292
df["C"] = 1.0
9393
self.df = df
9494
self.ser = df[0]
95+
self.row = df.iloc[0]
9596

9697
def time_frame_op_with_series_axis0(self, opname):
9798
getattr(self.df, opname)(self.ser, axis=0)
9899

100+
def time_frame_op_with_series_axis1(self, opname):
101+
getattr(operator, opname)(self.df, self.ser)
102+
99103

100104
class Ops:
101105

asv_bench/benchmarks/groupby.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,4 +660,62 @@ def function(values):
660660
self.grouper.transform(function, engine="cython")
661661

662662

663+
class AggEngine:
664+
def setup(self):
665+
N = 10 ** 3
666+
data = DataFrame(
667+
{0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
668+
columns=[0, 1],
669+
)
670+
self.grouper = data.groupby(0)
671+
672+
def time_series_numba(self):
673+
def function(values, index):
674+
total = 0
675+
for i, value in enumerate(values):
676+
if i % 2:
677+
total += value + 5
678+
else:
679+
total += value * 2
680+
return total
681+
682+
self.grouper[1].agg(function, engine="numba")
683+
684+
def time_series_cython(self):
685+
def function(values):
686+
total = 0
687+
for i, value in enumerate(values):
688+
if i % 2:
689+
total += value + 5
690+
else:
691+
total += value * 2
692+
return total
693+
694+
self.grouper[1].agg(function, engine="cython")
695+
696+
def time_dataframe_numba(self):
697+
def function(values, index):
698+
total = 0
699+
for i, value in enumerate(values):
700+
if i % 2:
701+
total += value + 5
702+
else:
703+
total += value * 2
704+
return total
705+
706+
self.grouper.agg(function, engine="numba")
707+
708+
def time_dataframe_cython(self):
709+
def function(values):
710+
total = 0
711+
for i, value in enumerate(values):
712+
if i % 2:
713+
total += value + 5
714+
else:
715+
total += value * 2
716+
return total
717+
718+
self.grouper.agg(function, engine="cython")
719+
720+
663721
from .pandas_vb_common import setup # noqa: F401 isort:skip

asv_bench/benchmarks/stat_ops.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ class FrameOps:
1111
param_names = ["op", "dtype", "axis"]
1212

1313
def setup(self, op, dtype, axis):
14-
if op == "mad" and dtype == "Int64" and axis == 1:
15-
# GH-33036
14+
if op == "mad" and dtype == "Int64":
15+
# GH-33036, GH#33600
1616
raise NotImplementedError
1717
values = np.random.randn(100000, 4)
1818
if dtype == "Int64":

doc/source/user_guide/computation.rst

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,8 @@ We provide a number of common statistical functions:
318318
:meth:`~Rolling.kurt`, Sample kurtosis (4th moment)
319319
:meth:`~Rolling.quantile`, Sample quantile (value at %)
320320
:meth:`~Rolling.apply`, Generic apply
321-
:meth:`~Rolling.cov`, Unbiased covariance (binary)
322-
:meth:`~Rolling.corr`, Correlation (binary)
321+
:meth:`~Rolling.cov`, Sample covariance (binary)
322+
:meth:`~Rolling.corr`, Sample correlation (binary)
323323

324324
.. _computation.window_variance.caveats:
325325

@@ -341,6 +341,8 @@ We provide a number of common statistical functions:
341341
sample variance under the circumstances would result in a biased estimator
342342
of the variable we are trying to determine.
343343

344+
The same caveats apply to using any supported statistical sample methods.
345+
344346
.. _stats.rolling_apply:
345347

346348
Rolling apply
@@ -380,8 +382,8 @@ and their default values are set to ``False``, ``True`` and ``False`` respective
380382
.. note::
381383

382384
In terms of performance, **the first time a function is run using the Numba engine will be slow**
383-
as Numba will have some function compilation overhead. However, ``rolling`` objects will cache
384-
the function and subsequent calls will be fast. In general, the Numba engine is performant with
385+
as Numba will have some function compilation overhead. However, the compiled functions are cached,
386+
and subsequent calls will be fast. In general, the Numba engine is performant with
385387
a larger amount of data points (e.g. 1+ million).
386388

387389
.. code-block:: ipython
@@ -870,12 +872,12 @@ Method summary
870872
:meth:`~Expanding.max`, Maximum
871873
:meth:`~Expanding.std`, Sample standard deviation
872874
:meth:`~Expanding.var`, Sample variance
873-
:meth:`~Expanding.skew`, Unbiased skewness (3rd moment)
874-
:meth:`~Expanding.kurt`, Unbiased kurtosis (4th moment)
875+
:meth:`~Expanding.skew`, Sample skewness (3rd moment)
876+
:meth:`~Expanding.kurt`, Sample kurtosis (4th moment)
875877
:meth:`~Expanding.quantile`, Sample quantile (value at %)
876878
:meth:`~Expanding.apply`, Generic apply
877-
:meth:`~Expanding.cov`, Unbiased covariance (binary)
878-
:meth:`~Expanding.corr`, Correlation (binary)
879+
:meth:`~Expanding.cov`, Sample covariance (binary)
880+
:meth:`~Expanding.corr`, Sample correlation (binary)
879881

880882
.. note::
881883

@@ -884,6 +886,8 @@ Method summary
884886
windows. See :ref:`this section <computation.window_variance.caveats>` for more
885887
information.
886888

889+
The same caveats apply to using any supported statistical sample methods.
890+
887891
.. currentmodule:: pandas
888892

889893
Aside from not having a ``window`` parameter, these functions have the same

doc/source/user_guide/groupby.rst

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,73 @@ that is itself a series, and possibly upcast the result to a DataFrame:
10211021
the output as well as set the indices.
10221022

10231023

1024+
Numba Accelerated Routines
1025+
--------------------------
1026+
1027+
.. versionadded:: 1.1
1028+
1029+
If `Numba <https://numba.pydata.org/>`__ is installed as an optional dependency, the ``transform`` and
1030+
``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments. The ``engine_kwargs``
1031+
argument is a dictionary of keyword arguments that will be passed into the
1032+
`numba.jit decorator <https://numba.pydata.org/numba-doc/latest/reference/jit-compilation.html#numba.jit>`__.
1033+
These keyword arguments will be applied to the passed function. Currently only ``nogil``, ``nopython``,
1034+
and ``parallel`` are supported, and their default values are set to ``False``, ``True`` and ``False`` respectively.
1035+
1036+
The function signature must start with ``values, index`` **exactly** as the data belonging to each group
1037+
will be passed into ``values``, and the group index will be passed into ``index``.
1038+
1039+
.. warning::
1040+
1041+
When using ``engine='numba'``, there will be no "fall back" behavior internally. The group
1042+
data and group index will be passed as numpy arrays to the JITed user defined function, and no
1043+
alternative execution attempts will be tried.
1044+
1045+
.. note::
1046+
1047+
In terms of performance, **the first time a function is run using the Numba engine will be slow**
1048+
as Numba will have some function compilation overhead. However, the compiled functions are cached,
1049+
and subsequent calls will be fast. In general, the Numba engine is performant with
1050+
a larger amount of data points (e.g. 1+ million).
1051+
1052+
.. code-block:: ipython
1053+
1054+
In [1]: N = 10 ** 3
1055+
1056+
In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}
1057+
1058+
In [3]: df = pd.DataFrame(data, columns=[0, 1])
1059+
1060+
In [4]: def f_numba(values, index):
1061+
...: total = 0
1062+
...: for i, value in enumerate(values):
1063+
...: if i % 2:
1064+
...: total += value + 5
1065+
...: else:
1066+
...: total += value * 2
1067+
...: return total
1068+
...:
1069+
1070+
In [5]: def f_cython(values):
1071+
...: total = 0
1072+
...: for i, value in enumerate(values):
1073+
...: if i % 2:
1074+
...: total += value + 5
1075+
...: else:
1076+
...: total += value * 2
1077+
...: return total
1078+
...:
1079+
1080+
In [6]: groupby = df.groupby(0)
1081+
# Run the first time, compilation time will affect performance
1082+
In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba') # noqa: E225
1083+
2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1084+
# Function is cached and performance will improve
1085+
In [8]: %timeit groupby.aggregate(f_numba, engine='numba')
1086+
4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1087+
1088+
In [9]: %timeit groupby.aggregate(f_cython, engine='cython')
1089+
18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1090+
10241091
Other useful features
10251092
---------------------
10261093

doc/source/whatsnew/v1.1.0.rst

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ Other enhancements
9898
This can be used to set a custom compression level, e.g.,
9999
``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1})``
100100
(:issue:`33196`)
101-
- :meth:`~pandas.core.groupby.GroupBy.transform` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`)
101+
- :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`)
102102
- :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`)
103103
-
104104

@@ -175,8 +175,8 @@ Other API changes
175175
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
176176
- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
177177
- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
178-
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``skew``, ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
179-
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
178+
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``cov``, ``corr`` will now raise a ``NotImplementedError`` (:issue:`32865`)
179+
- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
180180
- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
181181
-
182182

@@ -191,6 +191,7 @@ Backwards incompatible API changes
191191
Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`)
192192
- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
193193
- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`)
194+
- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`)
194195

195196
``MultiIndex.get_indexer`` interprets `method` argument differently
196197
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -324,6 +325,36 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss
324325
...
325326
KeyError: Timestamp('1970-01-01 00:00:00')
326327
328+
.. _whatsnew_110.api_breaking.indexing_int_multiindex_raises_key_errors:
329+
330+
Failed Integer Lookups on MultiIndex Raise KeyError
331+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
332+
Indexing with integers with a :class:`MultiIndex` that has an integer-dtype
333+
first level incorrectly failed to raise ``KeyError`` when one or more of
334+
those integer keys is not present in the first level of the index (:issue:`33539`)
335+
336+
.. ipython:: python
337+
338+
idx = pd.Index(range(4))
339+
dti = pd.date_range("2000-01-03", periods=3)
340+
mi = pd.MultiIndex.from_product([idx, dti])
341+
ser = pd.Series(range(len(mi)), index=mi)
342+
343+
*Previous behavior*:
344+
345+
.. code-block:: ipython
346+
347+
In [5]: ser[[5]]
348+
Out[5]: Series([], dtype: int64)
349+
350+
*New behavior*:
351+
352+
.. code-block:: ipython
353+
354+
In [5]: ser[[5]]
355+
...
356+
KeyError: '[5] not in index'
357+
327358
:meth:`DataFrame.merge` preserves right frame's row order
328359
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
329360
:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`)
@@ -419,6 +450,7 @@ Performance improvements
419450
- Performance improvement in :class:`Timedelta` constructor (:issue:`30543`)
420451
- Performance improvement in :class:`Timestamp` constructor (:issue:`30543`)
421452
- Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`)
453+
- Performance improvement in arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=1`` (:issue:`33600`)
422454
- The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index,
423455
avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of
424456
existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`)
@@ -501,6 +533,7 @@ Strings
501533
^^^^^^^
502534

503535
- Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`).
536+
- Fixed issue where taking ``min`` or ``max`` of a ``StringArray`` or ``Series`` with ``StringDtype`` type would raise. (:issue:`31746`)
504537
- Bug in :meth:`Series.str.cat` returning ``NaN`` output when other had :class:`Index` type (:issue:`33425`)
505538

506539

@@ -527,6 +560,7 @@ Indexing
527560
- Bug in :class:`Index` constructor where an unhelpful error message was raised for ``numpy`` scalars (:issue:`33017`)
528561
- Bug in :meth:`DataFrame.lookup` incorrectly raising an ``AttributeError`` when ``frame.index`` or ``frame.columns`` is not unique; this will now raise a ``ValueError`` with a helpful error message (:issue:`33041`)
529562
- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`32831`)
563+
- Bug in :class:`Interval` where a :class:`Timedelta` could not be added or subtracted from a :class:`Timestamp` interval (:issue:`32023`)
530564
- Bug in :meth:`DataFrame.copy` where ``_item_cache`` was not invalidated after the copy, causing post-copy value updates to not be reflected (:issue:`31784`)
531565
- Bug in :meth:`Series.__getitem__` with an integer key and a :class:`MultiIndex` with leading integer level failing to raise ``KeyError`` if the key is not present in the first level (:issue:`33355`)
532566
- Bug in :meth:`DataFrame.iloc` when slicing a single-column :class:`DataFrame` with ``ExtensionDtype`` (e.g. ``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`)
@@ -662,6 +696,7 @@ Other
662696
- Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`)
663697
- Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`)
664698
- Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`)
699+
- :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`)
665700

666701
.. ---------------------------------------------------------------------------
667702

pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna):
125125
{{if dtype == 'object'}}
126126
def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
127127
{{else}}
128-
def duplicated_{{dtype}}({{c_type}}[:] values, object keep='first'):
128+
def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'):
129129
{{endif}}
130130
cdef:
131131
int ret = 0

0 commit comments

Comments
 (0)