Skip to content

Commit a989f51

Browse files
committed
Merge remote-tracking branch 'upstream/main' into update-value_counts
2 parents fcc827d + 852518e commit a989f51

File tree

82 files changed

+1373
-589
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+1373
-589
lines changed

.github/workflows/wheels.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ jobs:
8686
activate-environment: test
8787
channels: conda-forge, anaconda
8888
channel-priority: true
89-
mamba-version: "*"
89+
# mamba fails to solve, also we really don't need this since we're just installing python
90+
# mamba-version: "*"
9091

9192
- name: Test wheels (Windows 64-bit only)
9293
if: ${{ matrix.buildplat[1] == 'win_amd64' }}
@@ -154,7 +155,8 @@ jobs:
154155
python-version: '3.8'
155156
channels: conda-forge
156157
channel-priority: true
157-
mamba-version: "*"
158+
# mamba fails to solve, also we really don't need this since we're just installing python
159+
# mamba-version: "*"
158160

159161
- name: Build sdist
160162
run: |

asv_bench/benchmarks/indexing.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import numpy as np
99

1010
from pandas import (
11+
NA,
1112
CategoricalIndex,
1213
DataFrame,
1314
Index,
@@ -83,6 +84,37 @@ def time_loc_slice(self, index, index_structure):
8384
self.data.loc[:800000]
8485

8586

87+
class NumericMaskedIndexing:
88+
monotonic_list = list(range(10**6))
89+
non_monotonic_list = (
90+
list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
91+
)
92+
93+
params = [
94+
("Int64", "UInt64", "Float64"),
95+
(True, False),
96+
]
97+
param_names = ["dtype", "monotonic"]
98+
99+
def setup(self, dtype, monotonic):
100+
101+
indices = {
102+
True: Index(self.monotonic_list, dtype=dtype),
103+
False: Index(self.non_monotonic_list, dtype=dtype).append(
104+
Index([NA], dtype=dtype)
105+
),
106+
}
107+
self.data = indices[monotonic]
108+
self.indexer = np.arange(300, 1_000)
109+
self.data_dups = self.data.append(self.data)
110+
111+
def time_get_indexer(self, dtype, monotonic):
112+
self.data.get_indexer(self.indexer)
113+
114+
def time_get_indexer_dups(self, dtype, monotonic):
115+
self.data.get_indexer_for(self.indexer)
116+
117+
86118
class NonNumericSeriesIndexing:
87119

88120
params = [

asv_bench/benchmarks/indexing_engines.py

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
"""
2-
Benchmarks in this file depend exclusively on code in _libs/
2+
Benchmarks in this file depend mostly on code in _libs/
3+
4+
We have to create masked arrays to test the masked engine though. The
5+
array is unpacked on the Cython level.
36
47
If a PR does not edit anything in _libs, it is very unlikely that benchmarks
58
in this file will be affected.
@@ -9,6 +12,8 @@
912

1013
from pandas._libs import index as libindex
1114

15+
from pandas.core.arrays import BaseMaskedArray
16+
1217

1318
def _get_numeric_engines():
1419
engine_names = [
@@ -30,6 +35,26 @@ def _get_numeric_engines():
3035
]
3136

3237

38+
def _get_masked_engines():
39+
engine_names = [
40+
("MaskedInt64Engine", "Int64"),
41+
("MaskedInt32Engine", "Int32"),
42+
("MaskedInt16Engine", "Int16"),
43+
("MaskedInt8Engine", "Int8"),
44+
("MaskedUInt64Engine", "UInt64"),
45+
("MaskedUInt32Engine", "UInt32"),
46+
("MaskedUInt16engine", "UInt16"),
47+
("MaskedUInt8Engine", "UInt8"),
48+
("MaskedFloat64Engine", "Float64"),
49+
("MaskedFloat32Engine", "Float32"),
50+
]
51+
return [
52+
(getattr(libindex, engine_name), dtype)
53+
for engine_name, dtype in engine_names
54+
if hasattr(libindex, engine_name)
55+
]
56+
57+
3358
class NumericEngineIndexing:
3459

3560
params = [
@@ -80,6 +105,61 @@ def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
80105
self.data.get_loc(self.key_middle)
81106

82107

108+
class MaskedNumericEngineIndexing:
109+
110+
params = [
111+
_get_masked_engines(),
112+
["monotonic_incr", "monotonic_decr", "non_monotonic"],
113+
[True, False],
114+
[10**5, 2 * 10**6], # 2e6 is above SIZE_CUTOFF
115+
]
116+
param_names = ["engine_and_dtype", "index_type", "unique", "N"]
117+
118+
def setup(self, engine_and_dtype, index_type, unique, N):
119+
engine, dtype = engine_and_dtype
120+
121+
if index_type == "monotonic_incr":
122+
if unique:
123+
arr = np.arange(N * 3, dtype=dtype.lower())
124+
else:
125+
values = list([1] * N + [2] * N + [3] * N)
126+
arr = np.array(values, dtype=dtype.lower())
127+
mask = np.zeros(N * 3, dtype=np.bool_)
128+
elif index_type == "monotonic_decr":
129+
if unique:
130+
arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
131+
else:
132+
values = list([1] * N + [2] * N + [3] * N)
133+
arr = np.array(values, dtype=dtype.lower())[::-1]
134+
mask = np.zeros(N * 3, dtype=np.bool_)
135+
else:
136+
assert index_type == "non_monotonic"
137+
if unique:
138+
arr = np.zeros(N * 3, dtype=dtype.lower())
139+
arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
140+
arr[N:] = np.arange(N * 2, dtype=dtype.lower())
141+
142+
else:
143+
arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
144+
mask = np.zeros(N * 3, dtype=np.bool_)
145+
mask[-1] = True
146+
147+
self.data = engine(BaseMaskedArray(arr, mask))
148+
# code below avoids populating the mapping etc. while timing.
149+
self.data.get_loc(2)
150+
151+
self.key_middle = arr[len(arr) // 2]
152+
self.key_early = arr[2]
153+
154+
def time_get_loc(self, engine_and_dtype, index_type, unique, N):
155+
self.data.get_loc(self.key_early)
156+
157+
def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
158+
# searchsorted performance may be different near the middle of a range
159+
# vs near an endpoint
160+
self.data.get_loc(self.key_middle)
161+
162+
83163
class ObjectEngineIndexing:
84164

85165
params = [("monotonic_incr", "monotonic_decr", "non_monotonic")]

doc/source/development/contributing_docstring.rst

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,6 @@ case of pandas, the NumPy docstring convention is followed. These conventions ar
6767
explained in this document:
6868

6969
* `numpydoc docstring guide <https://numpydoc.readthedocs.io/en/latest/format.html>`_
70-
(which is based in the original `Guide to NumPy/SciPy documentation
71-
<https://github.com/numpy/numpy/blob/main/doc/HOWTO_DOCUMENT.rst.txt>`_)
7270

7371
numpydoc is a Sphinx extension to support the NumPy docstring convention.
7472

doc/source/development/maintaining.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,8 @@ which will be triggered when the tag is pushed.
465465

466466
7. Download all wheels from the Anaconda repository where MacPython uploads them:
467467
https://anaconda.org/multibuild-wheels-staging/pandas/files?version=<version>
468-
to the ``dist/`` directory in the local pandas copy.
468+
to the ``dist/`` directory in the local pandas copy. You can use the script
469+
``scripts/download_wheels.sh`` to download all wheels at once.
469470

470471
8. Upload wheels to PyPI:
471472

doc/source/whatsnew/v1.1.4.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Fixed regressions
3131
- Fixed regression in setitem with :meth:`DataFrame.iloc` which raised error when trying to set a value while filtering with a boolean list (:issue:`36741`)
3232
- Fixed regression in setitem with a Series getting aligned before setting the values (:issue:`37427`)
3333
- Fixed regression in :attr:`MultiIndex.is_monotonic_increasing` returning wrong results with ``NaN`` in at least one of the levels (:issue:`37220`)
34-
- Fixed regression in inplace arithmetic operation on a Series not updating the parent DataFrame (:issue:`36373`)
34+
- Fixed regression in inplace arithmetic operation (``+=``) on a Series not updating the parent DataFrame/Series (:issue:`36373`)
3535

3636
.. ---------------------------------------------------------------------------
3737

doc/source/whatsnew/v2.0.0.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -630,13 +630,16 @@ Other API changes
630630
Deprecations
631631
~~~~~~~~~~~~
632632
- Deprecated argument ``infer_datetime_format`` in :func:`to_datetime` and :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
633+
- Deprecated behavior of :func:`to_datetime` with ``unit`` when parsing strings, in a future version these will be parsed as datetimes (matching unit-less behavior) instead of cast to floats. To retain the old behavior, cast strings to numeric types before calling :func:`to_datetime` (:issue:`50735`)
633634
- Deprecated :func:`pandas.io.sql.execute` (:issue:`50185`)
634635
- :meth:`Index.is_boolean` has been deprecated. Use :func:`pandas.api.types.is_bool_dtype` instead (:issue:`50042`)
635636
- :meth:`Index.is_integer` has been deprecated. Use :func:`pandas.api.types.is_integer_dtype` instead (:issue:`50042`)
636637
- :meth:`Index.is_floating` has been deprecated. Use :func:`pandas.api.types.is_float_dtype` instead (:issue:`50042`)
637638
- :meth:`Index.holds_integer` has been deprecated. Use :func:`pandas.api.types.infer_dtype` instead (:issue:`50243`)
638639
- :meth:`Index.is_categorical` has been deprecated. Use :func:`pandas.api.types.is_categorical_dtype` instead (:issue:`50042`)
640+
- :meth:`Index.is_object` has been deprecated. Use :func:`pandas.api.types.is_object_dtype` instead (:issue:`50042`)
639641
- :meth:`Index.is_interval` has been deprecated. Use :func:`pandas.api.types.is_interval_dtype` instead (:issue:`50042`)
642+
-
640643

641644
.. ---------------------------------------------------------------------------
642645
.. _whatsnew_200.prior_deprecations:
@@ -904,6 +907,7 @@ Performance improvements
904907
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
905908
- Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`)
906909
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
910+
- Performance improvement for indexing operations with nullable dtypes (:issue:`49420`)
907911
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
908912
- Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
909913
- Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
@@ -1021,7 +1025,7 @@ Conversion
10211025

10221026
Strings
10231027
^^^^^^^
1024-
- Bug in :func:`pandas.api.dtypes.is_string_dtype` that would not return ``True`` for :class:`StringDtype` (:issue:`15585`)
1028+
- Bug in :func:`pandas.api.types.is_string_dtype` that would not return ``True`` for :class:`StringDtype` or :class:`ArrowDtype` with ``pyarrow.string()`` (:issue:`15585`)
10251029
- Bug in converting string dtypes to "datetime64[ns]" or "timedelta64[ns]" incorrectly raising ``TypeError`` (:issue:`36153`)
10261030
-
10271031

@@ -1100,6 +1104,8 @@ Period
11001104
- Bug in :meth:`Period.strftime` and :meth:`PeriodIndex.strftime`, raising ``UnicodeDecodeError`` when a locale-specific directive was passed (:issue:`46319`)
11011105
- Bug in adding a :class:`Period` object to an array of :class:`DateOffset` objects incorrectly raising ``TypeError`` (:issue:`50162`)
11021106
- Bug in :class:`Period` where passing a string with finer resolution than nanosecond would result in a ``KeyError`` instead of dropping the extra precision (:issue:`50417`)
1107+
- Bug in parsing strings representing Week-periods e.g. "2017-01-23/2017-01-29" as minute-frequency instead of week-frequency (:issue:`50803`)
1108+
-
11031109

11041110
Plotting
11051111
^^^^^^^^
@@ -1129,6 +1135,7 @@ Groupby/resample/rolling
11291135
- Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`)
11301136
- Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
11311137
- Bug in :meth:`.DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list when resampling on time index (:issue:`50840`)
1138+
- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"ngroup"`` argument (:issue:`45986`)
11321139
-
11331140

11341141
Reshaping

pandas/_libs/hashtable.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,12 @@ class HashTable:
165165
def map_locations(
166166
self,
167167
values: np.ndarray, # np.ndarray[subclass-specific]
168+
mask: npt.NDArray[np.bool_] | None = ...,
168169
) -> None: ...
169170
def lookup(
170171
self,
171172
values: np.ndarray, # np.ndarray[subclass-specific]
173+
mask: npt.NDArray[np.bool_] | None = ...,
172174
) -> npt.NDArray[np.intp]: ...
173175
def get_labels(
174176
self,

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,8 +1005,9 @@ cdef class StringHashTable(HashTable):
10051005
return labels
10061006

10071007
@cython.boundscheck(False)
1008-
def lookup(self, ndarray[object] values) -> ndarray:
1008+
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
10091009
# -> np.ndarray[np.intp]
1010+
# mask not yet implemented
10101011
cdef:
10111012
Py_ssize_t i, n = len(values)
10121013
int ret = 0
@@ -1041,7 +1042,8 @@ cdef class StringHashTable(HashTable):
10411042
return np.asarray(locs)
10421043

10431044
@cython.boundscheck(False)
1044-
def map_locations(self, ndarray[object] values) -> None:
1045+
def map_locations(self, ndarray[object] values, object mask = None) -> None:
1046+
# mask not yet implemented
10451047
cdef:
10461048
Py_ssize_t i, n = len(values)
10471049
int ret = 0
@@ -1314,7 +1316,8 @@ cdef class PyObjectHashTable(HashTable):
13141316
else:
13151317
raise KeyError(key)
13161318

1317-
def map_locations(self, ndarray[object] values) -> None:
1319+
def map_locations(self, ndarray[object] values, object mask = None) -> None:
1320+
# mask not yet implemented
13181321
cdef:
13191322
Py_ssize_t i, n = len(values)
13201323
int ret = 0
@@ -1328,8 +1331,9 @@ cdef class PyObjectHashTable(HashTable):
13281331
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
13291332
self.table.vals[k] = i
13301333

1331-
def lookup(self, ndarray[object] values) -> ndarray:
1334+
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
13321335
# -> np.ndarray[np.intp]
1336+
# mask not yet implemented
13331337
cdef:
13341338
Py_ssize_t i, n = len(values)
13351339
int ret = 0

pandas/_libs/index.pyi

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@ class IndexEngine:
2929
targets: np.ndarray,
3030
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
3131

32+
class MaskedIndexEngine(IndexEngine):
33+
def __init__(self, values: object) -> None: ...
34+
def get_indexer_non_unique(
35+
self, targets: object
36+
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
37+
3238
class Float64Engine(IndexEngine): ...
3339
class Float32Engine(IndexEngine): ...
3440
class Complex128Engine(IndexEngine): ...
@@ -46,6 +52,19 @@ class DatetimeEngine(Int64Engine): ...
4652
class TimedeltaEngine(DatetimeEngine): ...
4753
class PeriodEngine(Int64Engine): ...
4854
class BoolEngine(UInt8Engine): ...
55+
class MaskedBoolEngine(MaskedUInt8Engine): ...
56+
class MaskedFloat64Engine(MaskedIndexEngine): ...
57+
class MaskedFloat32Engine(MaskedIndexEngine): ...
58+
class MaskedComplex128Engine(MaskedIndexEngine): ...
59+
class MaskedComplex64Engine(MaskedIndexEngine): ...
60+
class MaskedInt64Engine(MaskedIndexEngine): ...
61+
class MaskedInt32Engine(MaskedIndexEngine): ...
62+
class MaskedInt16Engine(MaskedIndexEngine): ...
63+
class MaskedInt8Engine(MaskedIndexEngine): ...
64+
class MaskedUInt64Engine(MaskedIndexEngine): ...
65+
class MaskedUInt32Engine(MaskedIndexEngine): ...
66+
class MaskedUInt16Engine(MaskedIndexEngine): ...
67+
class MaskedUInt8Engine(MaskedIndexEngine): ...
4968

5069
class BaseMultiIndexCodesEngine:
5170
levels: list[np.ndarray]
@@ -57,10 +76,7 @@ class BaseMultiIndexCodesEngine:
5776
labels: list[np.ndarray], # all entries integer-dtyped
5877
offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
5978
) -> None: ...
60-
def get_indexer(
61-
self,
62-
target: npt.NDArray[np.object_],
63-
) -> npt.NDArray[np.intp]: ...
79+
def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
6480
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
6581
def get_indexer_with_fill(
6682
self,

0 commit comments

Comments
 (0)