Skip to content

Commit f43078b

Browse files
committed
Merge branch 'main' into to-json-append-mode
2 parents a8cc86d + 381dce3 commit f43078b

File tree

30 files changed

+472
-270
lines changed

30 files changed

+472
-270
lines changed

asv_bench/benchmarks/algorithms.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,29 @@ def time_duplicated(self, unique, keep, dtype):
9595
self.idx.duplicated(keep=keep)
9696

9797

98+
class DuplicatedMaskedArray:
99+
100+
params = [
101+
[True, False],
102+
["first", "last", False],
103+
["Int64", "Float64"],
104+
]
105+
param_names = ["unique", "keep", "dtype"]
106+
107+
def setup(self, unique, keep, dtype):
108+
N = 10**5
109+
data = pd.Series(np.arange(N), dtype=dtype)
110+
data[list(range(1, N, 100))] = pd.NA
111+
if not unique:
112+
data = data.repeat(5)
113+
self.ser = data
114+
# cache is_unique
115+
self.ser.is_unique
116+
117+
def time_duplicated(self, unique, keep, dtype):
118+
self.ser.duplicated(keep=keep)
119+
120+
98121
class Hashing:
99122
def setup_cache(self):
100123
N = 10**5

doc/source/reference/arrays.rst

Lines changed: 52 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,21 @@ objects contained with a :class:`Index`, :class:`Series`, or
1919
For some data types, pandas extends NumPy's type system. String aliases for these types
2020
can be found at :ref:`basics.dtypes`.
2121

22-
=================== ========================= ============================= =============================
23-
Kind of Data pandas Data Type Scalar Array
24-
=================== ========================= ============================= =============================
25-
TZ-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :ref:`api.arrays.datetime`
26-
Timedeltas (none) :class:`Timedelta` :ref:`api.arrays.timedelta`
27-
Period (time spans) :class:`PeriodDtype` :class:`Period` :ref:`api.arrays.period`
28-
Intervals :class:`IntervalDtype` :class:`Interval` :ref:`api.arrays.interval`
29-
Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.arrays.integer_na`
30-
Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical`
31-
Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse`
32-
Strings :class:`StringDtype` :class:`str` :ref:`api.arrays.string`
33-
Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool`
34-
PyArrow :class:`ArrowDtype` Python Scalars or :class:`NA` :ref:`api.arrays.arrow`
35-
=================== ========================= ============================= =============================
22+
=================== ========================== ============================= =============================
23+
Kind of Data pandas Data Type Scalar Array
24+
=================== ========================== ============================= =============================
25+
TZ-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :ref:`api.arrays.datetime`
26+
Timedeltas (none) :class:`Timedelta` :ref:`api.arrays.timedelta`
27+
Period (time spans) :class:`PeriodDtype` :class:`Period` :ref:`api.arrays.period`
28+
Intervals :class:`IntervalDtype` :class:`Interval` :ref:`api.arrays.interval`
29+
Nullable Integer :class:`Int64Dtype`, ... (none) :ref:`api.arrays.integer_na`
30+
Nullable Float :class:`Float64Dtype`, ... (none) :ref:`api.arrays.float_na`
31+
Categorical :class:`CategoricalDtype` (none) :ref:`api.arrays.categorical`
32+
Sparse :class:`SparseDtype` (none) :ref:`api.arrays.sparse`
33+
Strings :class:`StringDtype` :class:`str` :ref:`api.arrays.string`
34+
Nullable Boolean :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool`
35+
PyArrow :class:`ArrowDtype` Python Scalars or :class:`NA` :ref:`api.arrays.arrow`
36+
=================== ========================== ============================= =============================
3637

3738
pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`).
3839
The top-level :meth:`array` method can be used to create a new array, which may be
@@ -91,13 +92,20 @@ with the :class:`arrays.DatetimeArray` extension array, which can hold timezone-
9192
or timezone-aware values.
9293

9394
:class:`Timestamp`, a subclass of :class:`datetime.datetime`, is pandas'
94-
scalar type for timezone-naive or timezone-aware datetime data.
95+
scalar type for timezone-naive or timezone-aware datetime data. :class:`NaT`
96+
is the missing value for datetime data.
9597

9698
.. autosummary::
9799
:toctree: api/
98100

99101
Timestamp
100102

103+
.. autosummary::
104+
:toctree: api/
105+
:template: autosummary/class_without_autosummary.rst
106+
107+
NaT
108+
101109
Properties
102110
~~~~~~~~~~
103111
.. autosummary::
@@ -208,13 +216,20 @@ Timedeltas
208216
----------
209217

210218
NumPy can natively represent timedeltas. pandas provides :class:`Timedelta`
211-
for symmetry with :class:`Timestamp`.
219+
for symmetry with :class:`Timestamp`. :class:`NaT`
220+
is the missing value for timedelta data.
212221

213222
.. autosummary::
214223
:toctree: api/
215224

216225
Timedelta
217226

227+
.. autosummary::
228+
:toctree: api/
229+
:template: autosummary/class_without_autosummary.rst
230+
231+
NaT
232+
218233
Properties
219234
~~~~~~~~~~
220235
.. autosummary::
@@ -419,6 +434,26 @@ pandas provides this through :class:`arrays.IntegerArray`.
419434
UInt16Dtype
420435
UInt32Dtype
421436
UInt64Dtype
437+
NA
438+
439+
.. _api.arrays.float_na:
440+
441+
Nullable float
442+
--------------
443+
444+
.. autosummary::
445+
:toctree: api/
446+
:template: autosummary/class_without_autosummary.rst
447+
448+
arrays.FloatingArray
449+
450+
.. autosummary::
451+
:toctree: api/
452+
:template: autosummary/class_without_autosummary.rst
453+
454+
Float32Dtype
455+
Float64Dtype
456+
NA
422457

423458
.. _api.arrays.categorical:
424459

@@ -555,6 +590,7 @@ with a bool :class:`numpy.ndarray`.
555590
:template: autosummary/class_without_autosummary.rst
556591

557592
BooleanDtype
593+
NA
558594

559595

560596
.. Dtype attributes which are manually listed in their docstrings: including

doc/source/reference/general_functions.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Data manipulations
2626
from_dummies
2727
factorize
2828
unique
29+
lreshape
2930
wide_to_long
3031

3132
Top-level missing data

doc/source/reference/groupby.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,13 @@ Indexing, iteration
2727

2828
Grouper
2929

30+
Function application helper
31+
---------------------------
32+
.. autosummary::
33+
:toctree: api/
34+
35+
NamedAgg
36+
3037
.. currentmodule:: pandas.core.groupby
3138

3239
Function application

doc/source/reference/options.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,10 @@ Working with options
1919
get_option
2020
set_option
2121
option_context
22+
23+
Numeric formatting
24+
------------------
25+
.. autosummary::
26+
:toctree: api/
27+
28+
set_eng_float_format

doc/source/whatsnew/v1.6.0.rst

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Other enhancements
3131
- :meth:`.GroupBy.quantile` now preserves nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
3232
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` now support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overridden (:issue:`47819`)
3333
- :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
34+
- Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
3435
- :meth:`DataFrame.to_json` now supports a ``mode`` keyword accepting ``'w'`` (the default) and ``'a'``. ``mode='a'`` can be used when ``lines=True`` and ``orient='records'`` to append record-oriented JSON lines to an existing JSON file (:issue:`35849`)
3536

3637
.. ---------------------------------------------------------------------------
@@ -104,10 +105,12 @@ Performance improvements
104105
~~~~~~~~~~~~~~~~~~~~~~~~
105106
- Performance improvement in :meth:`.GroupBy.median` for nullable dtypes (:issue:`37493`)
106107
- Performance improvement in :meth:`MultiIndex.argsort` and :meth:`MultiIndex.sort_values` (:issue:`48406`)
108+
- Performance improvement in :meth:`MultiIndex.union` without missing values and without duplicates (:issue:`48505`)
107109
- Performance improvement in :meth:`.GroupBy.mean` and :meth:`.GroupBy.var` for extension array dtypes (:issue:`37493`)
108110
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
109111
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
110112
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
113+
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`)
111114
- Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`)
112115
-
113116

@@ -124,7 +127,7 @@ Categorical
124127

125128
Datetimelike
126129
^^^^^^^^^^^^
127-
-
130+
- Bug in :func:`pandas.infer_freq`, raising ``TypeError`` when inferred on :class:`RangeIndex` (:issue:`47084`)
128131
-
129132

130133
Timedelta
@@ -171,7 +174,7 @@ Missing
171174
MultiIndex
172175
^^^^^^^^^^
173176
- Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`)
174-
- Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`)
177+
- Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`, :issue:`48505`)
175178
- Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`)
176179
-
177180

pandas/_libs/hashtable.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ class IntpHashTable(HashTable): ...
183183
def duplicated(
184184
values: np.ndarray,
185185
keep: Literal["last", "first", False] = ...,
186+
mask: npt.NDArray[np.bool_] | None = ...,
186187
) -> npt.NDArray[np.bool_]: ...
187188
def mode(
188189
values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ...

pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 60 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,9 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
118118
@cython.wraparound(False)
119119
@cython.boundscheck(False)
120120
{{if dtype == 'object'}}
121-
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'):
121+
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
122122
{{else}}
123-
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
123+
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
124124
{{endif}}
125125
cdef:
126126
int ret = 0
@@ -129,10 +129,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
129129
{{else}}
130130
PyObject* value
131131
{{endif}}
132-
Py_ssize_t i, n = len(values)
132+
Py_ssize_t i, n = len(values), first_na = -1
133133
khiter_t k
134134
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
135135
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
136+
bint seen_na = False, uses_mask = mask is not None
137+
bint seen_multiple_na = False
136138

137139
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
138140

@@ -147,9 +149,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
147149
{{endif}}
148150
for i in range(n - 1, -1, -1):
149151
# equivalent: range(n)[::-1], which cython doesn't like in nogil
150-
value = {{to_c_type}}(values[i])
151-
kh_put_{{ttype}}(table, value, &ret)
152-
out[i] = ret == 0
152+
if uses_mask and mask[i]:
153+
if seen_na:
154+
out[i] = True
155+
else:
156+
out[i] = False
157+
seen_na = True
158+
else:
159+
value = {{to_c_type}}(values[i])
160+
kh_put_{{ttype}}(table, value, &ret)
161+
out[i] = ret == 0
153162

154163
elif keep == 'first':
155164
{{if dtype == 'object'}}
@@ -158,9 +167,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
158167
with nogil:
159168
{{endif}}
160169
for i in range(n):
161-
value = {{to_c_type}}(values[i])
162-
kh_put_{{ttype}}(table, value, &ret)
163-
out[i] = ret == 0
170+
if uses_mask and mask[i]:
171+
if seen_na:
172+
out[i] = True
173+
else:
174+
out[i] = False
175+
seen_na = True
176+
else:
177+
value = {{to_c_type}}(values[i])
178+
kh_put_{{ttype}}(table, value, &ret)
179+
out[i] = ret == 0
164180

165181
else:
166182
{{if dtype == 'object'}}
@@ -169,15 +185,28 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
169185
with nogil:
170186
{{endif}}
171187
for i in range(n):
172-
value = {{to_c_type}}(values[i])
173-
k = kh_get_{{ttype}}(table, value)
174-
if k != table.n_buckets:
175-
out[table.vals[k]] = 1
176-
out[i] = 1
188+
if uses_mask and mask[i]:
189+
if not seen_na:
190+
first_na = i
191+
seen_na = True
192+
out[i] = 0
193+
elif not seen_multiple_na:
194+
out[i] = 1
195+
out[first_na] = 1
196+
seen_multiple_na = True
197+
else:
198+
out[i] = 1
199+
177200
else:
178-
k = kh_put_{{ttype}}(table, value, &ret)
179-
table.vals[k] = i
180-
out[i] = 0
201+
value = {{to_c_type}}(values[i])
202+
k = kh_get_{{ttype}}(table, value)
203+
if k != table.n_buckets:
204+
out[table.vals[k]] = 1
205+
out[i] = 1
206+
else:
207+
k = kh_put_{{ttype}}(table, value, &ret)
208+
table.vals[k] = i
209+
out[i] = 0
181210

182211
kh_destroy_{{ttype}}(table)
183212
return out
@@ -301,37 +330,37 @@ cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=N
301330
raise TypeError(values.dtype)
302331

303332

304-
cpdef duplicated(ndarray[htfunc_t] values, object keep="first"):
333+
cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None):
305334
if htfunc_t is object:
306-
return duplicated_object(values, keep)
335+
return duplicated_object(values, keep, mask=mask)
307336

308337
elif htfunc_t is int8_t:
309-
return duplicated_int8(values, keep)
338+
return duplicated_int8(values, keep, mask=mask)
310339
elif htfunc_t is int16_t:
311-
return duplicated_int16(values, keep)
340+
return duplicated_int16(values, keep, mask=mask)
312341
elif htfunc_t is int32_t:
313-
return duplicated_int32(values, keep)
342+
return duplicated_int32(values, keep, mask=mask)
314343
elif htfunc_t is int64_t:
315-
return duplicated_int64(values, keep)
344+
return duplicated_int64(values, keep, mask=mask)
316345

317346
elif htfunc_t is uint8_t:
318-
return duplicated_uint8(values, keep)
347+
return duplicated_uint8(values, keep, mask=mask)
319348
elif htfunc_t is uint16_t:
320-
return duplicated_uint16(values, keep)
349+
return duplicated_uint16(values, keep, mask=mask)
321350
elif htfunc_t is uint32_t:
322-
return duplicated_uint32(values, keep)
351+
return duplicated_uint32(values, keep, mask=mask)
323352
elif htfunc_t is uint64_t:
324-
return duplicated_uint64(values, keep)
353+
return duplicated_uint64(values, keep, mask=mask)
325354

326355
elif htfunc_t is float64_t:
327-
return duplicated_float64(values, keep)
356+
return duplicated_float64(values, keep, mask=mask)
328357
elif htfunc_t is float32_t:
329-
return duplicated_float32(values, keep)
358+
return duplicated_float32(values, keep, mask=mask)
330359

331360
elif htfunc_t is complex128_t:
332-
return duplicated_complex128(values, keep)
361+
return duplicated_complex128(values, keep, mask=mask)
333362
elif htfunc_t is complex64_t:
334-
return duplicated_complex64(values, keep)
363+
return duplicated_complex64(values, keep, mask=mask)
335364

336365
else:
337366
raise TypeError(values.dtype)

pandas/_libs/lib.pyi

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
5959
def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ...
6060
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
6161
def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ...
62-
def fast_unique_multiple(arrays: list, sort: bool = ...) -> list: ...
62+
def fast_unique_multiple(left: np.ndarray, right: np.ndarray) -> list: ...
6363
def map_infer(
6464
arr: np.ndarray,
6565
f: Callable[[Any], Any],

0 commit comments

Comments
 (0)