Skip to content

Commit ca3b8fe

Browse files
committed
Merge remote-tracking branch 'upstream/main' into string_dtype_tests
# Conflicts: # doc/source/whatsnew/v2.1.2.rst # pandas/core/arrays/base.py # pandas/core/arrays/string_arrow.py # pandas/tests/frame/methods/test_join.py # pandas/tests/frame/test_constructors.py # pandas/tests/indexes/base_class/test_reshape.py # pandas/tests/indexes/categorical/test_equals.py
2 parents 28f5411 + 8664572 commit ca3b8fe

File tree

36 files changed

+924
-100
lines changed

36 files changed

+924
-100
lines changed

.github/workflows/wheels.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ jobs:
138138
run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
139139

140140
- name: Build wheels
141-
uses: pypa/cibuildwheel@v2.16.0
141+
uses: pypa/cibuildwheel@v2.16.1
142142
with:
143143
package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
144144
env:

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ repos:
8484
'--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
8585
]
8686
- repo: https://github.com/pylint-dev/pylint
87-
rev: v3.0.0a7
87+
rev: v3.0.0b0
8888
hooks:
8989
- id: pylint
9090
stages: [manual]

asv_bench/benchmarks/algorithms.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from importlib import import_module
22

33
import numpy as np
4+
import pyarrow as pa
45

56
import pandas as pd
67

@@ -72,7 +73,16 @@ class Duplicated:
7273
params = [
7374
[True, False],
7475
["first", "last", False],
75-
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
76+
[
77+
"int",
78+
"uint",
79+
"float",
80+
"string",
81+
"datetime64[ns]",
82+
"datetime64[ns, tz]",
83+
"timestamp[ms][pyarrow]",
84+
"duration[s][pyarrow]",
85+
],
7686
]
7787
param_names = ["unique", "keep", "dtype"]
7888

@@ -87,6 +97,12 @@ def setup(self, unique, keep, dtype):
8797
"datetime64[ns, tz]": pd.date_range(
8898
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
8999
),
100+
"timestamp[ms][pyarrow]": pd.Index(
101+
np.arange(N), dtype=pd.ArrowDtype(pa.timestamp("ms"))
102+
),
103+
"duration[s][pyarrow]": pd.Index(
104+
np.arange(N), dtype=pd.ArrowDtype(pa.duration("s"))
105+
),
90106
}[dtype]
91107
if not unique:
92108
data = data.repeat(5)

doc/source/reference/extensions.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ objects.
4949
api.extensions.ExtensionArray.copy
5050
api.extensions.ExtensionArray.view
5151
api.extensions.ExtensionArray.dropna
52+
api.extensions.ExtensionArray.duplicated
5253
api.extensions.ExtensionArray.equals
5354
api.extensions.ExtensionArray.factorize
5455
api.extensions.ExtensionArray.fillna

doc/source/user_guide/10min.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,7 @@ Merge
451451
Concat
452452
~~~~~~
453453

454-
pandas provides various facilities for easily combining together :class:`Series`` and
454+
pandas provides various facilities for easily combining together :class:`Series` and
455455
:class:`DataFrame` objects with various kinds of set logic for the indexes
456456
and relational algebra functionality in the case of join / merge-type
457457
operations.

doc/source/whatsnew/v2.1.2.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ Fixed regressions
2222

2323
Bug fixes
2424
~~~~~~~~~
25+
- Fixed bug in :meth:`Categorical.equals` when ``other`` has an Arrow-backed string dtype (:issue:`55364`)
26+
- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
2527
- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`)
2628
- Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`)
2729
- Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
28-
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`TODO`)
29-
- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`TODO`)
30-
- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (issue:`TODO`)
31-
- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`TODO`)
30+
- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
31+
- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
3232
-
3333

3434
.. ---------------------------------------------------------------------------

doc/source/whatsnew/v2.2.0.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ Other enhancements
7676

7777
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
7878
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
79+
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
7980
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
8081
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
8182
-
@@ -241,6 +242,7 @@ Performance improvements
241242
- Performance improvement in :meth:`DataFrame.groupby` when aggregating pyarrow timestamp and duration dtypes (:issue:`55031`)
242243
- Performance improvement in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` when indexed by a :class:`MultiIndex` (:issue:`54835`)
243244
- Performance improvement in :meth:`Index.difference` (:issue:`55108`)
245+
- Performance improvement in :meth:`Series.duplicated` for pyarrow dtypes (:issue:`55255`)
244246
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
245247
- Performance improvement when localizing time to UTC (:issue:`55241`)
246248

@@ -281,7 +283,7 @@ Numeric
281283

282284
Conversion
283285
^^^^^^^^^^
284-
-
286+
- Bug in :meth:`Series.convert_dtypes` not converting all-NA columns to ``null[pyarrow]`` (:issue:`55346`)
285287
-
286288

287289
Strings
@@ -310,7 +312,7 @@ Missing
310312

311313
MultiIndex
312314
^^^^^^^^^^
313-
-
315+
- Bug in :meth:`MultiIndex.get_indexer` not raising ``ValueError`` when ``method`` provided and index is non-monotonic (:issue:`53452`)
314316
-
315317

316318
I/O

pandas/_libs/tslibs/offsets.pyx

Lines changed: 114 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4329,28 +4329,6 @@ cdef class CustomBusinessHour(BusinessHour):
43294329

43304330

43314331
cdef class _CustomBusinessMonth(BusinessMixin):
4332-
"""
4333-
DateOffset subclass representing custom business month(s).
4334-
4335-
Increments between beginning/end of month dates.
4336-
4337-
Parameters
4338-
----------
4339-
n : int, default 1
4340-
The number of months represented.
4341-
normalize : bool, default False
4342-
Normalize start/end dates to midnight before generating date range.
4343-
weekmask : str, Default 'Mon Tue Wed Thu Fri'
4344-
Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
4345-
holidays : list
4346-
List/array of dates to exclude from the set of valid business days,
4347-
passed to ``numpy.busdaycalendar``.
4348-
calendar : np.busdaycalendar
4349-
Calendar to integrate.
4350-
offset : timedelta, default timedelta(0)
4351-
Time offset to apply.
4352-
"""
4353-
43544332
_attributes = tuple(
43554333
["n", "normalize", "weekmask", "holidays", "calendar", "offset"]
43564334
)
@@ -4426,10 +4404,124 @@ cdef class _CustomBusinessMonth(BusinessMixin):
44264404

44274405

44284406
cdef class CustomBusinessMonthEnd(_CustomBusinessMonth):
4407+
"""
4408+
DateOffset subclass representing custom business month(s).
4409+
4410+
Increments between end of month dates.
4411+
4412+
Parameters
4413+
----------
4414+
n : int, default 1
4415+
The number of months represented.
4416+
normalize : bool, default False
4417+
Normalize end dates to midnight before generating date range.
4418+
weekmask : str, default 'Mon Tue Wed Thu Fri'
4419+
Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
4420+
holidays : list
4421+
List/array of dates to exclude from the set of valid business days,
4422+
passed to ``numpy.busdaycalendar``.
4423+
calendar : np.busdaycalendar
4424+
Calendar to integrate.
4425+
offset : timedelta, default timedelta(0)
4426+
Time offset to apply.
4427+
4428+
See Also
4429+
--------
4430+
:class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment.
4431+
4432+
Examples
4433+
--------
4434+
In the example below we use the default parameters.
4435+
4436+
>>> ts = pd.Timestamp(2022, 8, 5)
4437+
>>> ts + pd.offsets.CustomBusinessMonthEnd()
4438+
Timestamp('2022-08-31 00:00:00')
4439+
4440+
Custom business month end can be specified by ``weekmask`` parameter.
4441+
To convert the returned datetime object to its string representation
4442+
the function strftime() is used in the next example.
4443+
4444+
>>> import datetime as dt
4445+
>>> freq = pd.offsets.CustomBusinessMonthEnd(weekmask="Wed Thu")
4446+
>>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18),
4447+
... freq=freq).strftime('%a %d %b %Y %H:%M')
4448+
Index(['Thu 28 Jul 2022 00:00', 'Wed 31 Aug 2022 00:00',
4449+
'Thu 29 Sep 2022 00:00', 'Thu 27 Oct 2022 00:00',
4450+
'Wed 30 Nov 2022 00:00'],
4451+
dtype='object')
4452+
4453+
Using a NumPy business day calendar, you can define custom holidays.
4454+
4455+
>>> import datetime as dt
4456+
>>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30',
4457+
... '2022-10-31', '2022-11-01'])
4458+
>>> freq = pd.offsets.CustomBusinessMonthEnd(calendar=bdc)
4459+
>>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq)
4460+
DatetimeIndex(['2022-07-29', '2022-08-31', '2022-09-29', '2022-10-28'],
4461+
dtype='datetime64[ns]', freq='CBM')
4462+
"""
4463+
44294464
_prefix = "CBM"
44304465

44314466

44324467
cdef class CustomBusinessMonthBegin(_CustomBusinessMonth):
4468+
"""
4469+
DateOffset subclass representing custom business month(s).
4470+
4471+
Increments between beginning of month dates.
4472+
4473+
Parameters
4474+
----------
4475+
n : int, default 1
4476+
The number of months represented.
4477+
normalize : bool, default False
4478+
Normalize start dates to midnight before generating date range.
4479+
weekmask : str, default 'Mon Tue Wed Thu Fri'
4480+
Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
4481+
holidays : list
4482+
List/array of dates to exclude from the set of valid business days,
4483+
passed to ``numpy.busdaycalendar``.
4484+
calendar : np.busdaycalendar
4485+
Calendar to integrate.
4486+
offset : timedelta, default timedelta(0)
4487+
Time offset to apply.
4488+
4489+
See Also
4490+
--------
4491+
:class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment.
4492+
4493+
Examples
4494+
--------
4495+
In the example below we use the default parameters.
4496+
4497+
>>> ts = pd.Timestamp(2022, 8, 5)
4498+
>>> ts + pd.offsets.CustomBusinessMonthBegin()
4499+
Timestamp('2022-09-01 00:00:00')
4500+
4501+
Custom business month start can be specified by ``weekmask`` parameter.
4502+
To convert the returned datetime object to its string representation
4503+
the function strftime() is used in the next example.
4504+
4505+
>>> import datetime as dt
4506+
>>> freq = pd.offsets.CustomBusinessMonthBegin(weekmask="Wed Thu")
4507+
>>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 12, 18),
4508+
... freq=freq).strftime('%a %d %b %Y %H:%M')
4509+
Index(['Wed 03 Aug 2022 00:00', 'Thu 01 Sep 2022 00:00',
4510+
'Wed 05 Oct 2022 00:00', 'Wed 02 Nov 2022 00:00',
4511+
'Thu 01 Dec 2022 00:00'],
4512+
dtype='object')
4513+
4514+
Using a NumPy business day calendar, you can define custom holidays.
4515+
4516+
>>> import datetime as dt
4517+
>>> bdc = np.busdaycalendar(holidays=['2022-08-01', '2022-09-30',
4518+
... '2022-10-31', '2022-11-01'])
4519+
>>> freq = pd.offsets.CustomBusinessMonthBegin(calendar=bdc)
4520+
>>> pd.date_range(dt.datetime(2022, 7, 10), dt.datetime(2022, 11, 10), freq=freq)
4521+
DatetimeIndex(['2022-08-02', '2022-09-01', '2022-10-03', '2022-11-02'],
4522+
dtype='datetime64[ns]', freq='CBMS')
4523+
"""
4524+
44334525
_prefix = "CBMS"
44344526

44354527

pandas/_typing.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
Hashable,
55
Iterator,
66
Mapping,
7+
MutableMapping,
78
Sequence,
89
)
910
from datetime import (
@@ -103,6 +104,7 @@
103104
TypeGuard: Any = None
104105

105106
HashableT = TypeVar("HashableT", bound=Hashable)
107+
MutableMappingT = TypeVar("MutableMappingT", bound=MutableMapping)
106108

107109
# array-like
108110

pandas/core/algorithms.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@
5555
)
5656
from pandas.core.dtypes.concat import concat_compat
5757
from pandas.core.dtypes.dtypes import (
58-
ArrowDtype,
5958
BaseMaskedDtype,
6059
CategoricalDtype,
6160
ExtensionDtype,
@@ -979,36 +978,32 @@ def value_counts_arraylike(
979978

980979

981980
def duplicated(
982-
values: ArrayLike, keep: Literal["first", "last", False] = "first"
981+
values: ArrayLike,
982+
keep: Literal["first", "last", False] = "first",
983+
mask: npt.NDArray[np.bool_] | None = None,
983984
) -> npt.NDArray[np.bool_]:
984985
"""
985986
Return boolean ndarray denoting duplicate values.
986987
987988
Parameters
988989
----------
989-
values : nd.array, ExtensionArray or Series
990+
values : np.ndarray or ExtensionArray
990991
Array over which to check for duplicate values.
991992
keep : {'first', 'last', False}, default 'first'
992993
- ``first`` : Mark duplicates as ``True`` except for the first
993994
occurrence.
994995
- ``last`` : Mark duplicates as ``True`` except for the last
995996
occurrence.
996997
- False : Mark all duplicates as ``True``.
998+
mask : ndarray[bool], optional
999+
Array indicating which elements to exclude from checking.
9971000
9981001
Returns
9991002
-------
10001003
duplicated : ndarray[bool]
10011004
"""
1002-
if hasattr(values, "dtype"):
1003-
if isinstance(values.dtype, ArrowDtype) and values.dtype.kind in "ifub":
1004-
values = values._to_masked() # type: ignore[union-attr]
1005-
1006-
if isinstance(values.dtype, BaseMaskedDtype):
1007-
values = cast("BaseMaskedArray", values)
1008-
return htable.duplicated(values._data, keep=keep, mask=values._mask)
1009-
10101005
values = _ensure_data(values)
1011-
return htable.duplicated(values, keep=keep)
1006+
return htable.duplicated(values, keep=keep, mask=mask)
10121007

10131008

10141009
def mode(

0 commit comments

Comments
 (0)