
Commit 74972b7

Merge remote-tracking branch 'upstream/main' into feat/cleanup_docs

2 parents: 46810e9 + cf7f0af

File tree: 19 files changed (+190, -167 lines)

doc/source/whatsnew/v2.1.0.rst

Lines changed: 1 addition & 0 deletions
@@ -265,6 +265,7 @@ Other enhancements
 - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to ``lzma.LZMAFile`` (:issue:`52979`)
 - Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`)
 - :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`)
+- :meth:`Series.cummax`, :meth:`Series.cummin` and :meth:`Series.cumprod` are now supported for pyarrow dtypes with pyarrow version 13.0 and above (:issue:`52085`)
 - Added support for the DataFrame Consortium Standard (:issue:`54383`)
 - Performance improvement in :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` (:issue:`51722`)
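
The new changelog entry can be exercised directly. A minimal sketch, assuming pandas 2.1 with pyarrow >= 13.0 installed:

import pandas as pd

# Cumulative reductions on a pyarrow-backed dtype (requires pyarrow >= 13.0).
ser = pd.Series([2, 1, 3], dtype="int64[pyarrow]")
print(ser.cummax().tolist())   # [2, 2, 3]
print(ser.cummin().tolist())   # [2, 1, 1]
print(ser.cumprod().tolist())  # [2, 2, 6]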

pandas/_libs/src/vendored/ujson/python/objToJSON.c

Lines changed: 15 additions & 13 deletions
@@ -1318,6 +1318,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
     } else if (PyDate_Check(item) || PyDelta_Check(item)) {
       is_datetimelike = 1;
       if (PyObject_HasAttrString(item, "_value")) {
+        // pd.Timestamp object or pd.NaT
         // see test_date_index_and_values for case with non-nano
         i8date = get_long_attr(item, "_value");
       } else {
@@ -1471,12 +1472,12 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
       }
       // Currently no way to pass longVal to iso function, so use
       // state management
-      GET_TC(tc)->longValue = longVal;
+      pc->longValue = longVal;
       tc->type = JT_UTF8;
     } else {
       NPY_DATETIMEUNIT base =
           ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
-      GET_TC(tc)->longValue = NpyDateTimeToEpoch(longVal, base);
+      pc->longValue = NpyDateTimeToEpoch(longVal, base);
       tc->type = JT_LONG;
     }
   }
@@ -1497,9 +1498,9 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
   if (PyLong_Check(obj)) {
     tc->type = JT_LONG;
     int overflow = 0;
-    GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow);
+    pc->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow);
     int err;
-    err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred();
+    err = (pc->longValue == -1) && PyErr_Occurred();

     if (overflow) {
       tc->type = JT_BIGNUM;
@@ -1513,7 +1514,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
     if (npy_isnan(val) || npy_isinf(val)) {
       tc->type = JT_NULL;
     } else {
-      GET_TC(tc)->doubleValue = val;
+      pc->doubleValue = val;
       tc->type = JT_DOUBLE;
     }
     return;
@@ -1526,7 +1527,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
     tc->type = JT_UTF8;
     return;
   } else if (object_is_decimal_type(obj)) {
-    GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj);
+    pc->doubleValue = PyFloat_AsDouble(obj);
     tc->type = JT_DOUBLE;
     return;
   } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) {
@@ -1541,7 +1542,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
     } else {
       NPY_DATETIMEUNIT base =
           ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
-      GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
+      pc->longValue = PyDateTimeToEpoch(obj, base);
       tc->type = JT_LONG;
     }
     return;
@@ -1573,12 +1574,13 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
     } else {
       NPY_DATETIMEUNIT base =
           ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
-      GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
+      pc->longValue = PyDateTimeToEpoch(obj, base);
       tc->type = JT_LONG;
     }
     return;
   } else if (PyDelta_Check(obj)) {
     if (PyObject_HasAttrString(obj, "_value")) {
+      // pd.Timedelta object or pd.NaT
       value = get_long_attr(obj, "_value");
     } else {
       value = total_seconds(obj) * 1000000000LL; // nanoseconds per sec
@@ -1604,11 +1606,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {

       tc->type = JT_LONG;
     }
-    GET_TC(tc)->longValue = value;
+    pc->longValue = value;
     return;
   } else if (PyArray_IsScalar(obj, Integer)) {
     tc->type = JT_LONG;
-    PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue),
+    PyArray_CastScalarToCtype(obj, &(pc->longValue),
                               PyArray_DescrFromType(NPY_INT64));

     exc = PyErr_Occurred();
@@ -1619,12 +1621,12 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {

     return;
   } else if (PyArray_IsScalar(obj, Bool)) {
-    PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue),
+    PyArray_CastScalarToCtype(obj, &(pc->longValue),
                               PyArray_DescrFromType(NPY_BOOL));
-    tc->type = (GET_TC(tc)->longValue) ? JT_TRUE : JT_FALSE;
+    tc->type = (pc->longValue) ? JT_TRUE : JT_FALSE;
     return;
   } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) {
-    PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->doubleValue),
+    PyArray_CastScalarToCtype(obj, &(pc->doubleValue),
                               PyArray_DescrFromType(NPY_DOUBLE));
     tc->type = JT_DOUBLE;
     return;
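
Two things change above: comments now mark where the encoder special-cases pandas scalars via their private `_value` attribute, and repeated `GET_TC(tc)` lookups are replaced with the `pc` pointer the function already holds to the same type context. A minimal Python sketch of what the `_value` check distinguishes (illustrative only; `_value` is pandas-internal):

from datetime import datetime

import pandas as pd

# pandas Timestamp/Timedelta/NaT expose `_value`, the integer epoch offset
# in nanoseconds, while stdlib datetime objects do not, so the C encoder
# falls back to total_seconds()/epoch conversion for the latter.
print(hasattr(pd.Timestamp("2023-01-01"), "_value"))  # True
print(hasattr(pd.NaT, "_value"))                      # True
print(hasattr(datetime(2023, 1, 1), "_value"))        # False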

pandas/conftest.py

Lines changed: 5 additions & 0 deletions
@@ -1996,3 +1996,8 @@ def warsaw(request) -> str:
     tzinfo for Europe/Warsaw using pytz, dateutil, or zoneinfo.
     """
     return request.param
+
+
+@pytest.fixture()
+def arrow_string_storage():
+    return ("pyarrow",)

pandas/core/arrays/arrow/array.py

Lines changed: 14 additions & 3 deletions
@@ -1389,6 +1389,9 @@ def _accumulate(
         NotImplementedError : subclass does not define accumulations
         """
         pyarrow_name = {
+            "cummax": "cumulative_max",
+            "cummin": "cumulative_min",
+            "cumprod": "cumulative_prod_checked",
             "cumsum": "cumulative_sum_checked",
         }.get(name, name)
         pyarrow_meth = getattr(pc, pyarrow_name, None)
@@ -1398,12 +1401,20 @@ def _accumulate(
         data_to_accum = self._pa_array

         pa_dtype = data_to_accum.type
-        if pa.types.is_duration(pa_dtype):
-            data_to_accum = data_to_accum.cast(pa.int64())
+
+        convert_to_int = (
+            pa.types.is_temporal(pa_dtype) and name in ["cummax", "cummin"]
+        ) or (pa.types.is_duration(pa_dtype) and name == "cumsum")
+
+        if convert_to_int:
+            if pa_dtype.bit_width == 32:
+                data_to_accum = data_to_accum.cast(pa.int32())
+            else:
+                data_to_accum = data_to_accum.cast(pa.int64())

         result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)

-        if pa.types.is_duration(pa_dtype):
+        if convert_to_int:
             result = result.cast(pa_dtype)

         return type(self)(result)
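
The cast-and-restore step exists because pyarrow's cumulative kernels only accept numeric input: temporal values are viewed as same-width integers, accumulated, then cast back. A standalone sketch of the same trick using pyarrow directly, assuming pyarrow >= 13.0:

import pyarrow as pa
import pyarrow.compute as pc

# cumulative_max rejects duration types, so view the values as int64,
# accumulate, then cast the result back to the original temporal type.
arr = pa.array([3, 1, 2], type=pa.duration("s"))
result = pc.cumulative_max(arr.cast(pa.int64()), skip_nulls=True)
print(result.cast(arr.type))  # [3, 3, 3] as duration[s]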

pandas/io/excel/_odswriter.py

Lines changed: 2 additions & 3 deletions
@@ -2,6 +2,7 @@

 from collections import defaultdict
 import datetime
+import json
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -10,8 +11,6 @@
     overload,
 )

-from pandas._libs import json
-
 from pandas.io.excel._base import ExcelWriter
 from pandas.io.excel._util import (
     combine_kwargs,
@@ -257,7 +256,7 @@ def _process_style(self, style: dict[str, Any] | None) -> str | None:

         if style is None:
             return None
-        style_key = json.ujson_dumps(style)
+        style_key = json.dumps(style)
         if style_key in self._style_dict:
             return self._style_dict[style_key]
         name = f"pd{len(self._style_dict)+1}"

pandas/io/excel/_xlsxwriter.py

Lines changed: 2 additions & 3 deletions
@@ -1,12 +1,11 @@
 from __future__ import annotations

+import json
 from typing import (
     TYPE_CHECKING,
     Any,
 )

-from pandas._libs import json
-
 from pandas.io.excel._base import ExcelWriter
 from pandas.io.excel._util import (
     combine_kwargs,
@@ -262,7 +261,7 @@ def _write_cells(
         for cell in cells:
             val, fmt = self._value_with_fmt(cell.val)

-            stylekey = json.ujson_dumps(cell.style)
+            stylekey = json.dumps(cell.style)
             if fmt:
                 stylekey += fmt

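Both Excel writers previously built style-cache keys with the vendored ujson wrapper (pandas._libs.json); the standard library's json is a drop-in replacement for this purpose. A sketch of the caching pattern the writers rely on (helper name hypothetical):

import json

# Identical style dicts serialize to identical JSON strings, so the string
# can key a cache that maps each distinct style to one named format.
_style_cache: dict[str, str] = {}

def style_name(style: dict) -> str:
    key = json.dumps(style)
    if key not in _style_cache:
        _style_cache[key] = f"pd{len(_style_cache) + 1}"
    return _style_cache[key]

print(style_name({"font": {"bold": True}}))  # pd1
print(style_name({"font": {"bold": True}}))  # pd1 (cache hit)
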
pandas/io/json/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -1,14 +1,14 @@
 from pandas.io.json._json import (
     read_json,
     to_json,
-    ujson_dumps as dumps,
-    ujson_loads as loads,
+    ujson_dumps,
+    ujson_loads,
 )
 from pandas.io.json._table_schema import build_table_schema

 __all__ = [
-    "dumps",
-    "loads",
+    "ujson_dumps",
+    "ujson_loads",
     "read_json",
     "to_json",
     "build_table_schema",

pandas/tests/arrays/string_/test_string.py

Lines changed: 10 additions & 10 deletions
@@ -115,8 +115,8 @@ def test_add(dtype):
     tm.assert_series_equal(result, expected)


-def test_add_2d(dtype, request):
-    if dtype.storage == "pyarrow":
+def test_add_2d(dtype, request, arrow_string_storage):
+    if dtype.storage in arrow_string_storage:
         reason = "Failed: DID NOT RAISE <class 'ValueError'>"
         mark = pytest.mark.xfail(raises=None, reason=reason)
         request.node.add_marker(mark)
@@ -144,8 +144,8 @@ def test_add_sequence(dtype):
     tm.assert_extension_array_equal(result, expected)


-def test_mul(dtype, request):
-    if dtype.storage == "pyarrow":
+def test_mul(dtype, request, arrow_string_storage):
+    if dtype.storage in arrow_string_storage:
         reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
         mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason)
         request.node.add_marker(mark)
@@ -369,8 +369,8 @@ def test_min_max(method, skipna, dtype, request):

 @pytest.mark.parametrize("method", ["min", "max"])
 @pytest.mark.parametrize("box", [pd.Series, pd.array])
-def test_min_max_numpy(method, box, dtype, request):
-    if dtype.storage == "pyarrow" and box is pd.array:
+def test_min_max_numpy(method, box, dtype, request, arrow_string_storage):
+    if dtype.storage in arrow_string_storage and box is pd.array:
         if box is pd.array:
             reason = "'<=' not supported between instances of 'str' and 'NoneType'"
         else:
@@ -384,7 +384,7 @@ def test_min_max_numpy(method, box, dtype, request):
     assert result == expected


-def test_fillna_args(dtype, request):
+def test_fillna_args(dtype, request, arrow_string_storage):
     # GH 37987

     arr = pd.array(["a", pd.NA], dtype=dtype)
@@ -397,7 +397,7 @@ def test_fillna_args(dtype, request):
     expected = pd.array(["a", "b"], dtype=dtype)
     tm.assert_extension_array_equal(res, expected)

-    if dtype.storage == "pyarrow":
+    if dtype.storage in arrow_string_storage:
         msg = "Invalid value '1' for dtype string"
     else:
         msg = "Cannot set non-string value '1' into a StringArray."
@@ -503,10 +503,10 @@ def test_use_inf_as_na(values, expected, dtype):
     tm.assert_frame_equal(result, expected)


-def test_memory_usage(dtype):
+def test_memory_usage(dtype, arrow_string_storage):
     # GH 33963

-    if dtype.storage == "pyarrow":
+    if dtype.storage in arrow_string_storage:
         pytest.skip(f"not applicable for {dtype.storage}")

     series = pd.Series(["a", "b", "c"], dtype=dtype)

pandas/tests/arrays/string_/test_string_arrow.py

Lines changed: 2 additions & 2 deletions
@@ -49,10 +49,10 @@ def test_config_bad_storage_raises():
 @skip_if_no_pyarrow
 @pytest.mark.parametrize("chunked", [True, False])
 @pytest.mark.parametrize("array", ["numpy", "pyarrow"])
-def test_constructor_not_string_type_raises(array, chunked):
+def test_constructor_not_string_type_raises(array, chunked, arrow_string_storage):
     import pyarrow as pa

-    array = pa if array == "pyarrow" else np
+    array = pa if array in arrow_string_storage else np

     arr = array.array([1, 2, 3])
     if chunked:

pandas/tests/extension/base/ops.py

Lines changed: 17 additions & 3 deletions
@@ -239,9 +239,23 @@ def test_compare_array(self, data, comparison_op):
 class BaseUnaryOpsTests(BaseOpsUtil):
     def test_invert(self, data):
         ser = pd.Series(data, name="name")
-        result = ~ser
-        expected = pd.Series(~data, name="name")
-        tm.assert_series_equal(result, expected)
+        try:
+            # 10 is an arbitrary choice here, just avoid iterating over
+            # the whole array to trim test runtime
+            [~x for x in data[:10]]
+        except TypeError:
+            # scalars don't support invert -> we don't expect the vectorized
+            # operation to succeed
+            with pytest.raises(TypeError):
+                ~ser
+            with pytest.raises(TypeError):
+                ~data
+        else:
+            # Note we do not re-use the pointwise result to construct expected
+            # because python semantics for negating bools are weird see GH#54569
+            result = ~ser
+            expected = pd.Series(~data, name="name")
+            tm.assert_series_equal(result, expected)

     @pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs])
     def test_unary_ufunc_dunder_equivalence(self, data, ufunc):
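
The comment in the else branch points at a real Python pitfall: inverting a bool scalar uses integer semantics, so the pointwise results cannot be reused to build the expected Series. A quick demonstration:

import pandas as pd

# Scalar bools invert as ints (~True == -2, ~False == -1), while a boolean
# Series inverts elementwise to booleans; hence `expected` is built from
# ~data rather than from the scalar loop (see GH#54569).
print(~True)                                 # -2
print((~pd.Series([True, False])).tolist())  # [False, True]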
