Skip to content

Commit 9ebd536

Browse files
authored
Merge branch 'main' into bug-agg-nonunique-col
2 parents d830964 + 3083ae9 commit 9ebd536

File tree

12 files changed

+96
-37
lines changed

12 files changed

+96
-37
lines changed

.github/workflows/ubuntu.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ jobs:
2828
env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml]
2929
pattern: ["not single_cpu", "single_cpu"]
3030
pyarrow_version: ["8", "9", "10"]
31-
pandas_ci: [1]
3231
include:
3332
- name: "Downstream Compat"
3433
env_file: actions-38-downstream_compat.yaml
@@ -75,7 +74,7 @@ jobs:
7574
test_args: "-W error::DeprecationWarning -W error::FutureWarning"
7675
# TODO(cython3): Re-enable once next-beta(after beta 1) comes out
7776
# There are some warnings failing the build with -werror
78-
pandas_ci: 0
77+
pandas_ci: "0"
7978
exclude:
8079
- env_file: actions-38.yaml
8180
pyarrow_version: "8"
@@ -99,7 +98,7 @@ jobs:
9998
LC_ALL: ${{ matrix.lc_all || '' }}
10099
PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }}
101100
PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
102-
PANDAS_CI: ${{ matrix.pandas_ci }}
101+
PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
103102
TEST_ARGS: ${{ matrix.test_args || '' }}
104103
PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }}
105104
PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}

doc/source/whatsnew/v2.1.0.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ Other enhancements
3636
- :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`)
3737
- Improved the error message when using :meth:`DataFrame.merge` with incompatible columns (:issue:`51861`)
3838
- Improved the error message when creating a DataFrame with empty data (0 rows), no index, and an incorrect number of columns (:issue:`52084`)
39+
- :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`).
3940

4041
.. ---------------------------------------------------------------------------
4142
.. _whatsnew_210.notable_bug_fixes:
@@ -236,7 +237,7 @@ Reshaping
236237

237238
Sparse
238239
^^^^^^
239-
-
240+
- Bug in :meth:`arrays.SparseArray.map` allowed the fill value to be included in the sparse values (:issue:`52095`)
240241
-
241242

242243
ExtensionArray

pandas/_libs/lib.pyx

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -752,7 +752,6 @@ cpdef ndarray[object] ensure_string_array(
752752
out = arr.astype(str).astype(object)
753753
out[arr.isna()] = na_value
754754
return out
755-
756755
arr = arr.to_numpy()
757756
elif not util.is_array(arr):
758757
arr = np.array(arr, dtype="object")

pandas/core/arrays/arrow/array.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2091,7 +2091,10 @@ def _dt_round(
20912091
return self._round_temporally("round", freq, ambiguous, nonexistent)
20922092

20932093
def _dt_to_pydatetime(self):
2094-
return np.array(self._pa_array.to_pylist(), dtype=object)
2094+
data = self._pa_array.to_pylist()
2095+
if self._dtype.pyarrow_dtype.unit == "ns":
2096+
data = [ts.to_pydatetime(warn=False) for ts in data]
2097+
return np.array(data, dtype=object)
20952098

20962099
def _dt_tz_localize(
20972100
self,

pandas/core/arrays/sparse/array.py

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1305,23 +1305,24 @@ def map(self, mapper, na_action=None) -> Self:
13051305
IntIndex
13061306
Indices: array([1, 2], dtype=int32)
13071307
"""
1308-
if na_action is not None:
1309-
raise NotImplementedError
1308+
is_map = isinstance(mapper, (abc.Mapping, ABCSeries))
13101309

1311-
# this is used in apply.
1312-
# We get hit since we're an "is_extension_array_dtype" but regular extension
1313-
# types are not hit. This may be worth adding to the interface.
1314-
if isinstance(mapper, ABCSeries):
1315-
mapper = mapper.to_dict()
1310+
fill_val = self.fill_value
13161311

1317-
if isinstance(mapper, abc.Mapping):
1318-
fill_value = mapper.get(self.fill_value, self.fill_value)
1319-
sp_values = [mapper.get(x, None) for x in self.sp_values]
1320-
else:
1321-
fill_value = mapper(self.fill_value)
1322-
sp_values = [mapper(x) for x in self.sp_values]
1312+
if na_action is None or notna(fill_val):
1313+
fill_val = mapper.get(fill_val, fill_val) if is_map else mapper(fill_val)
1314+
1315+
def func(sp_val):
1316+
new_sp_val = mapper.get(sp_val, None) if is_map else mapper(sp_val)
1317+
# check identity and equality because nans are not equal to each other
1318+
if new_sp_val is fill_val or new_sp_val == fill_val:
1319+
msg = "fill value in the sparse values not supported"
1320+
raise ValueError(msg)
1321+
return new_sp_val
1322+
1323+
sp_values = [func(x) for x in self.sp_values]
13231324

1324-
return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)
1325+
return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_val)
13251326

13261327
def to_dense(self) -> np.ndarray:
13271328
"""

pandas/core/arrays/string_.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,9 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
352352
result[na_values] = libmissing.NA
353353

354354
else:
355+
if hasattr(scalars, "type"):
356+
# pyarrow array
357+
scalars = np.array(scalars)
355358
# convert non-na-likes to str, and nan-likes to StringDtype().na_value
356359
result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
357360

pandas/core/arrays/string_arrow.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
151151
result = scalars._data
152152
result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
153153
return cls(pa.array(result, mask=na_values, type=pa.string()))
154+
elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
155+
return cls(pc.cast(scalars, pa.string()))
154156

155157
# convert non-na-likes to str
156158
result = lib.ensure_string_array(scalars, copy=copy)

pandas/core/groupby/ops.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
257257
# don't go down a group-by-group path, since in the empty-groups
258258
# case that would fail to raise
259259
raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
260-
if how not in ["rank", "any", "all"]:
260+
if how not in ["rank", "any", "all", "first", "last", "min", "max"]:
261261
# only "rank" is implemented in cython
262262
raise NotImplementedError(f"{dtype} dtype not supported")
263263

@@ -356,11 +356,17 @@ def _ea_wrap_cython_operation(
356356
)
357357

358358
elif isinstance(values, Categorical):
359-
assert self.how in ["rank", "any", "all"]
359+
assert self.how in ["rank", "any", "all", "first", "last", "min", "max"]
360360
mask = values.isna()
361361
if self.how == "rank":
362362
assert values.ordered # checked earlier
363363
npvalues = values._ndarray
364+
elif self.how in ["first", "last", "min", "max"]:
365+
if self.how in ["min", "max"]:
366+
assert values.ordered # checked earlier
367+
npvalues = values._ndarray
368+
result_mask = np.zeros(ngroups, dtype=np.uint8)
369+
kwargs["result_mask"] = result_mask
364370
else:
365371
npvalues = values.astype(bool)
366372

@@ -373,9 +379,9 @@ def _ea_wrap_cython_operation(
373379
**kwargs,
374380
)
375381

376-
# If we ever have more than just "rank" here, we'll need to do
377-
# `if self.how in self.cast_blocklist` like we do for other dtypes.
378-
return res_values
382+
if self.how in self.cast_blocklist:
383+
return res_values
384+
return values._from_backing_data(res_values)
379385

380386
npvalues = self._ea_to_cython_values(values)
381387

pandas/io/sql.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -964,14 +964,16 @@ def insert_data(self) -> tuple[list[str], list[np.ndarray]]:
964964
data_list: list[np.ndarray] = [None] * ncols # type: ignore[list-item]
965965

966966
for i, (_, ser) in enumerate(temp.items()):
967-
vals = ser._values
968-
if vals.dtype.kind == "M":
969-
d = vals.to_pydatetime()
970-
elif vals.dtype.kind == "m":
967+
if ser.dtype.kind == "M":
968+
d = ser.dt.to_pydatetime()
969+
elif ser.dtype.kind == "m":
970+
vals = ser._values
971+
if isinstance(vals, ArrowExtensionArray):
972+
vals = vals.to_numpy(dtype=np.dtype("m8[ns]"))
971973
# store as integers, see GH#6921, GH#7076
972974
d = vals.view("i8").astype(object)
973975
else:
974-
d = vals.astype(object)
976+
d = ser._values.astype(object)
975977

976978
assert isinstance(d, np.ndarray), type(d)
977979

pandas/tests/extension/test_arrow.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2271,6 +2271,7 @@ def test_dt_to_pydatetime():
22712271
result = ser.dt.to_pydatetime()
22722272
expected = np.array(data, dtype=object)
22732273
tm.assert_numpy_array_equal(result, expected)
2274+
assert all(type(res) is datetime for res in result)
22742275

22752276
expected = ser.astype("datetime64[ns]").dt.to_pydatetime()
22762277
tm.assert_numpy_array_equal(result, expected)
@@ -2353,6 +2354,14 @@ def test_concat_empty_arrow_backed_series(dtype):
23532354
tm.assert_series_equal(result, expected)
23542355

23552356

2357+
@pytest.mark.parametrize("dtype", ["string", "string[pyarrow]"])
2358+
def test_series_from_string_array(dtype):
2359+
arr = pa.array("the quick brown fox".split())
2360+
ser = pd.Series(arr, dtype=dtype)
2361+
expected = pd.Series(ArrowExtensionArray(arr), dtype=dtype)
2362+
tm.assert_series_equal(ser, expected)
2363+
2364+
23562365
# _data was renamed to _pa_data
23572366
class OldArrowExtensionArray(ArrowExtensionArray):
23582367
def __getstate__(self):

pandas/tests/extension/test_sparse.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -351,14 +351,27 @@ def test_equals(self, data, na_value, as_series, box):
351351
self._check_unsupported(data)
352352
super().test_equals(data, na_value, as_series, box)
353353

354+
@pytest.mark.parametrize(
355+
"func, na_action, expected",
356+
[
357+
(lambda x: x, None, SparseArray([1.0, np.nan])),
358+
(lambda x: x, "ignore", SparseArray([1.0, np.nan])),
359+
(str, None, SparseArray(["1.0", "nan"], fill_value="nan")),
360+
(str, "ignore", SparseArray(["1.0", np.nan])),
361+
],
362+
)
363+
def test_map(self, func, na_action, expected):
364+
# GH52096
365+
data = SparseArray([1, np.nan])
366+
result = data.map(func, na_action=na_action)
367+
self.assert_extension_array_equal(result, expected)
368+
354369
@pytest.mark.parametrize("na_action", [None, "ignore"])
355-
def test_map(self, data, na_action):
356-
if na_action is not None:
357-
with pytest.raises(NotImplementedError, match=""):
358-
data.map(lambda x: x, na_action=na_action)
359-
else:
360-
result = data.map(lambda x: x, na_action=na_action)
361-
self.assert_extension_array_equal(result, data)
370+
def test_map_raises(self, data, na_action):
371+
# GH52096
372+
msg = "fill value in the sparse values not supported"
373+
with pytest.raises(ValueError, match=msg):
374+
data.map(lambda x: np.nan, na_action=na_action)
362375

363376

364377
class TestCasting(BaseSparseTests, base.BaseCastingTests):

pandas/tests/io/test_sql.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
date,
2525
datetime,
2626
time,
27+
timedelta,
2728
)
2829
from io import StringIO
2930
from pathlib import Path
@@ -549,6 +550,26 @@ def test_dataframe_to_sql(conn, test_frame1, request):
549550
test_frame1.to_sql("test", conn, if_exists="append", index=False)
550551

551552

553+
@pytest.mark.db
554+
@pytest.mark.parametrize("conn", all_connectable)
555+
def test_dataframe_to_sql_arrow_dtypes(conn, request):
556+
# GH 52046
557+
pytest.importorskip("pyarrow")
558+
df = DataFrame(
559+
{
560+
"int": pd.array([1], dtype="int8[pyarrow]"),
561+
"datetime": pd.array(
562+
[datetime(2023, 1, 1)], dtype="timestamp[ns][pyarrow]"
563+
),
564+
"timedelta": pd.array([timedelta(1)], dtype="duration[ns][pyarrow]"),
565+
"string": pd.array(["a"], dtype="string[pyarrow]"),
566+
}
567+
)
568+
conn = request.getfixturevalue(conn)
569+
with tm.assert_produces_warning(UserWarning, match="the 'timedelta'"):
570+
df.to_sql("test_arrow", conn, if_exists="replace", index=False)
571+
572+
552573
@pytest.mark.db
553574
@pytest.mark.parametrize("conn", all_connectable)
554575
@pytest.mark.parametrize("method", [None, "multi"])

0 commit comments

Comments
 (0)