Skip to content

Commit e987daf

Browse files
committed
Merge remote-tracking branch 'upstream/main' into bitmask-backed
2 parents e08a647 + ca42994 commit e987daf

File tree

23 files changed

+196
-80
lines changed

23 files changed

+196
-80
lines changed

.github/workflows/code-checks.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ jobs:
124124
run: |
125125
cd asv_bench
126126
asv machine --yes
127-
asv run --quick --dry-run --strict --durations=30 --python=same
127+
asv run --quick --dry-run --durations=30 --python=same
128128
129129
build_docker_dev_environment:
130130
name: Build Docker Dev Environment

doc/source/whatsnew/v2.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -816,6 +816,7 @@ Reshaping
816816
- Bug in :func:`merge_asof` raising ``KeyError`` for extension dtypes (:issue:`52904`)
817817
- Bug in :func:`merge_asof` raising ``ValueError`` for data backed by read-only ndarrays (:issue:`53513`)
818818
- Bug in :func:`merge_asof` with ``left_index=True`` or ``right_index=True`` with mismatched index dtypes giving incorrect results in some cases instead of raising ``MergeError`` (:issue:`53870`)
819+
- Bug in :func:`merge` raising ``TypeError`` when merging on an integer ``ExtensionDtype`` and a float NumPy dtype (:issue:`46178`)
819820
- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns returning an incorrect type when a dict-like argument is passed in (:issue:`51099`)
820821
- Bug in :meth:`DataFrame.combine_first` ignoring other's columns if ``other`` is empty (:issue:`53792`)
821822
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)

doc/source/whatsnew/v2.2.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ Deprecations
153153

154154
Performance improvements
155155
~~~~~~~~~~~~~~~~~~~~~~~~
156-
-
156+
- Performance improvement when indexing with more than 4 keys (:issue:`54550`)
157157
-
158158

159159
.. ---------------------------------------------------------------------------

meson.build

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,19 @@
22
project(
33
'pandas',
44
'c', 'cpp', 'cython',
5-
version: run_command(['python', 'generate_version.py', '--print'], check: true).stdout().strip(),
5+
version: run_command(['python3', 'generate_version.py', '--print'], check: true).stdout().strip(),
66
license: 'BSD-3',
77
meson_version: '>=1.0.1',
88
default_options: [
9-
# TODO: investigate, does meson try to compile against debug Python
10-
# when buildtype = debug, this seems to be causing problems on CI
11-
# where provided Python is not compiled in debug mode
129
'buildtype=release',
1310
# TODO: Reactivate werror, some warnings on Windows
1411
#'werror=true',
1512
'c_std=c99'
1613
]
1714
)
1815

19-
py_mod = import('python')
2016
fs = import('fs')
21-
py = py_mod.find_installation('python')
22-
py_dep = py.dependency()
17+
py = import('python').find_installation()
2318
tempita = files('generate_pxi.py')
2419
versioneer = files('generate_version.py')
2520

pandas/_libs/index.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ cdef class IndexEngine:
375375
# map each starget to its position in the index
376376
if (
377377
stargets and
378-
len(stargets) < 5 and
378+
len(stargets) < (n / (2 * n.bit_length())) and
379379
not na_in_stargets and
380380
self.is_monotonic_increasing
381381
):

pandas/_libs/lib.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ def array_equivalent_object(
196196
right: npt.NDArray[np.object_],
197197
) -> bool: ...
198198
def has_infs(arr: np.ndarray) -> bool: ... # const floating[:]
199+
def has_only_ints_or_nan(arr: np.ndarray) -> bool: ... # const floating[:]
199200
def get_reverse_indexer(
200201
indexer: np.ndarray, # const intp_t[:]
201202
length: int,

pandas/_libs/lib.pyx

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,22 @@ def has_infs(floating[:] arr) -> bool:
532532
return ret
533533

534534

535+
@cython.boundscheck(False)
536+
@cython.wraparound(False)
537+
def has_only_ints_or_nan(floating[:] arr) -> bool:
538+
cdef:
539+
floating val
540+
intp_t i
541+
542+
for i in range(len(arr)):
543+
val = arr[i]
544+
if (val != val) or (val == <int64_t>val):
545+
continue
546+
else:
547+
return False
548+
return True
549+
550+
535551
def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len):
536552
cdef:
537553
Py_ssize_t i, n = len(indices)

pandas/_libs/window/meson.build

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ py.extension_module(
33
['aggregations.pyx'],
44
cython_args: ['-X always_allow_keywords=true'],
55
include_directories: [inc_np, inc_pd],
6-
dependencies: [py_dep],
76
subdir: 'pandas/_libs/window',
87
override_options : ['cython_language=cpp'],
98
install: true
@@ -14,7 +13,6 @@ py.extension_module(
1413
['indexers.pyx'],
1514
cython_args: ['-X always_allow_keywords=true'],
1615
include_directories: [inc_np, inc_pd],
17-
dependencies: [py_dep],
1816
subdir: 'pandas/_libs/window',
1917
install: true
2018
)

pandas/core/arrays/string_.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
missing as libmissing,
1616
)
1717
from pandas._libs.arrays import NDArrayBacked
18+
from pandas._libs.lib import ensure_string_array
1819
from pandas.compat import pa_version_under7p0
1920
from pandas.compat.numpy import function as nv
2021
from pandas.util._decorators import doc
@@ -224,7 +225,7 @@ def __from_arrow__(
224225
arr = np.array([], dtype=object)
225226
else:
226227
arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
227-
arr = lib.convert_nans_to_NA(arr)
228+
arr = ensure_string_array(arr, na_value=libmissing.NA)
228229
# Bypass validation inside StringArray constructor, see GH#47781
229230
new_string_array = StringArray.__new__(StringArray)
230231
NDArrayBacked.__init__(

pandas/core/arrays/string_arrow.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -554,3 +554,16 @@ def value_counts(self, dropna: bool = True):
554554
return Series(
555555
result._values.to_numpy(), index=result.index, name=result.name, copy=False
556556
)
557+
558+
def _reduce(
559+
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
560+
):
561+
if name in ["any", "all"]:
562+
arr = pc.and_kleene(
563+
pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "")
564+
)
565+
return ArrowExtensionArray(arr)._reduce(
566+
name, skipna=skipna, keepdims=keepdims, **kwargs
567+
)
568+
else:
569+
return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7279,7 +7279,7 @@ def value_counts(
72797279
subset = self.columns.tolist()
72807280

72817281
name = "proportion" if normalize else "count"
7282-
counts = self.groupby(subset, dropna=dropna).grouper.size()
7282+
counts = self.groupby(subset, dropna=dropna, observed=False).grouper.size()
72837283
counts.name = name
72847284

72857285
if sort:

pandas/core/internals/blocks.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from pandas._config import using_copy_on_write
1919

2020
from pandas._libs import (
21+
NaT,
2122
internals as libinternals,
2223
lib,
2324
writers,
@@ -59,7 +60,10 @@
5960
from pandas.core.dtypes.common import (
6061
ensure_platform_int,
6162
is_1d_only_ea_dtype,
63+
is_float_dtype,
64+
is_integer_dtype,
6265
is_list_like,
66+
is_scalar,
6367
is_string_dtype,
6468
)
6569
from pandas.core.dtypes.dtypes import (
@@ -453,6 +457,25 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
453457
and will receive the same block
454458
"""
455459
new_dtype = find_result_type(self.values.dtype, other)
460+
461+
# In a future version of pandas, the default will be that
462+
# setting `nan` into an integer series won't raise.
463+
if (
464+
is_scalar(other)
465+
and is_integer_dtype(self.values.dtype)
466+
and isna(other)
467+
and other is not NaT
468+
):
469+
warn_on_upcast = False
470+
elif (
471+
isinstance(other, np.ndarray)
472+
and other.ndim == 1
473+
and is_integer_dtype(self.values.dtype)
474+
and is_float_dtype(other.dtype)
475+
and lib.has_only_ints_or_nan(other)
476+
):
477+
warn_on_upcast = False
478+
456479
if warn_on_upcast:
457480
warnings.warn(
458481
f"Setting an item of incompatible dtype is deprecated "

pandas/core/reshape/merge.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
ensure_object,
5454
is_bool,
5555
is_bool_dtype,
56+
is_extension_array_dtype,
5657
is_float_dtype,
5758
is_integer,
5859
is_integer_dtype,
@@ -1385,6 +1386,21 @@ def _maybe_coerce_merge_keys(self) -> None:
13851386
if lk.dtype.kind == rk.dtype.kind:
13861387
continue
13871388

1389+
if is_extension_array_dtype(lk.dtype) and not is_extension_array_dtype(
1390+
rk.dtype
1391+
):
1392+
ct = find_common_type([lk.dtype, rk.dtype])
1393+
if is_extension_array_dtype(ct):
1394+
rk = ct.construct_array_type()._from_sequence(rk) # type: ignore[union-attr] # noqa: E501
1395+
else:
1396+
rk = rk.astype(ct) # type: ignore[arg-type]
1397+
elif is_extension_array_dtype(rk.dtype):
1398+
ct = find_common_type([lk.dtype, rk.dtype])
1399+
if is_extension_array_dtype(ct):
1400+
lk = ct.construct_array_type()._from_sequence(lk) # type: ignore[union-attr] # noqa: E501
1401+
else:
1402+
lk = lk.astype(ct) # type: ignore[arg-type]
1403+
13881404
# check whether ints and floats
13891405
if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
13901406
# GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int

pandas/tests/extension/test_string.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,11 @@ def test_fillna_no_op_returns_copy(self, data):
158158

159159
class TestReduce(base.BaseReduceTests):
160160
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
161-
return op_name in ["min", "max"]
161+
return (
162+
op_name in ["min", "max"]
163+
or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr]
164+
and op_name in ("any", "all")
165+
)
162166

163167

164168
class TestMethods(base.BaseMethodsTests):

pandas/tests/frame/indexing/test_indexing.py

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -337,18 +337,12 @@ def test_setitem(self, float_frame, using_copy_on_write):
337337
def test_setitem2(self):
338338
# dtype changing GH4204
339339
df = DataFrame([[0, 0]])
340-
with tm.assert_produces_warning(
341-
FutureWarning, match="Setting an item of incompatible dtype"
342-
):
343-
df.iloc[0] = np.nan
340+
df.iloc[0] = np.nan
344341
expected = DataFrame([[np.nan, np.nan]])
345342
tm.assert_frame_equal(df, expected)
346343

347344
df = DataFrame([[0, 0]])
348-
with tm.assert_produces_warning(
349-
FutureWarning, match="Setting an item of incompatible dtype"
350-
):
351-
df.loc[0] = np.nan
345+
df.loc[0] = np.nan
352346
tm.assert_frame_equal(df, expected)
353347

354348
def test_setitem_boolean(self, float_frame):
@@ -1579,9 +1573,7 @@ def test_setitem(self, uint64_frame):
15791573
# With NaN: because uint64 has no NaN element,
15801574
# the column should be cast to object.
15811575
df2 = df.copy()
1582-
with tm.assert_produces_warning(
1583-
FutureWarning, match="Setting an item of incompatible dtype"
1584-
):
1576+
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
15851577
df2.iloc[1, 1] = pd.NaT
15861578
df2.iloc[1, 2] = pd.NaT
15871579
result = df2["B"]
@@ -1901,19 +1893,19 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key):
19011893
class TestSetitemValidation:
19021894
# This is adapted from pandas/tests/arrays/masked/test_indexing.py
19031895
# but checks for warnings instead of errors.
1904-
def _check_setitem_invalid(self, df, invalid, indexer):
1896+
def _check_setitem_invalid(self, df, invalid, indexer, warn):
19051897
msg = "Setting an item of incompatible dtype is deprecated"
19061898
msg = re.escape(msg)
19071899

19081900
orig_df = df.copy()
19091901

19101902
# iloc
1911-
with tm.assert_produces_warning(FutureWarning, match=msg):
1903+
with tm.assert_produces_warning(warn, match=msg):
19121904
df.iloc[indexer, 0] = invalid
19131905
df = orig_df.copy()
19141906

19151907
# loc
1916-
with tm.assert_produces_warning(FutureWarning, match=msg):
1908+
with tm.assert_produces_warning(warn, match=msg):
19171909
df.loc[indexer, "a"] = invalid
19181910
df = orig_df.copy()
19191911

@@ -1934,16 +1926,20 @@ def _check_setitem_invalid(self, df, invalid, indexer):
19341926
@pytest.mark.parametrize("indexer", _indexers)
19351927
def test_setitem_validation_scalar_bool(self, invalid, indexer):
19361928
df = DataFrame({"a": [True, False, False]}, dtype="bool")
1937-
self._check_setitem_invalid(df, invalid, indexer)
1929+
self._check_setitem_invalid(df, invalid, indexer, FutureWarning)
19381930

19391931
@pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)])
19401932
@pytest.mark.parametrize("indexer", _indexers)
19411933
def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer):
19421934
df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype)
1943-
self._check_setitem_invalid(df, invalid, indexer)
1935+
if isna(invalid) and invalid is not pd.NaT:
1936+
warn = None
1937+
else:
1938+
warn = FutureWarning
1939+
self._check_setitem_invalid(df, invalid, indexer, warn)
19441940

19451941
@pytest.mark.parametrize("invalid", _invalid_scalars + [True])
19461942
@pytest.mark.parametrize("indexer", _indexers)
19471943
def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer):
19481944
df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype)
1949-
self._check_setitem_invalid(df, invalid, indexer)
1945+
self._check_setitem_invalid(df, invalid, indexer, FutureWarning)

pandas/tests/frame/methods/test_value_counts.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,17 @@ def test_data_frame_value_counts_subset(nulls_fixture, columns):
175175
)
176176

177177
tm.assert_series_equal(result, expected)
178+
179+
180+
def test_value_counts_categorical_future_warning():
181+
# GH#54775
182+
df = pd.DataFrame({"a": [1, 2, 3]}, dtype="category")
183+
result = df.value_counts()
184+
expected = pd.Series(
185+
1,
186+
index=pd.MultiIndex.from_arrays(
187+
[pd.Index([1, 2, 3], name="a", dtype="category")]
188+
),
189+
name="count",
190+
)
191+
tm.assert_series_equal(result, expected)

pandas/tests/indexing/test_indexing.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -830,8 +830,7 @@ def test_coercion_with_loc(self, expected):
830830
start_data, expected_result, warn = expected
831831

832832
start_dataframe = DataFrame({"foo": start_data})
833-
with tm.assert_produces_warning(warn, match="incompatible dtype"):
834-
start_dataframe.loc[0, ["foo"]] = None
833+
start_dataframe.loc[0, ["foo"]] = None
835834

836835
expected_dataframe = DataFrame({"foo": expected_result})
837836
tm.assert_frame_equal(start_dataframe, expected_dataframe)
@@ -841,8 +840,7 @@ def test_coercion_with_setitem_and_dataframe(self, expected):
841840
start_data, expected_result, warn = expected
842841

843842
start_dataframe = DataFrame({"foo": start_data})
844-
with tm.assert_produces_warning(warn, match="incompatible dtype"):
845-
start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
843+
start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
846844

847845
expected_dataframe = DataFrame({"foo": expected_result})
848846
tm.assert_frame_equal(start_dataframe, expected_dataframe)
@@ -852,10 +850,7 @@ def test_none_coercion_loc_and_dataframe(self, expected):
852850
start_data, expected_result, warn = expected
853851

854852
start_dataframe = DataFrame({"foo": start_data})
855-
with tm.assert_produces_warning(warn, match="incompatible dtype"):
856-
start_dataframe.loc[
857-
start_dataframe["foo"] == start_dataframe["foo"][0]
858-
] = None
853+
start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
859854

860855
expected_dataframe = DataFrame({"foo": expected_result})
861856
tm.assert_frame_equal(start_dataframe, expected_dataframe)
@@ -869,10 +864,7 @@ def test_none_coercion_mixed_dtypes(self):
869864
"d": ["a", "b", "c"],
870865
}
871866
)
872-
with tm.assert_produces_warning(
873-
FutureWarning, match="item of incompatible dtype"
874-
):
875-
start_dataframe.iloc[0] = None
867+
start_dataframe.iloc[0] = None
876868

877869
exp = DataFrame(
878870
{

0 commit comments

Comments
 (0)