diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
index df8ac3f3b0696..2447d456b7478 100644
--- a/pandas/_libs/algos.pyi
+++ b/pandas/_libs/algos.pyi
@@ -61,52 +61,39 @@ def nancorr_spearman(
 
 # ----------------------------------------------------------------------
 
-# ctypedef fused algos_t:
-#     float64_t
-#     float32_t
-#     object
-#     int64_t
-#     int32_t
-#     int16_t
-#     int8_t
-#     uint64_t
-#     uint32_t
-#     uint16_t
-#     uint8_t
-
 def validate_limit(nobs: int | None, limit=...) -> int: ...
 def pad(
-    old: np.ndarray,  # ndarray[algos_t]
-    new: np.ndarray,  # ndarray[algos_t]
+    old: np.ndarray,  # ndarray[numeric_object_t]
+    new: np.ndarray,  # ndarray[numeric_object_t]
     limit=...,
 ) -> npt.NDArray[np.intp]: ...  # np.ndarray[np.intp, ndim=1]
 def pad_inplace(
-    values: np.ndarray,  # algos_t[:]
+    values: np.ndarray,  # numeric_object_t[:]
     mask: np.ndarray,  # uint8_t[:]
     limit=...,
 ) -> None: ...
 def pad_2d_inplace(
-    values: np.ndarray,  # algos_t[:, :]
+    values: np.ndarray,  # numeric_object_t[:, :]
     mask: np.ndarray,  # const uint8_t[:, :]
     limit=...,
 ) -> None: ...
 def backfill(
-    old: np.ndarray,  # ndarray[algos_t]
-    new: np.ndarray,  # ndarray[algos_t]
+    old: np.ndarray,  # ndarray[numeric_object_t]
+    new: np.ndarray,  # ndarray[numeric_object_t]
     limit=...,
 ) -> npt.NDArray[np.intp]: ...  # np.ndarray[np.intp, ndim=1]
 def backfill_inplace(
-    values: np.ndarray,  # algos_t[:]
+    values: np.ndarray,  # numeric_object_t[:]
     mask: np.ndarray,  # uint8_t[:]
     limit=...,
 ) -> None: ...
 def backfill_2d_inplace(
-    values: np.ndarray,  # algos_t[:, :]
+    values: np.ndarray,  # numeric_object_t[:, :]
     mask: np.ndarray,  # const uint8_t[:, :]
     limit=...,
 ) -> None: ...
 def is_monotonic(
-    arr: np.ndarray,  # ndarray[algos_t, ndim=1]
+    arr: np.ndarray,  # ndarray[numeric_object_t, ndim=1]
     timelike: bool,
 ) -> tuple[bool, bool, bool]: ...
@@ -114,14 +101,8 @@ def is_monotonic(
 # rank_1d, rank_2d
 # ----------------------------------------------------------------------
 
-# ctypedef fused rank_t:
-#     object
-#     float64_t
-#     uint64_t
-#     int64_t
-
 def rank_1d(
-    values: np.ndarray,  # ndarray[rank_t, ndim=1]
+    values: np.ndarray,  # ndarray[iu_64_floating_obj_t, ndim=1]
     labels: np.ndarray | None = ...,  # const int64_t[:]=None
     is_datetimelike: bool = ...,
     ties_method=...,
@@ -130,7 +111,7 @@
     na_option=...,
 ) -> np.ndarray: ...  # np.ndarray[float64_t, ndim=1]
 def rank_2d(
-    in_arr: np.ndarray,  # ndarray[rank_t, ndim=2]
+    in_arr: np.ndarray,  # ndarray[iu_64_floating_obj_t, ndim=2]
     axis: int = ...,
     is_datetimelike: bool = ...,
     ties_method=...,
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index c0d1405e92518..dc89866fc7b7e 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -475,19 +475,16 @@ def group_any_all(int8_t[:, ::1] out,
 # group_add, group_prod, group_var, group_mean, group_ohlc
 # ----------------------------------------------------------------------
 
-ctypedef fused add_t:
-    float64_t
-    float32_t
-    complex64_t
-    complex128_t
-    object
-
 ctypedef fused mean_t:
     float64_t
     float32_t
     complex64_t
     complex128_t
 
+ctypedef fused add_t:
+    mean_t
+    object
+
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 6e97c13c644cf..bbc17c4cb5415 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -27,6 +27,7 @@ cnp.import_array()
 
 from pandas._libs cimport util
+from pandas._libs.dtypes cimport numeric_object_t
 from pandas._libs.khash cimport (
     KHASH_TRACE_DOMAIN,
     are_equivalent_float32_t,
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index be57d2cd5f844..6f18ea111db50 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -301,19 +301,9 @@ cdef mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
 
 
 ctypedef fused htfunc_t:
+    numeric_object_t
     complex128_t
     complex64_t
-    float64_t
-    float32_t
-    uint64_t
-    uint32_t
-    uint16_t
-    uint8_t
-    int64_t
-    int32_t
-    int16_t
-    int8_t
-    object
 
 
 cpdef value_count(ndarray[htfunc_t] values, bint dropna):
diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py
index 6214693f22975..0e5be675f8fcd 100644
--- a/pandas/core/arrays/_arrow_utils.py
+++ b/pandas/core/arrays/_arrow_utils.py
@@ -6,7 +6,7 @@
 from pandas.core.arrays.interval import VALID_CLOSED
 
 
-def pyarrow_array_to_numpy_and_mask(arr, dtype):
+def pyarrow_array_to_numpy_and_mask(arr, dtype: np.dtype):
     """
     Convert a primitive pyarrow.Array to a numpy array and boolean mask based
     on the buffers of the Array.
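
The annotation change above tightens `pyarrow_array_to_numpy_and_mask` to expect an actual `np.dtype` instance; the call sites updated later in this diff (`numeric.py` and `dtypes.py`) construct one explicitly instead of passing a string or the masked dtype's scalar type. A minimal sketch of what those call sites rely on, using the nullable `Int64Dtype` as an example (illustration only, not part of the patch):

```python
import numpy as np
import pandas as pd

# Strings and scalar types normalize to the same np.dtype, but the annotation
# now asks callers to do that conversion explicitly.
assert np.dtype("int64") == np.dtype(np.int64)

# The masked-dtype caller can hand over its numpy_dtype property directly,
# which is what the numeric.py change does.
assert pd.Int64Dtype().numpy_dtype == np.dtype(np.int64)
```
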
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 3446d5fc43a65..8b252af2a73f7 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -313,11 +313,12 @@ def fillna(
                 # TODO: check value is None
                 # (for now) when self.ndim == 2, we assume axis=0
                 func = missing.get_fill_func(method, ndim=self.ndim)
-                new_values, _ = func(self._ndarray.T.copy(), limit=limit, mask=mask.T)
-                new_values = new_values.T
+                npvalues = self._ndarray.T.copy()
+                func(npvalues, limit=limit, mask=mask.T)
+                npvalues = npvalues.T
 
                 # TODO: PandasArray didn't used to copy, need tests for this
-                new_values = self._from_backing_data(new_values)
+                new_values = self._from_backing_data(npvalues)
             else:
                 # fill with value
                 new_values = self.copy()
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 7c0daeecafaf7..ac5f38800b510 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -789,8 +789,9 @@ def fillna(
         if mask.any():
             if method is not None:
                 func = missing.get_fill_func(method)
-                new_values, _ = func(self.astype(object), limit=limit, mask=mask)
-                new_values = self._from_sequence(new_values, dtype=self.dtype)
+                npvalues = self.astype(object)
+                func(npvalues, limit=limit, mask=mask)
+                new_values = self._from_sequence(npvalues, dtype=self.dtype)
             else:
                 # fill with value
                 new_values = self.copy()
@@ -1397,7 +1398,8 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
     __hash__: None  # type: ignore[assignment]
 
     # ------------------------------------------------------------------------
-    # Non-Optimized Default Methods
+    # Non-Optimized Default Methods; in the case of the private methods here,
+    # these are not guaranteed to be stable across pandas versions.
 
     def tolist(self) -> list:
         """
@@ -1510,10 +1512,11 @@ def _fill_mask_inplace(
         ExtensionArray.fillna
         """
         func = missing.get_fill_func(method)
+        npvalues = self.astype(object)
         # NB: if we don't copy mask here, it may be altered inplace, which
         #  would mess up the `self[mask] = ...` below.
-        new_values, _ = func(self.astype(object), limit=limit, mask=mask.copy())
-        new_values = self._from_sequence(new_values, dtype=self.dtype)
+        func(npvalues, limit=limit, mask=mask.copy())
+        new_values = self._from_sequence(npvalues, dtype=self.dtype)
         self[mask] = new_values[mask]
         return
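
The `fillna` and `_fill_mask_inplace` changes above stop unpacking a `(new_values, mask)` return and instead call the fill function only for its in-place effect on the passed array (and mask). A self-contained sketch of that calling convention, using a toy forward-fill stand-in rather than the real `pad_inplace`:

```python
import numpy as np

def pad_inplace_demo(values: np.ndarray, mask: np.ndarray, limit=None) -> None:
    """Toy stand-in: forward-fill ``values`` where ``mask`` is True, mutating both."""
    count = 0
    for i in range(1, len(values)):
        if mask[i]:
            if limit is None or count < limit:
                values[i] = values[i - 1]
                mask[i] = False  # this slot is no longer missing
                count += 1
        else:
            count = 0

values = np.array([1.0, np.nan, np.nan, 4.0])
mask = np.isnan(values)

# old style:  new_values, _ = func(values, limit=limit, mask=mask)
# new style:  mutate a copy in place and keep using it afterwards
npvalues = values.copy()
pad_inplace_demo(npvalues, mask, limit=1)
# npvalues -> [1.0, 1.0, nan, 4.0]; mask still flags the slot the limit skipped
```
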
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 21f44dbc6a1cd..649a9f6e8e081 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -174,12 +174,10 @@ def fillna(
         if mask.any():
             if method is not None:
                 func = missing.get_fill_func(method, ndim=self.ndim)
-                new_values, new_mask = func(
-                    self._data.copy().T,
-                    limit=limit,
-                    mask=mask.copy().T,
-                )
-                return type(self)(new_values.T, new_mask.view(np.bool_).T)
+                npvalues = self._data.copy().T
+                new_mask = mask.copy().T
+                func(npvalues, limit=limit, mask=new_mask)
+                return type(self)(npvalues.T, new_mask.T)
             else:
                 # fill with value
                 new_values = self.copy()
diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py
index 958c9f7b0b3f1..4e5f4563f791e 100644
--- a/pandas/core/arrays/numeric.py
+++ b/pandas/core/arrays/numeric.py
@@ -95,7 +95,7 @@ def __from_arrow__(
 
         results = []
         for arr in chunks:
-            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
+            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.numpy_dtype)
             num_arr = array_class(data.copy(), ~mask, copy=False)
             results.append(num_arr)
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 487864adc03cd..322da6a1e4ece 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -1006,7 +1006,7 @@ def __from_arrow__(
 
         results = []
         for arr in chunks:
-            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64")
+            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=np.dtype(np.int64))
             parr = PeriodArray(data.copy(), freq=self.freq, copy=False)
             parr[~mask] = NaT
             results.append(parr)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index dbff541e9568b..309b6a86be095 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3687,6 +3687,7 @@ def _reindex_output(
         index, _ = MultiIndex.from_product(levels_list, names=names).sortlevel()
 
         if self.as_index:
+            # Always holds for SeriesGroupBy unless GH#36507 is implemented
             d = {
                 self.obj._get_axis_name(self.axis): index,
                 "copy": False,
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index af60192676597..e71e56574d766 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -680,7 +680,7 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
         elif isinstance(self.grouping_vector, ops.BaseGrouper):
             # we have a list of groupers
             codes = self.grouping_vector.codes_info
-            uniques = self.grouping_vector.result_arraylike
+            uniques = self.grouping_vector.result_index._values
         else:
             # GH35667, replace dropna=False with na_sentinel=None
             if not self._dropna:
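
With `result_arraylike` removed from `ops.py` just below, `grouper.py` now reaches the uniques through `result_index._values`, which already gives back the underlying ndarray or ExtensionArray. A small illustration of that property (example data only, not from the patch):

```python
import numpy as np
import pandas as pd

# An Index backed by an ExtensionArray exposes that array via ._values ...
idx = pd.Index(pd.Categorical(["a", "b", "a"]))
assert not isinstance(idx._values, np.ndarray)  # Categorical ExtensionArray

# ... while a plain Index hands back its ndarray.
idx2 = pd.Index([1, 2, 3])
assert isinstance(idx2._values, np.ndarray)
```
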
- """ - # TODO(ExtensionIndex): once Index supports arbitrary EAs, this can - # be removed in favor of result_index - if len(self.groupings) == 1: - return self.groupings[0].group_arraylike - - # result_index is MultiIndex - return self.result_index._values - @cache_readonly def result_index(self) -> Index: if len(self.groupings) == 1: @@ -1147,7 +1132,7 @@ def reconstructed_codes(self) -> list[np.ndarray]: return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]] @cache_readonly - def result_index(self): + def result_index(self) -> Index: if len(self.binlabels) != 0 and isna(self.binlabels[0]): return self.binlabels[1:] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9488e15920039..da3a717cda55d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -253,6 +253,8 @@ def get_new_values(self, values, fill_value=None): else: if isinstance(dtype, ExtensionDtype): # GH#41875 + # We are assuming that fill_value can be held by this dtype, + # unlike the non-EA case that promotes. cls = dtype.construct_array_type() new_values = cls._empty(result_shape, dtype=dtype) new_values[:] = fill_value diff --git a/pandas/core/series.py b/pandas/core/series.py index efbf8ecf74f1f..29b9428e483de 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5027,7 +5027,7 @@ def _replace_single(self, to_replace, method: str, inplace: bool, limit): values._fill_mask_inplace(method, limit, mask) else: fill_f = missing.get_fill_func(method) - values, _ = fill_f(values, limit=limit, mask=mask) + fill_f(values, limit=limit, mask=mask) if inplace: return