Description
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- (optional) I have confirmed this bug exists on the master branch of pandas.
The Posix checks are failing on master, affecting all PRs. If you take a look at any recent PRs, e.g. #42690, #42705, #42809, and #42701, you can see that they had 24/25 checks successful, and it is the same check failing for all of them. Needs discussion.
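For convenience, here is a minimal reproduction distilled from the failing test (`pandas/tests/resample/test_resample_api.py::test_agg`, shown in full in the traceback below). It simply replays the test's setup and aggregation loop, so on an affected master build the same `TypeError` should surface; which of the four cases raises first is not pinned down by the traceback:

```python
from datetime import datetime

import numpy as np
import pandas as pd

index = pd.date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
index.name = "date"
df = pd.DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
df_col = df.reset_index()
df_mult = df_col.copy()
df_mult.index = pd.MultiIndex.from_arrays(
    [range(10), df.index], names=["index", "date"]
)

# The same four Resampler/TimeGrouper flavours the test exercises.
cases = [
    df.resample("2D"),
    df_col.resample("2D", on="date"),
    df_mult.resample("2D", level="date"),
    df.groupby(pd.Grouper(freq="2D")),
]
for t in cases:
    # On an affected build this raises:
    #   TypeError: Cannot cast DatetimeArray to dtype float64
    t.aggregate([np.mean, np.std])
```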
```
=================================== FAILURES ===================================
___________________________________ test_agg ___________________________________
[gw1] linux -- Python 3.9.5 /usr/share/miniconda/envs/pandas-dev/bin/python
self = <pandas.core.groupby.generic.SeriesGroupBy object at 0x7f1fdee50760>
how = 'group_var', cython_dtype = dtype('float64'), aggregate = True
numeric_only = False, needs_counts = True, needs_values = True, needs_2d = True
needs_nullable = False, min_count = None, needs_mask = False
needs_ngroups = False, result_is_index = False, pre_processing = None
post_processing = <function GroupBy.std.<locals>.<lambda> at 0x7f1fdee52040>
fill_value = None, kwargs = {'ddof': 1}
grouper = <pandas.core.groupby.ops.BinGrouper object at 0x7f1fdee4e430>
ids = array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), _ = array([0, 1, 2, 3, 4])
output = {}
blk_func = <function GroupBy._get_cythonized_result.<locals>.blk_func at 0x7f1fdee52310>
error_msg = 'Cannot cast DatetimeArray to dtype float64', idx = 0
@final
def _get_cythonized_result(
self,
how: str,
cython_dtype: np.dtype,
aggregate: bool = False,
numeric_only: bool | lib.NoDefault = lib.no_default,
needs_counts: bool = False,
needs_values: bool = False,
needs_2d: bool = False,
needs_nullable: bool = False,
min_count: int | None = None,
needs_mask: bool = False,
needs_ngroups: bool = False,
result_is_index: bool = False,
pre_processing=None,
post_processing=None,
fill_value=None,
**kwargs,
):
"""
Get result for Cythonized functions.
Parameters
----------
how : str, Cythonized function name to be called
cython_dtype : np.dtype
Type of the array that will be modified by the Cython call.
aggregate : bool, default False
Whether the result should be aggregated to match the number of
groups
numeric_only : bool, default True
Whether only numeric datatypes should be computed
needs_counts : bool, default False
Whether the counts should be a part of the Cython call
needs_values : bool, default False
Whether the values should be a part of the Cython call
signature
needs_2d : bool, default False
Whether the values and result of the Cython call signature
are 2-dimensional.
min_count : int, default None
When not None, min_count for the Cython call
needs_mask : bool, default False
Whether boolean mask needs to be part of the Cython call
signature
needs_ngroups : bool, default False
Whether number of groups is part of the Cython call signature
needs_nullable : bool, default False
Whether a bool specifying if the input is nullable is part
of the Cython call signature
result_is_index : bool, default False
Whether the result of the Cython operation is an index of
values to be retrieved, instead of the actual values themselves
pre_processing : function, default None
Function to be applied to `values` prior to passing to Cython.
Function should return a tuple where the first element is the
values to be passed to Cython and the second element is an optional
type which the values should be converted to after being returned
by the Cython operation. This function is also responsible for
raising a TypeError if the values have an invalid type. Raises
if `needs_values` is False.
post_processing : function, default None
Function to be applied to result of Cython function. Should accept
an array of values as the first argument and type inferences as its
second argument, i.e. the signature should be
(ndarray, Type). If `needs_nullable=True`, a third argument should be
`nullable`, to allow for processing specific to nullable values.
fill_value : any, default None
The scalar value to use for newly introduced missing values.
**kwargs : dict
Extra arguments to be passed back to Cython funcs
Returns
-------
`Series` or `DataFrame` with filled values
"""
numeric_only = self._resolve_numeric_only(numeric_only)
if result_is_index and aggregate:
raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
if post_processing and not callable(post_processing):
raise ValueError("'post_processing' must be a callable!")
if pre_processing:
if not callable(pre_processing):
raise ValueError("'pre_processing' must be a callable!")
if not needs_values:
raise ValueError(
"Cannot use 'pre_processing' without specifying 'needs_values'!"
)
grouper = self.grouper
ids, _, ngroups = grouper.group_info
output: dict[base.OutputKey, ArrayLike] = {}
base_func = getattr(libgroupby, how)
base_func = partial(base_func, labels=ids)
if needs_ngroups:
base_func = partial(base_func, ngroups=ngroups)
if min_count is not None:
base_func = partial(base_func, min_count=min_count)
def blk_func(values: ArrayLike) -> ArrayLike:
if aggregate:
result_sz = ngroups
else:
result_sz = len(values)
result: ArrayLike
result = np.zeros(result_sz, dtype=cython_dtype)
if needs_2d:
result = result.reshape((-1, 1))
func = partial(base_func, out=result)
inferences = None
if needs_counts:
counts = np.zeros(self.ngroups, dtype=np.int64)
func = partial(func, counts=counts)
if needs_values:
vals = values
if pre_processing:
vals, inferences = pre_processing(vals)
vals = vals.astype(cython_dtype, copy=False)
if needs_2d:
vals = vals.reshape((-1, 1))
func = partial(func, values=vals)
if needs_mask:
mask = isna(values).view(np.uint8)
func = partial(func, mask=mask)
if needs_nullable:
is_nullable = isinstance(values, BaseMaskedArray)
func = partial(func, nullable=is_nullable)
func(**kwargs) # Call func to modify indexer values in place
if needs_2d:
result = result.reshape(-1)
if result_is_index:
result = algorithms.take_nd(values, result, fill_value=fill_value)
if post_processing:
pp_kwargs = {}
if needs_nullable:
pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray)
result = post_processing(result, inferences, **pp_kwargs)
return result
error_msg = ""
for idx, obj in enumerate(self._iterate_slices()):
values = obj._values
if numeric_only and not is_numeric_dtype(values.dtype):
continue
try:
> result = blk_func(values)
pandas/core/groupby/groupby.py:2972:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
values = <DatetimeArray>
['2005-01-01 00:00:00', '2005-01-02 00:00:00', '2005-01-03 00:00:00',
'2005-01-04 00:00:00', '2005-01...1-07 00:00:00', '2005-01-08 00:00:00', '2005-01-09 00:00:00',
'2005-01-10 00:00:00']
Length: 10, dtype: datetime64[ns]
def blk_func(values: ArrayLike) -> ArrayLike:
if aggregate:
result_sz = ngroups
else:
result_sz = len(values)
result: ArrayLike
result = np.zeros(result_sz, dtype=cython_dtype)
if needs_2d:
result = result.reshape((-1, 1))
func = partial(base_func, out=result)
inferences = None
if needs_counts:
counts = np.zeros(self.ngroups, dtype=np.int64)
func = partial(func, counts=counts)
if needs_values:
vals = values
if pre_processing:
vals, inferences = pre_processing(vals)
> vals = vals.astype(cython_dtype, copy=False)
pandas/core/groupby/groupby.py:2934:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <DatetimeArray>
['2005-01-01 00:00:00', '2005-01-02 00:00:00', '2005-01-03 00:00:00',
'2005-01-04 00:00:00', '2005-01...1-07 00:00:00', '2005-01-08 00:00:00', '2005-01-09 00:00:00',
'2005-01-10 00:00:00']
Length: 10, dtype: datetime64[ns]
dtype = dtype('float64'), copy = False
def astype(self, dtype, copy: bool = True):
# We handle
# --> datetime
# --> period
# DatetimeLikeArrayMixin Super handles the rest.
dtype = pandas_dtype(dtype)
if is_dtype_equal(dtype, self.dtype):
if copy:
return self.copy()
return self
elif is_datetime64_ns_dtype(dtype):
return astype_dt64_to_dt64tz(self, dtype, copy, via_utc=False)
elif self.tz is None and is_datetime64_dtype(dtype) and dtype != self.dtype:
# unit conversion e.g. datetime64[s]
return self._ndarray.astype(dtype)
elif is_period_dtype(dtype):
return self.to_period(freq=dtype.freq)
> return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy)
pandas/core/arrays/datetimes.py:651:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <DatetimeArray>
['2005-01-01 00:00:00', '2005-01-02 00:00:00', '2005-01-03 00:00:00',
'2005-01-04 00:00:00', '2005-01...1-07 00:00:00', '2005-01-08 00:00:00', '2005-01-09 00:00:00',
'2005-01-10 00:00:00']
Length: 10, dtype: datetime64[ns]
dtype = dtype('float64'), copy = False
def astype(self, dtype, copy: bool = True):
# Some notes on cases we don't have to handle here in the base class:
# 1. PeriodArray.astype handles period -> period
# 2. DatetimeArray.astype handles conversion between tz.
# 3. DatetimeArray.astype handles datetime -> period
dtype = pandas_dtype(dtype)
if is_object_dtype(dtype):
return self._box_values(self.asi8.ravel()).reshape(self.shape)
elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
if is_extension_array_dtype(dtype):
arr_cls = dtype.construct_array_type()
return arr_cls._from_sequence(self, dtype=dtype, copy=copy)
else:
return self._format_native_types()
elif is_integer_dtype(dtype):
# we deliberately ignore int32 vs. int64 here.
# See https://github.com/pandas-dev/pandas/issues/24381 for more.
level = find_stack_level()
warnings.warn(
f"casting {self.dtype} values to int64 with .astype(...) is "
"deprecated and will raise in a future version. "
"Use .view(...) instead.",
FutureWarning,
stacklevel=level,
)
values = self.asi8
if is_unsigned_integer_dtype(dtype):
# Again, we ignore int32 vs. int64
values = values.view("uint64")
if copy:
values = values.copy()
return values
elif (
is_datetime_or_timedelta_dtype(dtype)
and not is_dtype_equal(self.dtype, dtype)
) or is_float_dtype(dtype):
# disallow conversion between datetime/timedelta,
# and conversions for any datetimelike to float
msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
> raise TypeError(msg)
E TypeError: Cannot cast DatetimeArray to dtype float64
pandas/core/arrays/datetimelike.py:424: TypeError
During handling of the above exception, another exception occurred:
def test_agg():
# test with all three Resampler apis and TimeGrouper
np.random.seed(1234)
index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
index.name = "date"
df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
df_col = df.reset_index()
df_mult = df_col.copy()
df_mult.index = pd.MultiIndex.from_arrays(
[range(10), df.index], names=["index", "date"]
)
r = df.resample("2D")
cases = [
r,
df_col.resample("2D", on="date"),
df_mult.resample("2D", level="date"),
df.groupby(pd.Grouper(freq="2D")),
]
a_mean = r["A"].mean()
a_std = r["A"].std()
a_sum = r["A"].sum()
b_mean = r["B"].mean()
b_std = r["B"].std()
b_sum = r["B"].sum()
expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]])
for t in cases:
> result = t.aggregate([np.mean, np.std])
pandas/tests/resample/test_resample_api.py:352:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pandas/core/resample.py:331: in aggregate
result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
pandas/core/apply.py:164: in agg
return self.agg_list_like()
pandas/core/apply.py:355: in agg_list_like
new_res = colg.aggregate(arg)
pandas/core/groupby/generic.py:248: in aggregate
ret = self._aggregate_multiple_funcs(func)
pandas/core/groupby/generic.py:302: in _aggregate_multiple_funcs
results[key] = self.aggregate(func)
pandas/core/groupby/generic.py:258: in aggregate
return getattr(self, cyfunc)()
pandas/core/groupby/groupby.py:1728: in std
return self._get_cythonized_result(
```
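The root cause visible above is the unconditional `vals.astype(cython_dtype, copy=False)` inside `blk_func`: once a datetime column reaches `_get_cythonized_result` with `how='group_var'`, it is cast to `float64`, which `DatetimeLikeArrayMixin.astype` explicitly disallows. The failing cast can be exercised in isolation (a minimal sketch, independent of the resample machinery):

```python
import numpy as np
import pandas as pd

arr = pd.date_range("2005-01-01", periods=3).array  # a DatetimeArray
try:
    arr.astype(np.float64, copy=False)  # the cast blk_func performs
except TypeError as exc:
    print(exc)  # Cannot cast DatetimeArray to dtype float64
```

This suggests the discussion belongs around how `_get_cythonized_result` handles (or skips) datetimelike blocks, rather than around `astype` itself.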