From b08af2020b549802888c84a3bed4b7833c835bf3 Mon Sep 17 00:00:00 2001
From: codamuse
Date: Tue, 8 Nov 2022 22:04:13 -0500
Subject: [PATCH 01/39] CLN: deleted function

---
 pandas/core/indexes/multi.py | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 1e255241cf222..41aefd46e0af6 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1457,32 +1457,6 @@ def _set_names(self, names, *, level=None, validate: bool = True):

     # --------------------------------------------------------------------

-    @doc(Index._get_grouper_for_level)
-    def _get_grouper_for_level(
-        self,
-        mapper,
-        *,
-        level=None,
-        dropna: bool = True,
-    ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]:
-        if mapper is not None:
-            indexer = self.codes[level]
-            # Handle group mapping function and return
-            level_values = self.levels[level].take(indexer)
-            grouper = level_values.map(mapper)
-            return grouper, None, None
-
-        values = self.get_level_values(level)
-        codes, uniques = algos.factorize(values, sort=True, use_na_sentinel=dropna)
-        assert isinstance(uniques, Index)
-
-        if self.levels[level]._can_hold_na:
-            grouper = uniques.take(codes, fill_value=True)
-        else:
-            grouper = uniques.take(codes)
-
-        return grouper, codes, uniques
-
     @cache_readonly
     def inferred_type(self) -> str:
         return "mixed"

From 57b212b2c722a12aa0cdedc699ec08568ef46332 Mon Sep 17 00:00:00 2001
From: codamuse
Date: Wed, 9 Nov 2022 20:35:32 -0500
Subject: [PATCH 02/39] raise NotImplementedError

---
 pandas/core/indexes/multi.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 41aefd46e0af6..4c12da80d04d4 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1457,6 +1457,17 @@ def _set_names(self, names, *, level=None, validate: bool = True):

     # --------------------------------------------------------------------

+    @doc(Index._get_grouper_for_level)
+    def _get_grouper_for_level(
+        self,
+        mapper,
+        *,
+        level=None,
+        dropna: bool = True,
+    ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]:
+        raise NotImplementedError
+
+
     @cache_readonly
     def inferred_type(self) -> str:
         return "mixed"

From 42c897da0a27f966750035c58f90006e27ec271e Mon Sep 17 00:00:00 2001
From: codamuse
Date: Wed, 9 Nov 2022 20:57:58 -0500
Subject: [PATCH 03/39] check for _is_multi in base class and @final
 _get_grouper_for_level

---
 pandas/core/indexes/base.py  | 4 ++++
 pandas/core/indexes/multi.py | 11 -----------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 4da8d1a11c607..96fa019cf4d23 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -2106,6 +2106,7 @@ def _drop_level_numbers(self, levnums: list[int]):
             verify_integrity=False,
         )

+    @final
     def _get_grouper_for_level(
         self,
         mapper,
@@ -2134,6 +2135,9 @@ def _get_grouper_for_level(
         uniques : Index or None
             Index of unique values for level.
""" + if self._is_multi: + raise NotImplementedError("Index grouper by level isn't supported for MultiIndex") + assert level is None or level == 0 if mapper is None: grouper = self diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4c12da80d04d4..41aefd46e0af6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1457,17 +1457,6 @@ def _set_names(self, names, *, level=None, validate: bool = True): # -------------------------------------------------------------------- - @doc(Index._get_grouper_for_level) - def _get_grouper_for_level( - self, - mapper, - *, - level=None, - dropna: bool = True, - ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]: - raise NotImplementedError - - @cache_readonly def inferred_type(self) -> str: return "mixed" From 1d9f33ef9bfee0c1244938a8381a9403b5e351ab Mon Sep 17 00:00:00 2001 From: codamuse Date: Wed, 9 Nov 2022 21:00:09 -0500 Subject: [PATCH 04/39] tweak NotImplementedError message --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 96fa019cf4d23..950ba800cb589 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2136,7 +2136,7 @@ def _get_grouper_for_level( Index of unique values for level. """ if self._is_multi: - raise NotImplementedError("Index grouper by level isn't supported for MultiIndex") + raise NotImplementedError("Index grouper isn't supported for MultiIndex") assert level is None or level == 0 if mapper is None: From 86773520acaba6857dc0d14e716ba66a9b3ac986 Mon Sep 17 00:00:00 2001 From: Thierry Moisan Date: Wed, 9 Nov 2022 04:08:56 -0500 Subject: [PATCH 05/39] STYLE: fix pylint useless-else-on-loop warnings (#49595) --- pandas/core/internals/array_manager.py | 3 +-- pyproject.toml | 1 - scripts/sync_flake8_versions.py | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index f6e50d658a580..d325e5e9b92cc 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -697,8 +697,7 @@ def _equal_values(self, other) -> bool: for left, right in zip(self.arrays, other.arrays): if not array_equals(left, right): return False - else: - return True + return True # TODO # to_dict diff --git a/pyproject.toml b/pyproject.toml index 55d20f4e1eaa0..04fd0d173d044 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,7 +169,6 @@ disable = [ "unused-import", "unused-variable", "using-constant-test", - "useless-else-on-loop", "useless-parent-delegation" ] diff --git a/scripts/sync_flake8_versions.py b/scripts/sync_flake8_versions.py index d19501051349f..8852634c5d796 100644 --- a/scripts/sync_flake8_versions.py +++ b/scripts/sync_flake8_versions.py @@ -55,8 +55,7 @@ def _get_repo_hook(repos: Sequence[Repo], hook_name: str) -> tuple[Repo, YamlMap for hook in repo["hooks"]: if hook["id"] == hook_name: return repo, hook - else: # pragma: no cover - raise RuntimeError(f"Repo with hook {hook_name} not found") + raise RuntimeError(f"Repo with hook {hook_name} not found") # pragma: no cover def _conda_to_pip_compat(dep): From 5cbe002546629302c91a3a2d4d299942cbd81e9a Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Wed, 9 Nov 2022 11:46:40 +0100 Subject: [PATCH 06/39] STYLE enable pylint: chained-comparison (#49586) --- pandas/core/algorithms.py | 2 +- 
pandas/core/arrays/_ranges.py | 4 ++-- pandas/core/construction.py | 2 +- pandas/plotting/_matplotlib/misc.py | 4 ++-- pandas/tests/io/formats/test_info.py | 2 +- pyproject.toml | 1 - setup.py | 5 +++-- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7494a8a54f9bb..1e112b61d0e62 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1247,7 +1247,7 @@ def compute(self, method: str) -> Series: inds = inds[:n] findex = nbase else: - if len(inds) < nbase and len(nan_index) + len(inds) >= nbase: + if len(inds) < nbase <= len(nan_index) + len(inds): findex = len(nan_index) + len(inds) else: findex = len(inds) diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 3bef3e59d5687..f50ce8a9ea895 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -121,12 +121,12 @@ def _generate_range_overflow_safe( return _generate_range_overflow_safe_signed(endpoint, periods, stride, side) elif (endpoint > 0 and side == "start" and stride > 0) or ( - endpoint < 0 and side == "end" and stride > 0 + endpoint < 0 < stride and side == "end" ): # no chance of not-overflowing raise OutOfBoundsDatetime(msg) - elif side == "end" and endpoint > i64max and endpoint - stride <= i64max: + elif side == "end" and endpoint - stride <= i64max < endpoint: # in _generate_regular_range we added `stride` thereby overflowing # the bounds. Adjust to fix this. return _generate_range_overflow_safe( diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 997611d7860db..8fc77d47554dd 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -666,7 +666,7 @@ def range_to_ndarray(rng: range) -> np.ndarray: arr = np.arange(rng.start, rng.stop, rng.step, dtype="int64") except OverflowError: # GH#30173 handling for ranges that overflow int64 - if (rng.start >= 0 and rng.step > 0) or (rng.stop >= 0 and rng.step < 0): + if (rng.start >= 0 and rng.step > 0) or (rng.step < 0 <= rng.stop): try: arr = np.arange(rng.start, rng.stop, rng.step, dtype="uint64") except OverflowError: diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 2f482c7d86571..9c054b5f03198 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -201,7 +201,7 @@ def normalize(series): ax.text( xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small" ) - elif xy[0] < 0.0 and xy[1] >= 0.0: + elif xy[0] < 0.0 <= xy[1]: ax.text( xy[0] - 0.025, xy[1] + 0.025, @@ -210,7 +210,7 @@ def normalize(series): va="bottom", size="small", ) - elif xy[0] >= 0.0 and xy[1] < 0.0: + elif xy[1] < 0.0 <= xy[0]: ax.text( xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small" ) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 54b5e699cd034..33c78baa1eedc 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -116,7 +116,7 @@ def test_info_verbose_check_header_separator_body(): assert len(lines) > 0 for i, line in enumerate(lines): - if i >= start and i < start + size: + if start <= i < start + size: line_nr = f" {i - start} " assert line.startswith(line_nr) diff --git a/pyproject.toml b/pyproject.toml index 04fd0d173d044..9ba6d4d4068c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,7 +100,6 @@ disable = [ "wrong-import-position", # pylint type "R": refactor, for bad code smell - "chained-comparison", 
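The rewrites unlocked by removing this suppression all lean on Python's comparison chaining: `a < b <= c` means `a < b and b <= c`, except that `b` is evaluated only once. A small sketch of the `test_info.py` equivalence above, with illustrative values:

```python
start, size = 4, 10

for i in range(20):
    original = i >= start and i < start + size  # pre-patch form
    chained = start <= i < start + size         # post-patch form
    assert original == chained
```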
"comparison-with-itself", "consider-merging-isinstance", "consider-using-ternary", diff --git a/setup.py b/setup.py index f0a81779a90b6..05d4407c7e959 100755 --- a/setup.py +++ b/setup.py @@ -355,8 +355,9 @@ def run(self): target_macos_version = "10.9" parsed_macos_version = parse_version(target_macos_version) if ( - parse_version(str(python_target)) < parsed_macos_version - and parse_version(current_system) >= parsed_macos_version + parse_version(str(python_target)) + < parsed_macos_version + <= parse_version(current_system) ): os.environ["MACOSX_DEPLOYMENT_TARGET"] = target_macos_version From 6c0d3fb51e01d92dcb9cff170df785e4e4141edc Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 9 Nov 2022 10:17:09 -0500 Subject: [PATCH 07/39] PERF: df.groupby(categorical) (#49596) * Categorical.reorder_categories perf * whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/arrays/categorical.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d7ecfa0ca6e38..3316989f474d6 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -531,6 +531,7 @@ Performance improvements - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) +- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`) - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d58089af9a2a8..59d22c1e5275b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1019,7 +1019,10 @@ def reorder_categories(self, new_categories, ordered=None): remove_unused_categories : Remove categories which are not used. set_categories : Set the categories to the specified ones. """ - if set(self.dtype.categories) != set(new_categories): + if ( + len(self.categories) != len(new_categories) + or not self.categories.difference(new_categories).empty + ): raise ValueError( "items in new_categories are not the same as in old categories" ) From bbeef69705ef8097c8ee83934a79fd0cdaeb5d2a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Nov 2022 09:03:03 -0800 Subject: [PATCH 08/39] REF: tighter typing in constructor functions (#49591) --- pandas/core/construction.py | 38 ++++-------- pandas/core/frame.py | 2 +- pandas/core/internals/construction.py | 87 ++++++++++++++------------- 3 files changed, 58 insertions(+), 69 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 8fc77d47554dd..3ae509e74074e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -597,6 +597,15 @@ def sanitize_array( # e.g. test_constructor_floating_data_int_dtype # TODO: where is the discussion that documents the reason for this? 
subarr = np.array(data, copy=copy) + + elif dtype is None: + subarr = data + if data.dtype == object: + subarr = maybe_infer_to_datetimelike(data) + + if subarr is data and copy: + subarr = subarr.copy() + else: # we will try to copy by-definition here subarr = _try_cast(data, dtype, copy) @@ -754,7 +763,7 @@ def _maybe_repeat(arr: ArrayLike, index: Index | None) -> ArrayLike: def _try_cast( arr: list | np.ndarray, - dtype: np.dtype | None, + dtype: np.dtype, copy: bool, ) -> ArrayLike: """ @@ -764,7 +773,7 @@ def _try_cast( ---------- arr : ndarray or list Excludes: ExtensionArray, Series, Index. - dtype : np.dtype or None + dtype : np.dtype copy : bool If False, don't copy the data if not needed. @@ -774,30 +783,7 @@ def _try_cast( """ is_ndarray = isinstance(arr, np.ndarray) - if dtype is None: - # perf shortcut as this is the most common case - if is_ndarray: - arr = cast(np.ndarray, arr) - if arr.dtype != object: - if copy: - return arr.copy() - return arr - - out = maybe_infer_to_datetimelike(arr) - if out is arr and copy: - out = out.copy() - return out - - else: - # i.e. list - varr = np.array(arr, copy=False) - # filter out cases that we _dont_ want to go through - # maybe_infer_to_datetimelike - if varr.dtype != object or varr.size == 0: - return varr - return maybe_infer_to_datetimelike(varr) - - elif is_object_dtype(dtype): + if is_object_dtype(dtype): if not is_ndarray: subarr = construct_1d_object_array_from_listlike(arr) return subarr diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2407317ab86c4..5cf061b98680c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -738,7 +738,7 @@ def __init__( # For data is list-like, or Iterable (will consume into list) elif is_list_like(data): - if not isinstance(data, (abc.Sequence, ExtensionArray)): + if not isinstance(data, abc.Sequence): if hasattr(data, "__array__"): # GH#44616 big perf improvement for e.g. pytorch tensor data = np.asarray(data) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index c1745630602ab..761a641ccb2f7 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -53,9 +53,7 @@ ) from pandas.core.arrays import ( Categorical, - DatetimeArray, ExtensionArray, - TimedeltaArray, ) from pandas.core.construction import ( ensure_wrapped_if_datetimelike, @@ -277,14 +275,20 @@ def ndarray_to_mgr( return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ) - elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): - # i.e. Datetime64TZ, PeriodDtype + elif is_extension_array_dtype(vdtype): + # i.e. 
Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype) + # are already caught above values = extract_array(values, extract_numpy=True) if copy: values = values.copy() if values.ndim == 1: values = values.reshape(-1, 1) + elif isinstance(values, (np.ndarray, ExtensionArray, ABCSeries, Index)): + # drop subclass info + values = np.array(values, copy=copy_on_sanitize) + values = _ensure_2d(values) + else: # by definition an array here # the dtypes will be coerced to a single dtype @@ -496,51 +500,50 @@ def treat_as_nested(data) -> bool: # --------------------------------------------------------------------- -def _prep_ndarraylike( - values, copy: bool = True -) -> np.ndarray | DatetimeArray | TimedeltaArray: - if isinstance(values, TimedeltaArray) or ( - isinstance(values, DatetimeArray) and values.tz is None - ): - # By retaining DTA/TDA instead of unpacking, we end up retaining non-nano - pass - - elif not isinstance(values, (np.ndarray, ABCSeries, Index)): - if len(values) == 0: - return np.empty((0, 0), dtype=object) - elif isinstance(values, range): - arr = range_to_ndarray(values) - return arr[..., np.newaxis] - - def convert(v): - if not is_list_like(v) or isinstance(v, ABCDataFrame): - return v - - v = extract_array(v, extract_numpy=True) - res = maybe_convert_platform(v) - return res - - # we could have a 1-dim or 2-dim list here - # this is equiv of np.asarray, but does object conversion - # and platform dtype preservation - if is_list_like(values[0]): - values = np.array([convert(v) for v in values]) - elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: - # GH#21861 see test_constructor_list_of_lists - values = np.array([convert(v) for v in values]) - else: - values = convert(values) - +def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray: + # values is specifically _not_ ndarray, EA, Index, or Series + # We only get here with `not treat_as_nested(values)` + + if len(values) == 0: + return np.empty((0, 0), dtype=object) + elif isinstance(values, range): + arr = range_to_ndarray(values) + return arr[..., np.newaxis] + + def convert(v): + if not is_list_like(v) or isinstance(v, ABCDataFrame): + return v + + v = extract_array(v, extract_numpy=True) + res = maybe_convert_platform(v) + # We don't do maybe_infer_to_datetimelike here bc we will end up doing + # it column-by-column in ndarray_to_mgr + return res + + # we could have a 1-dim or 2-dim list here + # this is equiv of np.asarray, but does object conversion + # and platform dtype preservation + # does not convert e.g. [1, "a", True] to ["1", "a", "True"] like + # np.asarray would + if is_list_like(values[0]): + values = np.array([convert(v) for v in values]) + elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: + # GH#21861 see test_constructor_list_of_lists + values = np.array([convert(v) for v in values]) else: + values = convert(values) - # drop subclass info - values = np.array(values, copy=copy) + return _ensure_2d(values) + +def _ensure_2d(values: np.ndarray) -> np.ndarray: + """ + Reshape 1D values, raise on anything else other than 2D. + """ if values.ndim == 1: values = values.reshape((values.shape[0], 1)) elif values.ndim != 2: raise ValueError(f"Must pass 2-d input. 
shape={values.shape}") - return values From 9236857e39ae81451fc5a0bb61e4dfa7857c33f5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Nov 2022 09:05:07 -0800 Subject: [PATCH 09/39] CLN: assorted (#49590) --- asv_bench/benchmarks/tslibs/tslib.py | 2 +- pandas/core/arrays/_mixins.py | 14 +------------- pandas/core/arrays/categorical.py | 2 -- pandas/core/arrays/datetimelike.py | 16 +--------------- pandas/core/arrays/datetimes.py | 1 - pandas/core/arrays/period.py | 2 +- pandas/core/dtypes/cast.py | 15 ++++++--------- pandas/core/indexes/base.py | 8 +------- pandas/core/internals/blocks.py | 4 ---- pandas/plotting/_matplotlib/core.py | 6 +----- pandas/tests/apply/test_frame_transform.py | 10 ++++++++++ pandas/tests/apply/test_series_apply.py | 2 ++ pandas/tests/arithmetic/test_datetime64.py | 1 - pandas/tests/arithmetic/test_timedelta64.py | 1 + pandas/tests/arrays/test_datetimes.py | 2 +- pandas/tests/extension/base/dim2.py | 2 +- pandas/tests/extension/test_arrow.py | 3 +++ pandas/tests/frame/methods/test_quantile.py | 3 --- pandas/tests/frame/methods/test_shift.py | 2 -- pandas/tests/generic/test_finalize.py | 3 +++ pandas/tests/groupby/aggregate/test_other.py | 1 + .../tests/indexes/datetimes/methods/test_snap.py | 1 - pandas/tests/indexes/test_any_index.py | 4 ---- pandas/tests/indexes/test_base.py | 2 -- .../tests/series/accessors/test_dt_accessor.py | 1 - pandas/tests/test_downstream.py | 1 - 26 files changed, 34 insertions(+), 75 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index f93ef1cef841f..97ec80201dd16 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -51,7 +51,7 @@ class TimeIntsToPydatetime: _tzs, ) param_names = ["box", "size", "tz"] - # TODO: fold? freq? + # TODO: fold? 
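Returning to the constructor refactor in the previous patch: the new `_ensure_2d` helper centralizes the reshape-or-raise step that `ndarray_to_mgr` and `_prep_ndarraylike` now share. A standalone sketch of the same invariant, using only numpy:

```python
import numpy as np


def ensure_2d(values: np.ndarray) -> np.ndarray:
    # mirrors the helper added above: 1D input becomes a single
    # column, and anything other than 2D input is rejected
    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError(f"Must pass 2-d input. shape={values.shape}")
    return values


assert ensure_2d(np.arange(3)).shape == (3, 1)
assert ensure_2d(np.ones((2, 2))).shape == (2, 2)
```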
def setup(self, box, size, tz): if box == "date" and tz is not None: diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 35f1ace7ec351..0a4a550f5d8bc 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -234,21 +234,9 @@ def searchsorted( side: Literal["left", "right"] = "left", sorter: NumpySorter = None, ) -> npt.NDArray[np.intp] | np.intp: - # TODO(2.0): use _validate_setitem_value once dt64tz mismatched-timezone - # deprecation is enforced - npvalue = self._validate_searchsorted_value(value) + npvalue = self._validate_setitem_value(value) return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter) - def _validate_searchsorted_value( - self, value: NumpyValueArrayLike | ExtensionArray - ) -> NumpyValueArrayLike: - # TODO(2.0): after deprecation in datetimelikearraymixin is enforced, - # we can remove this and use _validate_setitem_value directly - if isinstance(value, ExtensionArray): - return value.to_numpy() - else: - return value - @doc(ExtensionArray.shift) def shift(self, periods: int = 1, fill_value=None, axis: AxisInt = 0): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 59d22c1e5275b..a9af210e08741 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1304,8 +1304,6 @@ def _validate_setitem_value(self, value): else: return self._validate_scalar(value) - _validate_searchsorted_value = _validate_setitem_value - def _validate_scalar(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f98fbfe429871..b1d9fba22b484 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -600,7 +600,6 @@ def _validate_scalar( value, *, allow_listlike: bool = False, - setitem: bool = True, unbox: bool = True, ): """ @@ -612,8 +611,6 @@ def _validate_scalar( allow_listlike: bool, default False When raising an exception, whether the message should say listlike inputs are allowed. - setitem : bool, default True - Whether to check compatibility with setitem strictness. unbox : bool, default True Whether to unbox the result before returning. Note: unbox=False skips the setitem compatibility check. 
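The `searchsorted` consolidation above removes `_validate_searchsorted_value` in favor of `_validate_setitem_value`, so a search key passes through the same coercion as a value being assigned. Public behavior should be unchanged; a sketch with a plain `DatetimeIndex`:

```python
import pandas as pd

dti = pd.date_range("2016-01-01", periods=3)

# the string scalar is validated and coerced the same way setitem
# would coerce it, then searched as a datetime64 value
pos = dti.searchsorted("2016-01-02")
assert pos == 1
```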
@@ -735,14 +732,6 @@ def _validate_listlike(self, value, allow_object: bool = False): return value - def _validate_searchsorted_value(self, value): - if not is_list_like(value): - return self._validate_scalar(value, allow_listlike=True, setitem=False) - else: - value = self._validate_listlike(value) - - return self._unbox(value) - def _validate_setitem_value(self, value): if is_list_like(value): value = self._validate_listlike(value) @@ -1363,10 +1352,7 @@ def _addsub_object_array(self, other: np.ndarray, op): # Caller is responsible for broadcasting if necessary assert self.shape == other.shape, (self.shape, other.shape) - with warnings.catch_warnings(): - # filter out warnings about Timestamp.freq - warnings.filterwarnings("ignore", category=FutureWarning) - res_values = op(self.astype("O"), np.asarray(other)) + res_values = op(self.astype("O"), np.asarray(other)) result = pd_array(res_values.ravel()) result = extract_array(result, extract_numpy=True).reshape(self.shape) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 71002377293b7..6d31d0086d84b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -751,7 +751,6 @@ def _add_offset(self, offset) -> DatetimeArray: else: result = DatetimeArray._simple_new(result, dtype=result.dtype) if self.tz is not None: - # FIXME: tz_localize with non-nano result = result.tz_localize(self.tz) return result diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index f7808a729fa0a..548aacda2f8b9 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -692,7 +692,7 @@ def searchsorted( side: Literal["left", "right"] = "left", sorter: NumpySorter = None, ) -> npt.NDArray[np.intp] | np.intp: - npvalue = self._validate_searchsorted_value(value).view("M8[ns]") + npvalue = self._validate_setitem_value(value).view("M8[ns]") # Cast to M8 to get datetime-like NaT placement m8arr = self._ndarray.view("M8[ns]") diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d7392911c242b..71b72073c26ca 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1029,15 +1029,12 @@ def soft_convert_objects( if datetime or timedelta: # GH 20380, when datetime is beyond year 2262, hence outside # bound of nanosecond-resolution 64-bit integers. - try: - converted = lib.maybe_convert_objects( - values, - convert_datetime=datetime, - convert_timedelta=timedelta, - convert_period=period, - ) - except (OutOfBoundsDatetime, ValueError): - return values + converted = lib.maybe_convert_objects( + values, + convert_datetime=datetime, + convert_timedelta=timedelta, + convert_period=period, + ) if converted is not values: return converted diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 950ba800cb589..dff5cebb8eacb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3504,13 +3504,7 @@ def _assert_can_do_setop(self, other) -> bool: def _convert_can_do_setop(self, other) -> tuple[Index, Hashable]: if not isinstance(other, Index): - # TODO(2.0): no need to special-case here once _with_infer - # deprecation is enforced - if hasattr(other, "dtype"): - other = Index(other, name=self.name, dtype=other.dtype) - else: - # e.g. 
list - other = Index(other, name=self.name) + other = Index(other, name=self.name) result_name = self.name else: result_name = get_op_result_name(self, other) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 66071ea4edf52..cab8901ff3596 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1920,15 +1920,11 @@ def _catch_deprecated_value_error(err: Exception) -> None: which will no longer be raised in version.2.0. """ if isinstance(err, ValueError): - # TODO(2.0): once DTA._validate_setitem_value deprecation - # is enforced, stop catching ValueError here altogether if isinstance(err, IncompatibleFrequency): pass elif "'value.closed' is" in str(err): # IntervalDtype mismatched 'closed' pass - elif "Timezones don't match" not in str(err): - raise err class DatetimeLikeBlock(NDArrayBackedExtensionBlock): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 64b5bafa97849..dea5dbd33bbdf 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1180,11 +1180,7 @@ def _plot_colorbar(self, ax: Axes, **kwds): # use the last one which contains the latest information # about the ax img = ax.collections[-1] - with warnings.catch_warnings(): - # https://github.com/matplotlib/matplotlib/issues/23614 - # False positive deprecation warning until matplotlib=3.6 - warnings.filterwarnings("ignore", "Auto-removal of grids") - return self.fig.colorbar(img, ax=ax, **kwds) + return self.fig.colorbar(img, ax=ax, **kwds) class ScatterPlot(PlanePlot): diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 4749cec018fe6..73a52534dd0d2 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -130,6 +130,10 @@ def func(x): frame_kernels_raise = [x for x in frame_transform_kernels if x not in wont_fail] +@pytest.mark.filterwarnings( + "ignore:Calling Series.rank with numeric_only:FutureWarning" +) +@pytest.mark.filterwarnings("ignore:Dropping of nuisance:FutureWarning") @pytest.mark.parametrize("op", [*frame_kernels_raise, lambda x: x + 1]) def test_transform_bad_dtype(op, frame_or_series, request): # GH 35964 @@ -162,6 +166,12 @@ def test_transform_bad_dtype(op, frame_or_series, request): obj.transform({"A": [op]}) +@pytest.mark.filterwarnings( + "ignore:Dropping of nuisance columns in Series.rank:FutureWarning" +) +@pytest.mark.filterwarnings( + "ignore:Calling Series.rank with numeric_only:FutureWarning" +) @pytest.mark.parametrize("op", frame_kernels_raise) def test_transform_failure_typeerror(request, op): # GH 35964 diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 9b51ea7fef5f8..e0d3510ac3865 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -275,6 +275,8 @@ def test_transform(string_series): tm.assert_series_equal(result.reindex_like(expected), expected) +@pytest.mark.filterwarnings("ignore:Calling Series.rank:FutureWarning") +@pytest.mark.filterwarnings("ignore:Dropping of nuisance:FutureWarning") @pytest.mark.parametrize("op", series_transform_kernels) def test_transform_partial_failure(op, request): # GH 35964 diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 0b1d56a956c07..bad5335ad2d58 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -649,7 +649,6 
@@ def test_comparison_tzawareness_compat_scalars(self, comparison_op, box_with_arr # Raising in __eq__ will fallback to NumPy, which warns, fails, # then re-raises the original exception. So we just need to ignore. @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") - @pytest.mark.filterwarnings("ignore:Converting timezone-aware:FutureWarning") def test_scalar_comparison_tzawareness( self, comparison_op, other, tz_aware_fixture, box_with_array ): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index bb7949c9f08e2..14d50acf3eadf 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1730,6 +1730,7 @@ def test_td64arr_floordiv_td64arr_with_nat( result = np.asarray(left) // right tm.assert_equal(result, expected) + @pytest.mark.filterwarnings("ignore:invalid value encountered:RuntimeWarning") def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td): # GH#18831, GH#19125 box = box_with_array diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 37a9c19627ada..564194ed4a9d3 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -81,7 +81,7 @@ def test_non_nano(self, unit, reso, dtype): def test_fields(self, unit, reso, field, dtype, dta_dti): dta, dti = dta_dti - # FIXME: assert (dti == dta).all() + assert (dti == dta).all() res = getattr(dta, field) expected = getattr(dti._data, field) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index 85e19f1860b21..210e566c7e463 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -200,7 +200,7 @@ def test_reductions_2d_axis0(self, data, method): kwargs["ddof"] = 0 try: - if method == "mean" and hasattr(data, "_mask"): + if method in ["mean", "var"] and hasattr(data, "_mask"): # Empty slices produced by the mask cause RuntimeWarnings by numpy with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): result = getattr(arr2d, method)(axis=0, **kwargs) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d44944c74f9d5..d094a7731c417 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -499,6 +499,9 @@ def test_in_numeric_groupby(self, data_for_grouping, request): ) super().test_in_numeric_groupby(data_for_grouping) + @pytest.mark.filterwarnings( + "ignore:The default value of numeric_only:FutureWarning" + ) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 3beb201bcfa05..14b416011b956 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -751,9 +751,6 @@ def test_quantile_empty_no_rows_ints(self, interp_method): exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) - @pytest.mark.filterwarnings( - "ignore:The behavior of DatetimeArray._from_sequence:FutureWarning" - ) def test_quantile_empty_no_rows_dt64(self, interp_method): interpolation, method = interp_method # datetimes diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 3b33d0cc80445..c44c0bd566585 100644 --- 
a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -564,8 +564,6 @@ def test_shift_dt64values_int_fill_deprecated(self): ], ids=lambda x: str(x.dtype), ) - # TODO(2.0): remove filtering - @pytest.mark.filterwarnings("ignore:Index.ravel.*:FutureWarning") def test_shift_dt64values_axis1_invalid_fill(self, vals, as_cat): # GH#44564 ser = Series(vals) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 689caffe98a2d..6bd9b8af766c3 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -482,6 +482,9 @@ def test_finalize_called_eval_numexpr(): # Binary operations +@pytest.mark.filterwarnings( + "ignore:Automatic reindexing on DataFrame vs Series:FutureWarning" +) @pytest.mark.parametrize("annotate", ["left", "right", "both"]) @pytest.mark.parametrize( "args", diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 6740729d038a7..9aa58e919ce24 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -25,6 +25,7 @@ from pandas.io.formats.printing import pprint_thing +@pytest.mark.filterwarnings("ignore:Dropping invalid columns:FutureWarning") def test_agg_partial_failure_raises(): # GH#43741 diff --git a/pandas/tests/indexes/datetimes/methods/test_snap.py b/pandas/tests/indexes/datetimes/methods/test_snap.py index a94d00d919082..755f6cff1278f 100644 --- a/pandas/tests/indexes/datetimes/methods/test_snap.py +++ b/pandas/tests/indexes/datetimes/methods/test_snap.py @@ -26,7 +26,6 @@ def astype_non_nano(dti_nano, unit): return dti -@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) @pytest.mark.parametrize("name", [None, "my_dti"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 6d4e7caacc5e4..3b3d6dbaf697f 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -133,8 +133,6 @@ def test_slice_keeps_name(self, index): assert index.name == index[1:].name @pytest.mark.parametrize("item", [101, "no_int", 2.5]) - # FutureWarning from non-tuple sequence of nd indexing - @pytest.mark.filterwarnings("ignore::FutureWarning") def test_getitem_error(self, index, item): msg = "|".join( [ @@ -145,8 +143,6 @@ def test_getitem_error(self, index, item): "are valid indices" ), "index out of bounds", # string[pyarrow] - "Only integers, slices and integer or " - "boolean arrays are valid indices.", # string[pyarrow] ] ) with pytest.raises(IndexError, match=msg): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index ef041d7f9e119..5b8650dade745 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -230,7 +230,6 @@ def test_constructor_simple_new(self, vals, dtype): result = index._simple_new(index.values, dtype) tm.assert_index_equal(result, index) - @pytest.mark.filterwarnings("ignore:Passing keywords other:FutureWarning") @pytest.mark.parametrize("attr", ["values", "asi8"]) @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, klass): @@ -1498,7 +1497,6 @@ def test_index_subclass_constructor_wrong_kwargs(index_maker): index_maker(foo="bar") -@pytest.mark.filterwarnings("ignore:Passing keywords 
other:FutureWarning") def test_deprecated_fastpath(): msg = "[Uu]nexpected keyword argument" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index ccd79d5cc58f4..1e929cd43842b 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -731,7 +731,6 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): [["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]], ], ) - @pytest.mark.filterwarnings("ignore:Inferring datetime64:FutureWarning") def test_isocalendar(self, input_series, expected_output): result = pd.to_datetime(Series(input_series)).dt.isocalendar() expected_frame = DataFrame( diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index cea9484fbbf80..a7f4269fa62b1 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -156,7 +156,6 @@ def test_oo_optimized_datetime_index_unpickle(): @pytest.mark.network @tm.network # Cython import warning -@pytest.mark.filterwarnings("ignore:pandas.util.testing is deprecated") @pytest.mark.filterwarnings("ignore:can't:ImportWarning") @pytest.mark.filterwarnings("ignore:.*64Index is deprecated:FutureWarning") @pytest.mark.filterwarnings( From 63a935020a193b285e4bb2469319d001ac156dbf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Nov 2022 09:37:45 -0800 Subject: [PATCH 10/39] API: Index(object_dtype_bool_ndarray) retain object dtype (#49594) * API: Index(object_dtype_bool_ndarray) retain object dtype * GH ref, test --- doc/source/whatsnew/v2.0.0.rst | 3 ++- pandas/core/indexes/base.py | 22 +++++----------------- pandas/tests/indexes/test_index_new.py | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3316989f474d6..c2167c1ba684e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -291,7 +291,8 @@ Other API changes - Passing a sequence containing ``datetime`` objects and ``date`` objects to :class:`Series` constructor will return with ``object`` dtype instead of ``datetime64[ns]`` dtype, consistent with :class:`Index` behavior (:issue:`49341`) - Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`) - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`) -- Changed behavior of :class:`Index` construct with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) +- Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) +- Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior 
(:issue:`49594`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dff5cebb8eacb..4fbf162e4ae12 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -81,6 +81,7 @@ find_common_type, infer_dtype_from, maybe_cast_pointwise_result, + maybe_infer_to_datetimelike, np_can_hold_element, ) from pandas.core.dtypes.common import ( @@ -503,9 +504,8 @@ def __new__( arr = com.asarray_tuplesafe(data, dtype=_dtype_obj) if dtype is None: - arr = _maybe_cast_data_without_dtype( - arr, cast_numeric_deprecated=True - ) + arr = maybe_infer_to_datetimelike(arr) + arr = ensure_wrapped_if_datetimelike(arr) dtype = arr.dtype klass = cls._dtype_to_subclass(arr.dtype) @@ -534,9 +534,7 @@ def __new__( subarr = com.asarray_tuplesafe(data, dtype=_dtype_obj) if dtype is None: # with e.g. a list [1, 2, 3] casting to numeric is _not_ deprecated - subarr = _maybe_cast_data_without_dtype( - subarr, cast_numeric_deprecated=False - ) + subarr = _maybe_cast_data_without_dtype(subarr) dtype = subarr.dtype return Index(subarr, dtype=dtype, copy=copy, name=name) @@ -7060,9 +7058,7 @@ def maybe_extract_name(name, obj, cls) -> Hashable: return name -def _maybe_cast_data_without_dtype( - subarr: np.ndarray, cast_numeric_deprecated: bool = True -) -> ArrayLike: +def _maybe_cast_data_without_dtype(subarr: npt.NDArray[np.object_]) -> ArrayLike: """ If we have an arraylike input but no passed dtype, try to infer a supported dtype. @@ -7070,8 +7066,6 @@ def _maybe_cast_data_without_dtype( Parameters ---------- subarr : np.ndarray[object] - cast_numeric_deprecated : bool, default True - Whether to issue a FutureWarning when inferring numeric dtypes. Returns ------- @@ -7086,12 +7080,6 @@ def _maybe_cast_data_without_dtype( convert_interval=True, dtype_if_all_nat=np.dtype("datetime64[ns]"), ) - if result.dtype.kind in ["i", "u", "f"]: - if not cast_numeric_deprecated: - # i.e. 
we started with a list, not an ndarray[object] - return result - return subarr - result = ensure_wrapped_if_datetimelike(result) return result diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index b718c33e666d7..4a1333e2b18b4 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -39,6 +39,24 @@ class TestIndexConstructorInference: + def test_object_all_bools(self): + # GH#49594 match Series behavior on ndarray[object] of all bools + arr = np.array([True, False], dtype=object) + res = Index(arr) + assert res.dtype == object + + # since the point is matching Series behavior, let's double check + assert Series(arr).dtype == object + + def test_object_all_complex(self): + # GH#49594 match Series behavior on ndarray[object] of all complex + arr = np.array([complex(1), complex(2)], dtype=object) + res = Index(arr) + assert res.dtype == object + + # since the point is matching Series behavior, let's double check + assert Series(arr).dtype == object + @pytest.mark.parametrize("val", [NaT, None, np.nan, float("nan")]) def test_infer_nat(self, val): # GH#49340 all NaT/None/nan and at least 1 NaT -> datetime64[ns], From 9adaf8c74e2eb9e3445f46039a73bcf39e8f4b6f Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 9 Nov 2022 17:53:56 +0000 Subject: [PATCH 11/39] BUG: Series(index=[]) should have dtype=object (#49574) * BUG: Series(index=[]) should have dtype=object * parametrize tests * accept dtype * fix Co-authored-by: Terji Petersen --- pandas/core/series.py | 5 +++- pandas/tests/series/test_constructors.py | 31 +++++++++++++----------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index ee2f1f83b9748..eeaab764a8ad9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -511,7 +511,10 @@ def _init_dict( elif index is not None: # fastpath for Series(data=None). Just use broadcasting a scalar # instead of reindexing. - values = na_value_for_dtype(pandas_dtype(dtype), compat=False) + if len(index) or dtype is not None: + values = na_value_for_dtype(pandas_dtype(dtype), compat=False) + else: + values = [] keys = index else: keys, values = (), [] diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1209d92ffa14a..37348bb743537 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -83,24 +83,27 @@ def test_unparseable_strings_with_dt64_dtype(self): # test for None or an empty generator. # test_constructor_pass_none tests None but only with the index also # passed. 
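Patches 10 and 11 above both align constructor dtype inference with existing `Series` behavior. A combined usage sketch whose assertions mirror their new tests:

```python
import numpy as np
import pandas as pd

# patch 10: an object-dtype ndarray of all bools now keeps object
# dtype in Index, matching what Series already did
arr = np.array([True, False], dtype=object)
assert pd.Index(arr).dtype == object
assert pd.Series(arr).dtype == object

# patch 11: an empty Series is object dtype whether or not an empty
# index is passed; previously Series(index=[]) came back float64
assert pd.Series(index=[]).dtype == object
```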
- (lambda: Series(), True), - (lambda: Series(None), True), - (lambda: Series({}), True), - (lambda: Series(()), False), # creates a RangeIndex - (lambda: Series([]), False), # creates a RangeIndex - (lambda: Series(_ for _ in []), False), # creates a RangeIndex - (lambda: Series(data=None), True), - (lambda: Series(data={}), True), - (lambda: Series(data=()), False), # creates a RangeIndex - (lambda: Series(data=[]), False), # creates a RangeIndex - (lambda: Series(data=(_ for _ in [])), False), # creates a RangeIndex + (lambda idx: Series(index=idx), True), + (lambda idx: Series(None, index=idx), True), + (lambda idx: Series({}, index=idx), True), + (lambda idx: Series((), index=idx), False), # creates a RangeIndex + (lambda idx: Series([], index=idx), False), # creates a RangeIndex + (lambda idx: Series((_ for _ in []), index=idx), False), # RangeIndex + (lambda idx: Series(data=None, index=idx), True), + (lambda idx: Series(data={}, index=idx), True), + (lambda idx: Series(data=(), index=idx), False), # creates a RangeIndex + (lambda idx: Series(data=[], index=idx), False), # creates a RangeIndex + (lambda idx: Series(data=(_ for _ in []), index=idx), False), # RangeIndex ], ) - def test_empty_constructor(self, constructor, check_index_type): + @pytest.mark.parametrize("empty_index", [None, []]) + def test_empty_constructor(self, constructor, check_index_type, empty_index): # TODO: share with frame test of the same name - expected = Series() - result = constructor() + # GH 49573 (addition of empty_index parameter) + expected = Series(index=empty_index) + result = constructor(empty_index) + assert result.dtype == object assert len(result.index) == 0 tm.assert_series_equal(result, expected, check_index_type=check_index_type) From 8e0aa38c1f5aa6910ade3e56f473ba60443705e8 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 9 Nov 2022 17:58:56 +0000 Subject: [PATCH 12/39] CLN: collect fastpath in Series.__init__ (#49575) * CLN: collect fastpath in Series.__init__ * small clean Co-authored-by: Terji Petersen --- pandas/core/series.py | 171 ++++++++++++++++++++---------------------- 1 file changed, 83 insertions(+), 88 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index eeaab764a8ad9..7a768841afb25 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -368,8 +368,7 @@ def __init__( # we are called internally, so short-circuit if fastpath: - - # data is an ndarray, index is defined + # data is a ndarray, index is defined if not isinstance(data, (SingleBlockManager, SingleArrayManager)): manager = get_option("mode.data_manager") if manager == "block": @@ -378,103 +377,99 @@ def __init__( data = SingleArrayManager.from_array(data, index) if copy: data = data.copy() - if index is None: - index = data.index - - else: + # skips validation of the name + object.__setattr__(self, "_name", name) + NDFrame.__init__(self, data) + return - name = ibase.maybe_extract_name(name, data, type(self)) + name = ibase.maybe_extract_name(name, data, type(self)) - if index is not None: - index = ensure_index(index) + if index is not None: + index = ensure_index(index) - if data is None: - data = {} - if dtype is not None: - dtype = self._validate_dtype(dtype) + if data is None: + data = {} + if dtype is not None: + dtype = self._validate_dtype(dtype) - if isinstance(data, MultiIndex): - raise NotImplementedError( - "initializing a Series from a MultiIndex is not supported" - ) - if isinstance(data, Index): - - if dtype is not None: - # astype copies - data = 
data.astype(dtype) - else: - # GH#24096 we need to ensure the index remains immutable - data = data._values.copy() - copy = False + if isinstance(data, MultiIndex): + raise NotImplementedError( + "initializing a Series from a MultiIndex is not supported" + ) + if isinstance(data, Index): - elif isinstance(data, np.ndarray): - if len(data.dtype): - # GH#13296 we are dealing with a compound dtype, which - # should be treated as 2D - raise ValueError( - "Cannot construct a Series from an ndarray with " - "compound dtype. Use DataFrame instead." - ) - elif isinstance(data, Series): - if index is None: - index = data.index - else: - data = data.reindex(index, copy=copy) - copy = False - data = data._mgr - elif is_dict_like(data): - data, index = self._init_dict(data, index, dtype) - dtype = None - copy = False - elif isinstance(data, (SingleBlockManager, SingleArrayManager)): - if index is None: - index = data.index - elif not data.index.equals(index) or copy: - # GH#19275 SingleBlockManager input should only be called - # internally - raise AssertionError( - "Cannot pass both SingleBlockManager " - "`data` argument and a different " - "`index` argument. `copy` must be False." - ) - - elif isinstance(data, ExtensionArray): - pass + if dtype is not None: + # astype copies + data = data.astype(dtype) else: - data = com.maybe_iterable_to_list(data) - if is_list_like(data) and not len(data) and dtype is None: - # GH 29405: Pre-2.0, this defaulted to float. - dtype = np.dtype(object) - + # GH#24096 we need to ensure the index remains immutable + data = data._values.copy() + copy = False + + elif isinstance(data, np.ndarray): + if len(data.dtype): + # GH#13296 we are dealing with a compound dtype, which + # should be treated as 2D + raise ValueError( + "Cannot construct a Series from an ndarray with " + "compound dtype. Use DataFrame instead." + ) + elif isinstance(data, Series): if index is None: - if not is_list_like(data): - data = [data] - index = default_index(len(data)) - elif is_list_like(data): - com.require_length_match(data, index) - - # create/copy the manager - if isinstance(data, (SingleBlockManager, SingleArrayManager)): - if dtype is not None: - data = data.astype(dtype=dtype, errors="ignore", copy=copy) - elif copy: - data = data.copy() + index = data.index else: - data = sanitize_array(data, index, dtype, copy) + data = data.reindex(index, copy=copy) + copy = False + data = data._mgr + elif is_dict_like(data): + data, index = self._init_dict(data, index, dtype) + dtype = None + copy = False + elif isinstance(data, (SingleBlockManager, SingleArrayManager)): + if index is None: + index = data.index + elif not data.index.equals(index) or copy: + # GH#19275 SingleBlockManager input should only be called + # internally + raise AssertionError( + "Cannot pass both SingleBlockManager " + "`data` argument and a different " + "`index` argument. `copy` must be False." + ) - manager = get_option("mode.data_manager") - if manager == "block": - data = SingleBlockManager.from_array(data, index) - elif manager == "array": - data = SingleArrayManager.from_array(data, index) + elif isinstance(data, ExtensionArray): + pass + else: + data = com.maybe_iterable_to_list(data) + if is_list_like(data) and not len(data) and dtype is None: + # GH 29405: Pre-2.0, this defaulted to float. 
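A quick demonstration of the structured-ndarray guard visible in this refactored block; the patch moves the branch without changing its behavior:

```python
import numpy as np
import pandas as pd

rec = np.zeros(3, dtype=[("x", "i8"), ("y", "f8")])  # compound (structured) dtype

try:
    pd.Series(rec)
except ValueError as err:
    print(err)  # "Cannot construct a Series from an ndarray with compound dtype. ..."
```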
+ dtype = np.dtype(object) + + if index is None: + if not is_list_like(data): + data = [data] + index = default_index(len(data)) + elif is_list_like(data): + com.require_length_match(data, index) + + # create/copy the manager + if isinstance(data, (SingleBlockManager, SingleArrayManager)): + if dtype is not None: + data = data.astype(dtype=dtype, errors="ignore", copy=copy) + elif copy: + data = data.copy() + else: + data = sanitize_array(data, index, dtype, copy) + + manager = get_option("mode.data_manager") + if manager == "block": + data = SingleBlockManager.from_array(data, index) + elif manager == "array": + data = SingleArrayManager.from_array(data, index) NDFrame.__init__(self, data) - if fastpath: - # skips validation of the name - object.__setattr__(self, "_name", name) - else: - self.name = name - self._set_axis(0, index) + self.name = name + self._set_axis(0, index) def _init_dict( self, data, index: Index | None = None, dtype: DtypeObj | None = None From 3d92ac337223560ff61b02dfdf5956552a02a5a0 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 9 Nov 2022 13:24:34 -0500 Subject: [PATCH 13/39] BUG/PERF: MultiIndex.value_counts returning flat index (#49558) --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/algorithms.py | 8 ++++++++ pandas/tests/base/test_value_counts.py | 2 -- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c2167c1ba684e..07a4e4af3dbe7 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -617,6 +617,7 @@ MultiIndex - Bug in :meth:`MultiIndex.union` not sorting when sort=None and index contains missing values (:issue:`49010`) - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`) - Bug in :meth:`MultiIndex.symmetric_difference` losing extension array (:issue:`48607`) +- Bug in :meth:`MultiIndex.value_counts` returning a :class:`Series` indexed by flat index of tuples instead of a :class:`MultiIndex` (:issue:`49558`) - I/O diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1e112b61d0e62..cc5ff2e756cfa 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -871,6 +871,14 @@ def value_counts( result.name = name counts = result._values + elif isinstance(values, ABCMultiIndex): + # GH49558 + levels = list(range(values.nlevels)) + result = Series(index=values).groupby(level=levels, dropna=dropna).size() + # TODO: allow index names to remain (see discussion in GH49497) + result.index.names = [None] * values.nlevels + counts = result._values + else: values = _ensure_arraylike(values) keys, counts = value_counts_arraylike(values, dropna) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index f6ad3f24434d3..dafbd9fee1b8e 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -29,8 +29,6 @@ def test_value_counts(index_or_series_obj): counter = collections.Counter(obj) expected = Series(dict(counter.most_common()), dtype=np.int64, name=obj.name) expected.index = expected.index.astype(obj.dtype) - if isinstance(obj, pd.MultiIndex): - expected.index = Index(expected.index) if not isinstance(result.dtype, np.dtype): # i.e IntegerDtype From a366e83b881d626144448e34ee2c698861baf18c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 9 Nov 2022 12:19:26 -0800 Subject: [PATCH 14/39] CI: Change flaky to_excel test to compare DataFrames (#49509) --- 
pandas/tests/io/test_common.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 145682b484100..5b3b5602c95bc 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -359,14 +359,18 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): writer = getattr(df, writer_name) writer(string, **writer_kwargs) - with open(string, "rb") as f: - expected = f.read() - writer(mypath, **writer_kwargs) - with open(fspath, "rb") as f: - result = f.read() - - assert result == expected + with open(string, "rb") as f_str, open(fspath, "rb") as f_path: + if writer == "to_excel": + # binary representation of excel contains time creation + # data that causes flaky CI failures + result = pd.read_excel(f_str, **writer_kwargs) + expected = pd.read_excel(f_path, **writer_kwargs) + tm.assert_frame_equal(result, expected) + else: + result = f_str.read() + expected = f_path.read() + assert result == expected @pytest.mark.filterwarnings( # pytables np.object usage "ignore:`np.object` is a deprecated alias:DeprecationWarning" From 5a0fa1480db7e8036681a863681d7b5f2acc0dd7 Mon Sep 17 00:00:00 2001 From: uzzell <72205435+uzzell@users.noreply.github.com> Date: Wed, 9 Nov 2022 15:53:02 -0500 Subject: [PATCH 15/39] STYLE Enable Pylint statement import-self (#49601) Enable Pylint statement import-self --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9ba6d4d4068c6..71b1f44dbff6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -140,7 +140,6 @@ disable = [ "fixme", "global-statement", "global-variable-not-assigned", - "import-self", "invalid-envvar-default", "invalid-overridden-method", "keyword-arg-before-vararg", From f9dc98b167458dc84a0baeb11e32245d819358b4 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 9 Nov 2022 16:01:54 -0500 Subject: [PATCH 16/39] TST: MultiIndex.get_indexer with na/missing (#48877) --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/tests/indexes/multi/test_indexing.py | 14 +++++++++++++ pandas/tests/indexes/multi/test_isin.py | 9 +++++++++ pandas/tests/indexes/multi/test_join.py | 19 ++++++++++++++++++ pandas/tests/indexes/multi/test_reindex.py | 12 +++++++++++ pandas/tests/indexes/multi/test_setops.py | 20 ++++++++++++++++++- .../indexing/multiindex/test_multiindex.py | 12 +++++++++++ 7 files changed, 86 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 07a4e4af3dbe7..05c76a2836cd2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -607,7 +607,7 @@ Missing MultiIndex ^^^^^^^^^^ -- Bug in :meth:`MultiIndex.get_indexer` not matching ``NaN`` values (:issue:`37222`) +- Bug in :meth:`MultiIndex.get_indexer` not matching ``NaN`` values (:issue:`29252`, :issue:`37222`, :issue:`38623`, :issue:`42883`, :issue:`43222`, :issue:`46173`, :issue:`48905`) - Bug in :meth:`MultiIndex.argsort` raising ``TypeError`` when index contains :attr:`NA` (:issue:`48495`) - Bug in :meth:`MultiIndex.difference` losing extension array dtype (:issue:`48606`) - Bug in :class:`MultiIndex.set_levels` raising ``IndexError`` when setting empty level (:issue:`48636`) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 337f91e0f89b4..324c347f2f088 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -933,3 +933,17 @@ def 
test_get_locs_reordering(keys, expected): result = idx.get_locs(keys) expected = np.array(expected, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + + +def test_get_indexer_for_multiindex_with_nans(nulls_fixture): + # GH37222 + idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]) + idx2 = MultiIndex.from_product([["A"], [nulls_fixture, 2.0]], names=["id1", "id2"]) + + result = idx2.get_indexer(idx1) + expected = np.array([-1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + result = idx1.get_indexer(idx2) + expected = np.array([-1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py index 695458273d16e..bf003019d5387 100644 --- a/pandas/tests/indexes/multi/test_isin.py +++ b/pandas/tests/indexes/multi/test_isin.py @@ -13,6 +13,15 @@ def test_isin_nan(): ) +def test_isin_missing(nulls_fixture): + # GH48905 + mi1 = MultiIndex.from_tuples([(1, nulls_fixture)]) + mi2 = MultiIndex.from_tuples([(1, 1), (1, 2)]) + result = mi2.isin(mi1) + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + + def test_isin(): values = [("foo", 2), ("bar", 3), ("quux", 4)] diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index e6bec97aedb38..23d5325dde2bb 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + DataFrame, Index, Interval, MultiIndex, @@ -158,3 +159,21 @@ def test_join_overlapping_interval_level(): result = idx_1.join(idx_2, how="outer") tm.assert_index_equal(result, expected) + + +def test_join_multi_with_nan(): + # GH29252 + df1 = DataFrame( + data={"col1": [1.1, 1.2]}, + index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]), + ) + df2 = DataFrame( + data={"col2": [2.1, 2.2]}, + index=MultiIndex.from_product([["A"], [np.NaN, 2.0]], names=["id1", "id2"]), + ) + result = df1.join(df2) + expected = DataFrame( + data={"col1": [1.1, 1.2], "col2": [np.nan, 2.2]}, + index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 5d124c19cd865..7ef739056ada8 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -159,3 +159,15 @@ def test_reindex_limit_arg_with_multiindex(): match="limit argument only valid if doing pad, backfill or nearest reindexing", ): df.reindex(new_idx, fill_value=0, limit=1) + + +def test_reindex_with_none_in_nested_multiindex(): + # GH42883 + index = MultiIndex.from_tuples([(("a", None), 1), (("b", None), 2)]) + index2 = MultiIndex.from_tuples([(("b", None), 2), (("a", None), 1)]) + df1_dtype = pd.DataFrame([1, 2], index=index) + df2_dtype = pd.DataFrame([2, 1], index=index2) + + result = df1_dtype.reindex_like(df2_dtype) + expected = df2_dtype + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index e05fd68cab59a..d0345861d6778 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -4,6 +4,7 @@ import pandas as pd from pandas import ( CategoricalIndex, + DataFrame, Index, IntervalIndex, MultiIndex, @@ -548,6 +549,15 @@ def 
test_intersection_with_missing_values_on_both_sides(nulls_fixture): tm.assert_index_equal(result, expected) +def test_union_with_missing_values_on_both_sides(nulls_fixture): + # GH#38623 + mi1 = MultiIndex.from_arrays([[1, nulls_fixture]]) + mi2 = MultiIndex.from_arrays([[1, nulls_fixture, 3]]) + result = mi1.union(mi2) + expected = MultiIndex.from_arrays([[1, 3, nulls_fixture]]) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["float64", "Float64"]) @pytest.mark.parametrize("sort", [None, False]) def test_union_nan_got_duplicated(dtype, sort): @@ -651,7 +661,6 @@ def test_union_keep_dtype_precision(any_real_numeric_dtype): def test_union_keep_ea_dtype_with_na(any_numeric_ea_dtype): # GH#48498 - arr1 = Series([4, pd.NA], dtype=any_numeric_ea_dtype) arr2 = Series([1, pd.NA], dtype=any_numeric_ea_dtype) midx = MultiIndex.from_arrays([arr1, [2, 1]], names=["a", None]) @@ -695,3 +704,12 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): result = midx.intersection(midx2) expected = MultiIndex.from_arrays([Series([2], dtype=any_numeric_ea_dtype), [1]]) tm.assert_index_equal(result, expected) + + +def test_union_with_na_when_constructing_dataframe(): + # GH43222 + series1 = Series((1,), index=MultiIndex.from_tuples(((None, None),))) + series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) + result = DataFrame([series1, series2]) + expected = DataFrame({(np.nan, np.nan): [1.0, 10.0], ("a", "b"): [np.nan, 20.0]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 157f0de632e18..f5f58e7e818d9 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -216,3 +216,15 @@ def test_multiindex_repeated_keys(self): ], Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])), ) + + def test_multiindex_with_na_missing_key(self): + # GH46173 + df = DataFrame.from_dict( + { + ("foo",): [1, 2, 3], + ("bar",): [5, 6, 7], + (None,): [8, 9, 0], + } + ) + with pytest.raises(KeyError, match="missing_key"): + df[[("missing_key",)]] From f7f0617040978ac474456cf8206359eac807715a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Nov 2022 14:37:21 -0800 Subject: [PATCH 17/39] DEPR: Enforce DataFrame(list_with_categorical) deprecation (#49592) * DEPR: Enforce DataFrame(list_with_categorical) deprecation * Update doc/source/whatsnew/v2.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/internals/construction.py | 30 +--------------- pandas/tests/frame/test_constructors.py | 47 +++++++++---------------- 3 files changed, 19 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 05c76a2836cd2..81858db84be73 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -482,6 +482,7 @@ Removal of prior version deprecations/changes - Changed behavior of :class:`DataFrame` constructor given floating-point ``data`` and an integer ``dtype``, when the data cannot be cast losslessly, the floating point dtype is retained, matching :class:`Series` behavior (:issue:`41170`) - Changed behavior of :class:`Index` constructor when given a ``np.ndarray`` with object-dtype containing numeric entries; this now retains 
object dtype rather than inferring a numeric dtype, consistent with :class:`Series` behavior (:issue:`42870`) - Changed behavior of :meth:`Index.__and__`, :meth:`Index.__or__` and :meth:`Index.__xor__` to behave as logical operations (matching :class:`Series` behavior) instead of aliases for set operations (:issue:`37374`) +- Changed behavior of :class:`DataFrame` constructor when passed a list whose first element is a :class:`Categorical`, this now treats the elements as rows casting to ``object`` dtype, consistent with behavior for other types (:issue:`38845`) - Changed behavior of :class:`DataFrame` constructor when passed a ``dtype`` (other than int) that the data cannot be cast to; it now raises instead of silently ignoring the dtype (:issue:`41733`) - Changed the behavior of :class:`Series` constructor, it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`) - Changed behavior of :class:`Timestamp` constructor with a ``np.datetime64`` object and a ``tz`` passed to interpret the input as a wall-time as opposed to a UTC time (:issue:`42288`) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 761a641ccb2f7..8e186b1f4a034 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -10,7 +10,6 @@ Hashable, Sequence, ) -import warnings import numpy as np from numpy import ma @@ -22,7 +21,6 @@ Manager, npt, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -51,10 +49,7 @@ algorithms, common as com, ) -from pandas.core.arrays import ( - Categorical, - ExtensionArray, -) +from pandas.core.arrays import ExtensionArray from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, @@ -476,9 +471,6 @@ def nested_data_to_arrays( if index is None: if isinstance(data[0], ABCSeries): index = _get_names_from_index(data) - elif isinstance(data[0], Categorical): - # GH#38845 hit in test_constructor_categorical - index = default_index(len(data[0])) else: index = default_index(len(data)) @@ -795,26 +787,6 @@ def to_arrays( return arrays, columns return [], ensure_index([]) - elif isinstance(data[0], Categorical): - # GH#38845 deprecate special case - warnings.warn( - "The behavior of DataFrame([categorical, ...]) is deprecated and " - "in a future version will be changed to match the behavior of " - "DataFrame([any_listlike, ...]). " - "To retain the old behavior, pass as a dictionary " - "DataFrame({col: categorical, ..})", - FutureWarning, - stacklevel=find_stack_level(), - ) - if columns is None: - columns = default_index(len(data)) - elif len(columns) > len(data): - raise ValueError("len(columns) > len(data)") - elif len(columns) < len(data): - # doing this here is akin to a pre-emptive reindex - data = data[: len(columns)] - return data, columns - elif isinstance(data, np.ndarray) and data.dtype.names is not None: # e.g. 
recarray columns = Index(list(data.dtype.names)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 17a76decce3c7..810b7f6eaf2a6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2220,47 +2220,34 @@ def test_constructor_categorical(self): tm.assert_series_equal(df[0], expected) def test_construct_from_1item_list_of_categorical(self): + # pre-2.0 this behaved as DataFrame({0: cat}), in 2.0 we remove + # Categorical special case # ndim != 1 - msg = "will be changed to match the behavior" - with tm.assert_produces_warning(FutureWarning, match=msg): - df = DataFrame([Categorical(list("abc"))]) - expected = DataFrame({0: Series(list("abc"), dtype="category")}) + cat = Categorical(list("abc")) + df = DataFrame([cat]) + expected = DataFrame([cat.astype(object)]) tm.assert_frame_equal(df, expected) def test_construct_from_list_of_categoricals(self): - msg = "will be changed to match the behavior" - with tm.assert_produces_warning(FutureWarning, match=msg): - df = DataFrame([Categorical(list("abc")), Categorical(list("abd"))]) - expected = DataFrame( - { - 0: Series(list("abc"), dtype="category"), - 1: Series(list("abd"), dtype="category"), - }, - columns=[0, 1], - ) + # pre-2.0 this behaved as DataFrame({0: cat}), in 2.0 we remove + # Categorical special case + + df = DataFrame([Categorical(list("abc")), Categorical(list("abd"))]) + expected = DataFrame([["a", "b", "c"], ["a", "b", "d"]]) tm.assert_frame_equal(df, expected) def test_from_nested_listlike_mixed_types(self): + # pre-2.0 this behaved as DataFrame({0: cat}), in 2.0 we remove + # Categorical special case # mixed - msg = "will be changed to match the behavior" - with tm.assert_produces_warning(FutureWarning, match=msg): - df = DataFrame([Categorical(list("abc")), list("def")]) - expected = DataFrame( - {0: Series(list("abc"), dtype="category"), 1: list("def")}, columns=[0, 1] - ) + df = DataFrame([Categorical(list("abc")), list("def")]) + expected = DataFrame([["a", "b", "c"], ["d", "e", "f"]]) tm.assert_frame_equal(df, expected) def test_construct_from_listlikes_mismatched_lengths(self): - # invalid (shape) - msg = "|".join( - [ - r"Length of values \(6\) does not match length of index \(3\)", - ] - ) - msg2 = "will be changed to match the behavior" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) + df = DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) + expected = DataFrame([list("abc"), list("abdefg")]) + tm.assert_frame_equal(df, expected) def test_constructor_categorical_series(self): From a93ec96822a4e4c757281ec33825f93127459c13 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 9 Nov 2022 18:01:59 -0500 Subject: [PATCH 18/39] DEPR: Enforce deprecation of numeric_only=None in DataFrame aggregations (#49551) * WIP * DEPR: Enforce deprecation of numeric_only=None in DataFrame aggregations * Partial reverts * numeric_only in generic/series, fixup * cleanup * Remove docs warning * fixups * Fixups --- doc/source/whatsnew/v1.2.0.rst | 19 +- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/core/frame.py | 47 +-- pandas/core/generic.py | 77 +++-- pandas/core/series.py | 2 +- pandas/tests/apply/test_frame_apply.py | 20 +- pandas/tests/frame/methods/test_quantile.py | 3 +- pandas/tests/frame/test_reductions.py | 301 +++++++++----------- 
pandas/tests/groupby/test_apply.py | 8 +- pandas/tests/groupby/test_categorical.py | 8 +- pandas/tests/groupby/test_function.py | 41 ++- 11 files changed, 221 insertions(+), 307 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index c5f2dbe71cb3c..fc8b59e11e001 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -383,12 +383,17 @@ this pathological behavior (:issue:`37827`): *New behavior*: -.. ipython:: python - :okwarning: +.. code-block:: ipython - df.mean() + In [3]: df.mean() + Out[3]: + A 1.0 + dtype: float64 - df[["A"]].mean() + In [4]: df[["A"]].mean() + Out[4]: + A 1.0 + dtype: float64 Moreover, DataFrame reductions with ``numeric_only=None`` will now be consistent with their Series counterparts. In particular, for @@ -415,10 +420,10 @@ instead of casting to a NumPy array which may have different semantics (:issue:` *New behavior*: -.. ipython:: python - :okwarning: +.. code-block:: ipython - df.any() + In [5]: df.any() + Out[5]: Series([], dtype: bool) .. _whatsnew_120.api_breaking.python: diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 81858db84be73..73a75667b46da 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -501,6 +501,7 @@ Removal of prior version deprecations/changes - Changed behavior of :meth:`Series.__setitem__` with an integer key and a :class:`Float64Index` when the key is not present in the index; previously we treated the key as positional (behaving like ``series.iloc[key] = val``), now we treat it is a label (behaving like ``series.loc[key] = val``), consistent with :meth:`Series.__getitem__`` behavior (:issue:`33469`) - Removed ``na_sentinel`` argument from :func:`factorize`, :meth:`.Index.factorize`, and :meth:`.ExtensionArray.factorize` (:issue:`47157`) - Changed behavior of :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` so that ``group_keys`` is respected even if a transformer is detected (:issue:`34998`) +- Enforced deprecation ``numeric_only=None`` (the default) in DataFrame reductions that would silently drop columns that raised; ``numeric_only`` now defaults to ``False`` (:issue:`41480`) - .. --------------------------------------------------------------------------- @@ -570,6 +571,7 @@ Timezones Numeric ^^^^^^^ - Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`) +- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`) - Conversion diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5cf061b98680c..1627a7add25ed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -266,9 +266,8 @@ you to specify a location to update with some value.""", } -_numeric_only_doc = """numeric_only : bool or None, default None - Include only float, int, boolean data. If None, will attempt to use - everything, then use only numeric data +_numeric_only_doc = """numeric_only : bool, default False + Include only float, int, boolean data. 
""" _merge_doc = """ @@ -10489,7 +10488,7 @@ def _reduce( *, axis: Axis = 0, skipna: bool = True, - numeric_only: bool | None = None, + numeric_only: bool = False, filter_type=None, **kwds, ): @@ -10498,7 +10497,6 @@ def _reduce( # TODO: Make other agg func handle axis=None properly GH#21597 axis = self._get_axis_number(axis) - labels = self._get_agg_axis(axis) assert axis in [0, 1] def func(values: np.ndarray): @@ -10524,25 +10522,22 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - numeric_only_bool = com.resolve_numeric_only(numeric_only) - if numeric_only is not None or axis == 0: + if numeric_only or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object # dtypes are unambiguous can be handled with BlockManager.reduce # Case with EAs see GH#35881 df = self - if numeric_only_bool: + if numeric_only: df = _get_data() if axis == 1: df = df.T axis = 0 - ignore_failures = numeric_only is None - # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce - res, _ = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) + res, _ = df._mgr.reduce(blk_func, ignore_failures=False) out = df._constructor(res).iloc[0] if out_dtype is not None: out = out.astype(out_dtype) @@ -10559,36 +10554,11 @@ def _get_data() -> DataFrame: return out - assert numeric_only is None + assert not numeric_only and axis == 1 data = self values = data.values - - try: - result = func(values) - - except TypeError: - # e.g. in nanops trying to convert strs to float - - data = _get_data() - labels = data._get_agg_axis(axis) - - values = data.values - with np.errstate(all="ignore"): - result = func(values) - - # columns have been dropped GH#41480 - arg_name = "numeric_only" - if name in ["all", "any"]: - arg_name = "bool_only" - warnings.warn( - "Dropping of nuisance columns in DataFrame reductions " - f"(with '{arg_name}=None') is deprecated; in a future " - "version this will raise TypeError. Select only valid " - "columns before calling the reduction.", - FutureWarning, - stacklevel=find_stack_level(), - ) + result = func(values) if hasattr(result, "dtype"): if filter_type == "bool" and notna(result).all(): @@ -10600,6 +10570,7 @@ def _get_data() -> DataFrame: # try to coerce to the original dtypes item by item if we can pass + labels = self._get_agg_axis(axis) result = self._constructor_sliced(result, index=labels) return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ee41d07c52774..d26a11eae9f7f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10799,7 +10799,7 @@ def _logical_func( name: str, func, axis: Axis = 0, - bool_only: bool_t | None = None, + bool_only: bool_t = False, skipna: bool_t = True, level: Level | None = None, **kwargs, @@ -10814,7 +10814,7 @@ def _logical_func( FutureWarning, stacklevel=find_stack_level(), ) - if bool_only is not None: + if bool_only: raise NotImplementedError( "Option bool_only is not implemented with option level." 
) @@ -10833,7 +10833,6 @@ def _logical_func( and len(self._mgr.arrays) > 1 # TODO(EA2D): special-case not needed and all(x.ndim == 2 for x in self._mgr.arrays) - and bool_only is not None and not kwargs ): # Fastpath avoiding potentially expensive transpose @@ -10854,7 +10853,7 @@ def _logical_func( def any( self, axis: Axis = 0, - bool_only: bool_t | None = None, + bool_only: bool_t = False, skipna: bool_t = True, level: Level | None = None, **kwargs, @@ -10866,7 +10865,7 @@ def any( def all( self, axis: Axis = 0, - bool_only: bool_t | None = None, + bool_only: bool_t = False, skipna: bool_t = True, level: Level | None = None, **kwargs, @@ -10933,7 +10932,7 @@ def _stat_function_ddof( skipna: bool_t = True, level: Level | None = None, ddof: int = 1, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ) -> Series | float: nv.validate_stat_ddof_func((), kwargs, fname=name) @@ -10961,7 +10960,7 @@ def sem( skipna: bool_t = True, level: Level | None = None, ddof: int = 1, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function_ddof( @@ -10974,7 +10973,7 @@ def var( skipna: bool_t = True, level: Level | None = None, ddof: int = 1, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function_ddof( @@ -10987,7 +10986,7 @@ def std( skipna: bool_t = True, level: Level | None = None, ddof: int = 1, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function_ddof( @@ -11002,7 +11001,7 @@ def _stat_function( axis: Axis | None | lib.NoDefault = None, skipna: bool_t = True, level: Level | None = None, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ): if name == "median": @@ -11047,7 +11046,7 @@ def min( axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ): return self._stat_function( @@ -11065,7 +11064,7 @@ def max( axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ): return self._stat_function( @@ -11083,7 +11082,7 @@ def mean( axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function( @@ -11095,7 +11094,7 @@ def median( axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function( @@ -11107,7 +11106,7 @@ def skew( axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function( @@ -11119,7 +11118,7 @@ def kurt( axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level: Level | None = None, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function( @@ -11136,7 +11135,7 @@ def _min_count_stat_function( axis: Axis | None = None, skipna: bool_t = True, level: Level | None = None, - 
numeric_only: bool_t | None = None, + numeric_only: bool_t = False, min_count: int = 0, **kwargs, ): @@ -11182,7 +11181,7 @@ def sum( axis: Axis | None = None, skipna: bool_t = True, level: Level | None = None, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, min_count: int = 0, **kwargs, ): @@ -11195,7 +11194,7 @@ def prod( axis: Axis | None = None, skipna: bool_t = True, level: Level | None = None, - numeric_only: bool_t | None = None, + numeric_only: bool_t = False, min_count: int = 0, **kwargs, ): @@ -11288,7 +11287,7 @@ def sem( skipna: bool_t = True, level=None, ddof: int = 1, - numeric_only=None, + numeric_only: bool_t = False, **kwargs, ): return NDFrame.sem(self, axis, skipna, level, ddof, numeric_only, **kwargs) @@ -11311,7 +11310,7 @@ def var( skipna: bool_t = True, level=None, ddof: int = 1, - numeric_only=None, + numeric_only: bool_t = False, **kwargs, ): return NDFrame.var(self, axis, skipna, level, ddof, numeric_only, **kwargs) @@ -11335,7 +11334,7 @@ def std( skipna: bool_t = True, level=None, ddof: int = 1, - numeric_only=None, + numeric_only: bool_t = False, **kwargs, ): return NDFrame.std(self, axis, skipna, level, ddof, numeric_only, **kwargs) @@ -11423,7 +11422,7 @@ def sum( axis: Axis | None = None, skipna: bool_t = True, level=None, - numeric_only=None, + numeric_only: bool_t = False, min_count: int = 0, **kwargs, ): @@ -11448,7 +11447,7 @@ def prod( axis: Axis | None = None, skipna: bool_t = True, level=None, - numeric_only=None, + numeric_only: bool_t = False, min_count: int = 0, **kwargs, ): @@ -11474,7 +11473,7 @@ def mean( axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level=None, - numeric_only=None, + numeric_only: bool_t = False, **kwargs, ): return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs) @@ -11496,7 +11495,7 @@ def skew( axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level=None, - numeric_only=None, + numeric_only: bool_t = False, **kwargs, ): return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs) @@ -11521,7 +11520,7 @@ def kurt( axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level=None, - numeric_only=None, + numeric_only: bool_t = False, **kwargs, ): return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs) @@ -11544,7 +11543,7 @@ def median( axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level=None, - numeric_only=None, + numeric_only: bool_t = False, **kwargs, ): return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs) @@ -11568,7 +11567,7 @@ def max( axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level=None, - numeric_only=None, + numeric_only: bool_t = False, **kwargs, ): return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs) @@ -11592,7 +11591,7 @@ def min( axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, level=None, - numeric_only=None, + numeric_only: bool_t = False, **kwargs, ): return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs) @@ -11827,13 +11826,8 @@ def _doc_params(cls): .. deprecated:: 1.3.0 The level keyword is deprecated. Use groupby instead. -numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. Not implemented for Series. - - .. deprecated:: 1.5.0 - Specifying ``numeric_only=None`` is deprecated. 
The default value will be - ``False`` in a future version of pandas. +numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. {min_count}\ **kwargs @@ -11865,13 +11859,8 @@ def _doc_params(cls): ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. -numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. Not implemented for Series. - - .. deprecated:: 1.5.0 - Specifying ``numeric_only=None`` is deprecated. The default value will be - ``False`` in a future version of pandas. +numeric_only : bool, default False + Include only float, int, boolean columns. Not implemented for Series. Returns ------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 7a768841afb25..2664988a7b8d4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4592,7 +4592,7 @@ def _reduce( *, axis: Axis = 0, skipna: bool = True, - numeric_only=None, + numeric_only: bool = False, filter_type=None, **kwds, ): diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 068ce32b5e7aa..28c776d0a6d35 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1283,8 +1283,11 @@ def test_nuiscance_columns(): ) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - result = df.agg("sum") + msg = "DataFrame constructor called with incompatible data and dtype" + with pytest.raises(TypeError, match=msg): + df.agg("sum") + + result = df[["A", "B", "C"]].agg("sum") expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) @@ -1428,13 +1431,14 @@ def test_apply_datetime_tz_issue(): @pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})]) @pytest.mark.parametrize("method", ["min", "max", "sum"]) -def test_consistency_of_aggregates_of_columns_with_missing_values(df, method): +def test_mixed_column_raises(df, method): # GH 16832 - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - none_in_first_column_result = getattr(df[["A", "B"]], method)() - none_in_second_column_result = getattr(df[["B", "A"]], method)() - - tm.assert_series_equal(none_in_first_column_result, none_in_second_column_result) + if method == "sum": + msg = r'can only concatenate str \(not "int"\) to str' + else: + msg = "not supported between instances of 'str' and 'float'" + with pytest.raises(TypeError, match=msg): + getattr(df, method)() @pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan]) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 14b416011b956..b4661a92c8275 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -139,8 +139,7 @@ def test_non_numeric_exclusion(self, interp_method, request, using_array_manager rs = df.quantile( 0.5, numeric_only=True, interpolation=interpolation, method=method ) - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - xp = df.median().rename(0.5) + xp = df.median(numeric_only=True).rename(0.5) if interpolation == "nearest": xp = (xp + 0.5).astype(np.int64) if method == "table" and using_array_manager: diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 
8d4d705296f35..0e5c6057b9a61 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -169,15 +169,23 @@ class TestDataFrameAnalytics: ], ) def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): - if opname in ["sum", "min", "max"] and axis == 0: - warn = None - elif opname not in ["count", "nunique"]: - warn = FutureWarning - else: - warn = None - msg = "nuisance columns|default value of numeric_only" - with tm.assert_produces_warning(warn, match=msg): + if (opname in ("sum", "min", "max") and axis == 0) or opname in ( + "count", + "nunique", + ): getattr(float_string_frame, opname)(axis=axis) + else: + msg = "|".join( + [ + "Could not convert", + "could not convert", + "can't multiply sequence by non-int", + "unsupported operand type", + "not supported between instances of", + ] + ) + with pytest.raises(TypeError, match=msg): + getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": getattr(float_string_frame, opname)(axis=axis, numeric_only=True) @@ -323,9 +331,7 @@ def test_stat_operators_attempt_obj_array(self, method, df): assert df.values.dtype == np.object_ result = getattr(df, method)(1) expected = getattr(df.astype("f8"), method)(1) - - if method in ["sum", "prod"]: - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): @@ -337,18 +343,26 @@ def test_mixed_ops(self, op): "str": ["a", "b", "c", "d"], } ) - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = getattr(df, op)() - assert len(result) == 2 + msg = "|".join( + [ + "Could not convert", + "could not convert", + "can't multiply sequence by non-int", + ] + ) + with pytest.raises(TypeError, match=msg): + getattr(df, op)() with pd.option_context("use_bottleneck", False): - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = getattr(df, op)() - assert len(result) == 2 + msg = "|".join( + [ + "Could not convert", + "could not convert", + "can't multiply sequence by non-int", + ] + ) + with pytest.raises(TypeError, match=msg): + getattr(df, op)() def test_reduce_mixed_frame(self): # GH 6806 @@ -416,10 +430,9 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = df.mean() + with pytest.raises(TypeError, match="unsupported operand type"): + df.mean() + result = df[["A", "C"]].mean() expected = Series([2.7, 681.6], index=["A", "C"]) tm.assert_series_equal(result, expected) @@ -648,9 +661,8 @@ def test_operators_timedelta64(self): ) tm.assert_series_equal(result, expected) - # excludes numeric - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - result = mixed.min(axis=1) + # excludes non-numeric + result = mixed.min(axis=1, numeric_only=True) expected = Series([1, 1, 1.0], index=[0, 1, 2]) tm.assert_series_equal(result, expected) @@ -819,25 +831,17 @@ def test_sum_mixed_datetime(self): df = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex( [2, 3, 4] ) - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - result = df.sum() - - expected = Series({"B": 7.0}) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="does not support reduction 'sum'"): + df.sum() def test_mean_corner(self, float_frame, 
float_string_frame): # unit test when have object data - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - the_mean = float_string_frame.mean(axis=0) - the_sum = float_string_frame.sum(axis=0, numeric_only=True) - tm.assert_index_equal(the_sum.index, the_mean.index) - assert len(the_mean.index) < len(float_string_frame.columns) + with pytest.raises(TypeError, match="Could not convert"): + float_string_frame.mean(axis=0) # xs sum mixed type, just want to know it works... - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - the_mean = float_string_frame.mean(axis=1) - the_sum = float_string_frame.sum(axis=1, numeric_only=True) - tm.assert_index_equal(the_sum.index, the_mean.index) + with pytest.raises(TypeError, match="unsupported operand type"): + float_string_frame.mean(axis=1) # take mean of boolean column float_frame["bool"] = float_frame["A"] > 0 @@ -861,10 +865,8 @@ def test_mean_datetimelike(self): expected = Series({"A": 1.0}) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - result = df.mean() - expected = Series({"A": 1.0, "B": df.loc[1, "B"], "C": df.loc[1, "C"]}) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="mean is not implemented for PeriodArray"): + df.mean() def test_mean_datetimelike_numeric_only_false(self): df = DataFrame( @@ -895,13 +897,13 @@ def test_mean_extensionarray_numeric_only_true(self): tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): - # don't blow up - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): + with pytest.raises(TypeError, match="could not convert"): float_string_frame.std(1) + with pytest.raises(TypeError, match="could not convert"): float_string_frame.var(1) + with pytest.raises(TypeError, match="unsupported operand type"): float_string_frame.mean(1) + with pytest.raises(TypeError, match="could not convert"): float_string_frame.skew(1) def test_sum_bools(self): @@ -1250,24 +1252,26 @@ def test_any_all_np_func(self, func, data, expected): # GH 19976 data = DataFrame(data) - warn = None if any(is_categorical_dtype(x) for x in data.dtypes): - warn = FutureWarning + with pytest.raises( + TypeError, match="dtype category does not support reduction" + ): + func(data) - with tm.assert_produces_warning( - warn, match="Select only valid columns", check_stacklevel=False - ): + # method version + with pytest.raises( + TypeError, match="dtype category does not support reduction" + ): + getattr(DataFrame(data), func.__name__)(axis=None) + else: result = func(data) - assert isinstance(result, np.bool_) - assert result.item() is expected + assert isinstance(result, np.bool_) + assert result.item() is expected - # method version - with tm.assert_produces_warning( - warn, match="Select only valid columns", check_stacklevel=False - ): + # method version result = getattr(DataFrame(data), func.__name__)(axis=None) - assert isinstance(result, np.bool_) - assert result.item() is expected + assert isinstance(result, np.bool_) + assert result.item() is expected def test_any_all_object(self): # GH 19976 @@ -1512,20 +1516,11 @@ def test_any_all_categorical_dtype_nuisance_column(self, method): with pytest.raises(TypeError, match="does not support reduction"): getattr(df, method)(bool_only=False) - # With bool_only=None, operating on this column raises and is ignored, - # so we expect an empty result. 
- with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = getattr(df, method)(bool_only=None) - expected = Series([], index=Index([]), dtype=bool) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="does not support reduction"): + getattr(df, method)(bool_only=None) - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns", check_stacklevel=False - ): - result = getattr(np, method)(df, axis=0) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="does not support reduction"): + getattr(np, method)(df, axis=0) def test_median_categorical_dtype_nuisance_column(self): # GH#21020 DataFrame.median should match Series.median @@ -1539,12 +1534,8 @@ def test_median_categorical_dtype_nuisance_column(self): with pytest.raises(TypeError, match="does not support reduction"): df.median(numeric_only=False) - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = df.median() - expected = Series([], index=Index([]), dtype=np.float64) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="does not support reduction"): + df.median() # same thing, but with an additional non-categorical column df["B"] = df["A"].astype(int) @@ -1552,12 +1543,8 @@ def test_median_categorical_dtype_nuisance_column(self): with pytest.raises(TypeError, match="does not support reduction"): df.median(numeric_only=False) - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = df.median() - expected = Series([2.0], index=["B"]) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="does not support reduction"): + df.median() # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead # of expected.values @@ -1579,58 +1566,19 @@ def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): with pytest.raises(TypeError, match="is not ordered for operation"): getattr(df, method)(numeric_only=False) - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = getattr(df, method)() - expected = Series([], index=Index([]), dtype=np.float64) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(df, method)() - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns", check_stacklevel=False - ): - result = getattr(np, method)(df) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(np, method)(df) # same thing, but with an additional non-categorical column df["B"] = df["A"].astype(object) - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = getattr(df, method)() - if method == "min": - expected = Series(["a"], index=["B"]) - else: - expected = Series(["c"], index=["B"]) - tm.assert_series_equal(result, expected) - - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns", check_stacklevel=False - ): - result = getattr(np, method)(df) - tm.assert_series_equal(result, expected) - - def test_reduction_object_block_splits_nuisance_columns(self): - # GH#37827 - df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) - - # We should only exclude "B", not "A" - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = 
df.mean() - expected = Series([1.0], index=["A"]) - tm.assert_series_equal(result, expected) - - # Same behavior but heterogeneous dtype - df["C"] = df["A"].astype(int) + 4 + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(df, method)() - with tm.assert_produces_warning( - FutureWarning, match="Select only valid columns" - ): - result = df.mean() - expected = Series([1.0, 5.0], index=["A", "C"]) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(np, method)(df) def test_sum_timedelta64_skipna_false(using_array_manager, request): @@ -1710,12 +1658,8 @@ def test_groupby_regular_arithmetic_equivalent(meth): def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) - with tm.assert_produces_warning( - FutureWarning, match="The default value of numeric_only" - ): - result = df.sum() - expected = Series([1, 1.1, "foo"], index=list("abc")) - tm.assert_series_equal(result, expected) + with pytest.raises(TypeError, match="does not support reduction"): + df.sum() def test_prod_sum_min_count_mixed_object(): @@ -1755,18 +1699,46 @@ def test_reduction_axis_none_deprecation(method): "corrwith", "count", "cov", + "mode", + "quantile", + ], +) +def test_numeric_only_deprecation(kernel): + # GH#46852 + df = DataFrame({"a": [1, 2, 3], "b": object}) + args = (df,) if kernel == "corrwith" else () + signature = inspect.signature(getattr(DataFrame, kernel)) + default = signature.parameters["numeric_only"].default + assert default is not True + + if default is None or default is lib.no_default: + expected = getattr(df[["a"]], kernel)(*args) + warn = FutureWarning + else: + # default must be False and works on any nuisance columns + expected = getattr(df, kernel)(*args) + if kernel == "mode": + assert "b" in expected.columns + else: + assert "b" in expected.index + warn = None + msg = f"The default value of numeric_only in DataFrame.{kernel}" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(df, kernel)(*args) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "kernel", + [ "idxmax", "idxmin", "kurt", - "kurt", "max", "mean", "median", "min", - "mode", - "prod", "prod", - "quantile", "sem", "skew", "std", @@ -1774,32 +1746,17 @@ def test_reduction_axis_none_deprecation(method): "var", ], ) -def test_numeric_only_deprecation(kernel): +def test_fails_on_non_numeric(kernel): # GH#46852 df = DataFrame({"a": [1, 2, 3], "b": object}) - args = (df,) if kernel == "corrwith" else () - signature = inspect.signature(getattr(DataFrame, kernel)) - default = signature.parameters["numeric_only"].default - assert default is not True - - if kernel in ("idxmax", "idxmin"): - # kernels that default to numeric_only=False and fail on nuisance columns - assert default is False - with pytest.raises(TypeError, match="not allowed for this dtype"): - getattr(df, kernel)(*args) - else: - if default is None or default is lib.no_default: - expected = getattr(df[["a"]], kernel)(*args) - warn = FutureWarning - else: - # default must be False and works on any nuisance columns - expected = getattr(df, kernel)(*args) - if kernel == "mode": - assert "b" in expected.columns - else: - assert "b" in expected.index - warn = None - msg = f"The default value of numeric_only in DataFrame.{kernel}" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(df, kernel)(*args) - tm.assert_equal(result, expected) + msg = 
"|".join( + [ + "not allowed for this dtype", + "argument must be a string or a number", + "not supported between instances of", + "unsupported operand type", + "argument must be a string or a real number", + ] + ) + with pytest.raises(TypeError, match=msg): + getattr(df, kernel)() diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 6f723e1be6fc6..fa51a291bf2f7 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -974,17 +974,15 @@ def test_apply_function_index_return(function): def test_apply_function_with_indexing_return_column(): - # GH: 7002 + # GH#7002, GH#41480 df = DataFrame( { "foo1": ["one", "two", "two", "three", "one", "two"], "foo2": [1, 2, 4, 4, 5, 6], } ) - with tm.assert_produces_warning(FutureWarning, match="Select only valid"): - result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean()) - expected = DataFrame({"foo1": ["one", "three", "two"], "foo2": [3.0, 4.0, 4.0]}) - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="Could not convert"): + df.groupby("foo1", as_index=False).apply(lambda x: x.mean()) @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f8c7cdf658ebf..8fe1dc010211a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -294,13 +294,7 @@ def test_apply(ordered): idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"]) - # GH#21636 tracking down the xfail, in some builds np.mean(df.loc[[0]]) - # is coming back as Series([0., 1., 0.], index=["missing", "dense", "values"]) - # when we expect Series(0., index=["values"]) - with tm.assert_produces_warning( - FutureWarning, match="Select only valid", check_stacklevel=False - ): - result = grouped.apply(lambda x: np.mean(x)) + result = grouped.apply(lambda x: np.mean(x)) tm.assert_frame_equal(result, expected) result = grouped.mean() diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 7a9d540ae08c4..5383a4d28c8ce 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -301,27 +301,19 @@ def gni(self, df): return gni # TODO: non-unique columns, as_index=False - def test_idxmax(self, gb): - # object dtype so idxmax goes through _aggregate_item_by_item - # GH#5610 - # non-cython calls should not include the grouper + def test_idxmax_nuisance_raises(self, gb): + # GH#5610, GH#41480 expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3]) expected.index.name = "A" - msg = "The default value of numeric_only in DataFrameGroupBy.idxmax" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.idxmax() - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="not allowed for this dtype"): + gb.idxmax() - def test_idxmin(self, gb): - # object dtype so idxmax goes through _aggregate_item_by_item - # GH#5610 - # non-cython calls should not include the grouper + def test_idxmin_nuisance_raises(self, gb): + # GH#5610, GH#41480 expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3]) expected.index.name = "A" - msg = "The default value of numeric_only in DataFrameGroupBy.idxmin" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = gb.idxmin() - tm.assert_frame_equal(result, expected) + with pytest.raises(TypeError, match="not allowed for 
this dtype"): + gb.idxmin() def test_describe(self, df, gb, gni): # describe @@ -1382,11 +1374,15 @@ def test_deprecate_numeric_only( gb = df.groupby(keys) method = getattr(gb, kernel) - if has_arg and ( - # Cases where b does not appear in the result - numeric_only is True - or (numeric_only is lib.no_default and numeric_only_default) - or drops_nuisance + if ( + has_arg + and (kernel not in ("idxmax", "idxmin") or numeric_only is True) + and ( + # Cases where b does not appear in the result + numeric_only is True + or (numeric_only is lib.no_default and numeric_only_default) + or drops_nuisance + ) ): if numeric_only is True or (not numeric_only_default and not drops_nuisance): warn = None @@ -1411,9 +1407,8 @@ def test_deprecate_numeric_only( ): result = method(*args, **kwargs) assert "b" in result.columns - elif has_arg: + elif has_arg or kernel in ("idxmax", "idxmin"): assert numeric_only is not True - assert numeric_only is not lib.no_default or numeric_only_default is False assert not drops_nuisance # kernels that are successful on any dtype were above; this will fail msg = ( From 85143627eb684fac3c5c775ed1f6db2747676eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Meirelles?= Date: Wed, 9 Nov 2022 21:59:26 -0300 Subject: [PATCH 19/39] add/timedeltas-seconds-documentation (#49584) * add/timedeltas-seconds-documentation * Fix line lenght * fixing whitespace * fixing whitespace * fixing line lenght * Adding suggestions * Fixing single line error --- pandas/_libs/tslibs/timedeltas.pyx | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 3f3e7c0cb441f..83b4f34bfb70b 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1035,6 +1035,34 @@ cdef class _Timedelta(timedelta): @property def seconds(self) -> int: # TODO(cython3): make cdef property + """ + Return the total hours, minutes, and seconds of the timedelta as seconds. + + Timedelta.seconds = hours * 3600 + minutes * 60 + seconds. + + Returns + ------- + int + Number of seconds. + + See Also + -------- + Timedelta.components : Return all attributes with assigned values + (i.e. days, hours, minutes, seconds, milliseconds, microseconds, + nanoseconds). + + Examples + -------- + **Using string input** + >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns') + >>> td.seconds + 120 + + **Using integer input** + >>> td = pd.Timedelta(42, unit='s') + >>> td.seconds + 42 + """ # NB: using the python C-API PyDateTime_DELTA_GET_SECONDS will fail # (or be incorrect) self._ensure_components() From 2088f0ef24cd5f179fa050321fe46f6928715b13 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 9 Nov 2022 17:32:20 -0800 Subject: [PATCH 20/39] REF: remove infer_datetimelike_array (#49608) --- pandas/_libs/lib.pyi | 3 - pandas/_libs/lib.pyx | 93 --------------------------- pandas/io/parsers/base_parser.py | 2 +- pandas/tests/dtypes/test_inference.py | 73 --------------------- 4 files changed, 1 insertion(+), 170 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index c51492c92f44c..188494c7c60db 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -169,9 +169,6 @@ def ensure_string_array( copy: bool = ..., skipna: bool = ..., ) -> npt.NDArray[np.object_]: ... -def infer_datetimelike_array( - arr: npt.NDArray[np.object_], -) -> str: ... def convert_nans_to_NA( arr: npt.NDArray[np.object_], ) -> npt.NDArray[np.object_]: ... 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7880709418adc..3769bbf087fee 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1565,99 +1565,6 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return "mixed" -def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]: - """ - Infer if we have a datetime or timedelta array. - - date: we have *only* date and maybe strings, nulls - - datetime: we have *only* datetimes and maybe strings, nulls - - timedelta: we have *only* timedeltas and maybe strings, nulls - - nat: we do not have *any* date, datetimes or timedeltas, but do have - at least a NaT - - mixed: other objects (strings, a mix of tz-aware and tz-naive, or - actual objects) - - Parameters - ---------- - arr : ndarray[object] - - Returns - ------- - str: {datetime, timedelta, date, nat, mixed} - """ - cdef: - Py_ssize_t i, n = len(arr) - bint seen_timedelta = False, seen_date = False, seen_datetime = False - bint seen_tz_aware = False, seen_tz_naive = False - bint seen_nat = False - bint seen_period = False, seen_interval = False - object v - - for i in range(n): - v = arr[i] - if isinstance(v, str): - return "mixed" - - elif v is None or util.is_nan(v): - # nan or None - pass - elif v is NaT: - seen_nat = True - elif PyDateTime_Check(v): - # datetime - seen_datetime = True - - # disambiguate between tz-naive and tz-aware - if v.tzinfo is None: - seen_tz_naive = True - else: - seen_tz_aware = True - - if seen_tz_naive and seen_tz_aware: - return "mixed" - elif util.is_datetime64_object(v): - # np.datetime64 - seen_datetime = True - elif PyDate_Check(v): - seen_date = True - elif is_timedelta(v): - # timedelta, or timedelta64 - seen_timedelta = True - elif is_period_object(v): - seen_period = True - break - elif is_interval(v): - seen_interval = True - break - else: - return "mixed" - - if seen_period: - if is_period_array(arr): - return "period" - return "mixed" - - if seen_interval: - if is_interval_array(arr): - return "interval" - return "mixed" - - if seen_date: - if not seen_datetime and not seen_timedelta: - return "date" - return "mixed" - - elif seen_datetime and not seen_timedelta: - return "datetime" - elif seen_timedelta and not seen_datetime: - return "timedelta" - elif seen_datetime and seen_timedelta: - return "mixed" - elif seen_nat: - return "nat" - - return "mixed" - - cdef inline bint is_timedelta(object o): return PyDelta_Check(o) or util.is_timedelta64_object(o) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index dd4e801af5894..a5d762a280566 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -771,7 +771,7 @@ def _infer_types( result = BooleanArray(result, bool_mask) elif result.dtype == np.object_ and use_nullable_dtypes: # read_excel sends array of datetime objects - inferred_type = lib.infer_datetimelike_array(result) + inferred_type = lib.infer_dtype(result) if inferred_type != "datetime": result = StringDtype().construct_array_type()._from_sequence(values) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 50fe8379ffa06..df2afad51abf8 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1354,79 +1354,6 @@ def test_infer_dtype_period_with_na(self, na_value): arr = np.array([na_value, Period("2011-01", freq="D"), na_value]) assert lib.infer_dtype(arr, skipna=True) == "period" - @pytest.mark.parametrize( - "data", - [ - [datetime(2017, 6, 12, 19, 
30), datetime(2017, 3, 11, 1, 15)], - [Timestamp("20170612"), Timestamp("20170311")], - [ - Timestamp("20170612", tz="US/Eastern"), - Timestamp("20170311", tz="US/Eastern"), - ], - [np.datetime64("2017-06-12"), np.datetime64("2017-03-11")], - [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)], - ], - ) - def test_infer_datetimelike_array_datetime(self, data): - assert lib.infer_datetimelike_array(data) == "datetime" - - def test_infer_datetimelike_array_date_mixed(self): - # GH49341 pre-2.0 we these were inferred as "datetime" and "timedelta", - # respectively - data = [date(2017, 6, 12), Timestamp("20170311", tz="US/Eastern")] - assert lib.infer_datetimelike_array(data) == "mixed" - - data = ([timedelta(2017, 6, 12), date(2017, 3, 11)],) - assert lib.infer_datetimelike_array(data) == "mixed" - - @pytest.mark.parametrize( - "data", - [ - [timedelta(2017, 6, 12), timedelta(2017, 3, 11)], - [np.timedelta64(2017, "D"), np.timedelta64(6, "s")], - [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)], - ], - ) - def test_infer_datetimelike_array_timedelta(self, data): - assert lib.infer_datetimelike_array(data) == "timedelta" - - def test_infer_datetimelike_array_date(self): - arr = [date(2017, 6, 12), date(2017, 3, 11)] - assert lib.infer_datetimelike_array(arr) == "date" - - @pytest.mark.parametrize( - "data", - [ - ["2017-06-12", "2017-03-11"], - [20170612, 20170311], - [20170612.5, 20170311.8], - [Dummy(), Dummy()], - [Timestamp("20170612"), Timestamp("20170311", tz="US/Eastern")], - [Timestamp("20170612"), 20170311], - [timedelta(2017, 6, 12), Timestamp("20170311", tz="US/Eastern")], - ], - ) - def test_infer_datetimelike_array_mixed(self, data): - assert lib.infer_datetimelike_array(data) == "mixed" - - @pytest.mark.parametrize( - "first, expected", - [ - [[None], "mixed"], - [[np.nan], "mixed"], - [[pd.NaT], "nat"], - [[datetime(2017, 6, 12, 19, 30), pd.NaT], "datetime"], - [[np.datetime64("2017-06-12"), pd.NaT], "datetime"], - [[date(2017, 6, 12), pd.NaT], "date"], - [[timedelta(2017, 6, 12), pd.NaT], "timedelta"], - [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"], - ], - ) - @pytest.mark.parametrize("second", [None, np.nan]) - def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected): - first.append(second) - assert lib.infer_datetimelike_array(first) == expected - def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) assert lib.infer_dtype(arr, skipna=True) == "floating" From 22497eff227e45df68b6fd65dfc1483a0baf3293 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Nov 2022 09:51:32 -0800 Subject: [PATCH 21/39] DEPR: Enforce deprecations in indexes/datetimes.py (#49607) * DEPR: date_range(closed) * Disallow partial slicing of missing * Review + failed test --- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/core/indexes/datetimes.py | 61 +++---------------- pandas/tests/frame/indexing/test_getitem.py | 13 ++-- .../indexes/datetimes/test_partial_slicing.py | 19 +++--- pandas/tests/indexing/test_loc.py | 4 +- pandas/tests/series/indexing/test_datetime.py | 2 +- 6 files changed, 28 insertions(+), 73 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 73a75667b46da..2ae50cfe4e137 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -433,9 +433,11 @@ Removal of prior version deprecations/changes - Remove :meth:`DataFrameGroupBy.pad` and :meth:`DataFrameGroupBy.backfill` (:issue:`45076`) - Remove ``numpy`` 
 argument from :func:`read_json` (:issue:`30636`)
 - Disallow passing abbreviations for ``orient`` in :meth:`DataFrame.to_dict` (:issue:`32516`)
+- Disallow partial slicing on a non-monotonic :class:`DatetimeIndex` with keys which are not in Index. This now raises a ``KeyError`` (:issue:`18531`)
 - Removed ``get_offset`` in favor of :func:`to_offset` (:issue:`30340`)
 - Removed the ``warn`` keyword in :func:`infer_freq` (:issue:`45947`)
 - Removed the ``include_start`` and ``include_end`` arguments in :meth:`DataFrame.between_time` in favor of ``inclusive`` (:issue:`43248`)
+- Removed the ``closed`` argument in :meth:`date_range` and :meth:`bdate_range` in favor of ``inclusive`` argument (:issue:`40245`)
 - Removed the ``center`` keyword in :meth:`DataFrame.expanding` (:issue:`20647`)
 - Removed the ``truediv`` keyword from :func:`eval` (:issue:`29812`)
 - Removed the ``pandas.datetime`` submodule (:issue:`30489`)
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 6fcad23e4b4c3..3a00301fbc042 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -11,7 +11,6 @@
 from typing import (
     TYPE_CHECKING,
     Hashable,
-    Literal,
 )
 import warnings
 
@@ -37,7 +36,6 @@
     DtypeObj,
     Frequency,
     IntervalClosedType,
-    IntervalLeftRight,
     TimeAmbiguous,
     TimeNonexistent,
     npt,
@@ -46,7 +44,6 @@
     cache_readonly,
     doc,
 )
-from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.common import (
     is_datetime64_dtype,
@@ -699,24 +696,21 @@ def check_str_or_none(point) -> bool:
             return Index.slice_indexer(self, start, end, step)
 
         mask = np.array(True)
-        deprecation_mask = np.array(True)
+        raise_mask = np.array(True)
         if start is not None:
             start_casted = self._maybe_cast_slice_bound(start, "left")
             mask = start_casted <= self
-            deprecation_mask = start_casted == self
+            raise_mask = start_casted == self
 
         if end is not None:
             end_casted = self._maybe_cast_slice_bound(end, "right")
             mask = (self <= end_casted) & mask
-            deprecation_mask = (end_casted == self) | deprecation_mask
+            raise_mask = (end_casted == self) | raise_mask
 
-        if not deprecation_mask.any():
-            warnings.warn(
+        if not raise_mask.any():
+            raise KeyError(
                 "Value based partial slicing on non-monotonic DatetimeIndexes "
-                "with non-existing keys is deprecated and will raise a "
-                "KeyError in a future Version.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
+                "with non-existing keys is not allowed.",
             )
         indexer = mask.nonzero()[0][::step]
         if len(indexer) == len(self):
@@ -829,8 +823,7 @@ def date_range(
     tz=None,
     normalize: bool = False,
     name: Hashable = None,
-    closed: Literal["left", "right"] | None | lib.NoDefault = lib.no_default,
-    inclusive: IntervalClosedType | None = None,
+    inclusive: IntervalClosedType = "both",
     **kwargs,
 ) -> DatetimeIndex:
     """
@@ -865,13 +858,6 @@ def date_range(
         Normalize start/end dates to midnight before generating date range.
     name : str, default None
         Name of the resulting DatetimeIndex.
-    closed : {None, 'left', 'right'}, optional
-        Make the interval closed with respect to the given frequency to
-        the 'left', 'right', or both sides (None, the default).
-
-        .. deprecated:: 1.4.0
-            Argument `closed` has been deprecated to standardize boundary inputs.
-            Use `inclusive` instead, to set each bound as closed or open.
     inclusive : {"both", "neither", "left", "right"}, default "both"
         Include boundaries; Whether to set each bound as closed or open.
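
For reference, the behavior this patch enforces looks roughly like the sketch below. It mirrors the updated tests later in the diff; the sample values are illustrative only and not part of the patch:

    import pandas as pd

    # `closed` is removed; `inclusive` is the replacement and accepts
    # "both" (the default), "neither", "left" or "right".
    pd.date_range("2022-01-01", "2022-01-04", inclusive="left")
    # DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03'], freq='D')

    # Partial slicing on a non-monotonic DatetimeIndex with a key that is
    # not present now raises a KeyError instead of a FutureWarning.
    ser = pd.Series(range(3), index=pd.to_datetime(["2016", "2019", "2017"]))
    ser.loc["2022":]  # KeyError: Value based partial slicing on non-monotonic ...
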
@@ -987,28 +973,6 @@ def date_range( DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq='D') """ - if inclusive is not None and closed is not lib.no_default: - raise ValueError( - "Deprecated argument `closed` cannot be passed" - "if argument `inclusive` is not None" - ) - if closed is not lib.no_default: - warnings.warn( - "Argument `closed` is deprecated in favor of `inclusive`.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if closed is None: - inclusive = "both" - elif closed in ("left", "right"): - inclusive = closed - else: - raise ValueError( - "Argument `closed` has to be either 'left', 'right' or None" - ) - elif inclusive is None: - inclusive = "both" - if freq is None and com.any_none(periods, start, end): freq = "D" @@ -1035,8 +999,7 @@ def bdate_range( name: Hashable = None, weekmask=None, holidays=None, - closed: IntervalLeftRight | lib.NoDefault | None = lib.no_default, - inclusive: IntervalClosedType | None = None, + inclusive: IntervalClosedType = "both", **kwargs, ) -> DatetimeIndex: """ @@ -1068,13 +1031,6 @@ def bdate_range( Dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar``, only used when custom frequency strings are passed. - closed : str, default None - Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None). - - .. deprecated:: 1.4.0 - Argument `closed` has been deprecated to standardize boundary inputs. - Use `inclusive` instead, to set each bound as closed or open. inclusive : {"both", "neither", "left", "right"}, default "both" Include boundaries; Whether to set each bound as closed or open. @@ -1131,7 +1087,6 @@ def bdate_range( tz=tz, normalize=normalize, name=name, - closed=closed, inclusive=inclusive, **kwargs, ) diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 499c5c2afed4c..f17e2a197a82b 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -455,15 +455,10 @@ def test_getitem_datetime_slice(self): ] ), ) - with tm.assert_produces_warning(FutureWarning): - result = df["2011-01-01":"2011-11-01"] - expected = DataFrame( - {"a": 0}, - index=DatetimeIndex( - ["11.01.2011 22:00", "11.01.2011 23:00", "2011-01-13 00:00"] - ), - ) - tm.assert_frame_equal(result, expected) + with pytest.raises( + KeyError, match="Value based partial slicing on non-monotonic" + ): + df["2011-01-01":"2011-11-01"] class TestGetitemDeprecatedIndexers: diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index cdf78c97c45b5..b4ef62604d888 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -376,23 +376,24 @@ def test_partial_slicing_with_multiindex_series(self): result = df2.loc[Timestamp("2000-1-4")] tm.assert_frame_equal(result, expected) - def test_partial_slice_doesnt_require_monotonicity(self): - # For historical reasons. 
+ def test_partial_slice_requires_monotonicity(self): + # Disallowed since 2.0 (GH 37819) ser = Series(np.arange(10), date_range("2014-01-01", periods=10)) nonmonotonic = ser[[3, 5, 4]] - expected = nonmonotonic.iloc[:0] timestamp = Timestamp("2014-01-10") - with tm.assert_produces_warning(FutureWarning): - result = nonmonotonic["2014-01-10":] - tm.assert_series_equal(result, expected) + with pytest.raises( + KeyError, match="Value based partial slicing on non-monotonic" + ): + nonmonotonic["2014-01-10":] with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): nonmonotonic[timestamp:] - with tm.assert_produces_warning(FutureWarning): - result = nonmonotonic.loc["2014-01-10":] - tm.assert_series_equal(result, expected) + with pytest.raises( + KeyError, match="Value based partial slicing on non-monotonic" + ): + nonmonotonic.loc["2014-01-10":] with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): nonmonotonic.loc[timestamp:] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 3b75f9d7ce1be..0068a0a0ded67 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2460,7 +2460,9 @@ def test_loc_getitem_slice_unordered_dt_index(self, frame_or_series, start): [1, 2, 3], index=[Timestamp("2016"), Timestamp("2019"), Timestamp("2017")], ) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises( + KeyError, match="Value based partial slicing on non-monotonic" + ): obj.loc[start:"2022"] @pytest.mark.parametrize("value", [1, 1.5]) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 54b9d26b614c8..b977c78c635da 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -385,7 +385,7 @@ def compare(slobj): tm.assert_series_equal(result, expected) compare(slice("2011-01-01", "2011-01-15")) - with tm.assert_produces_warning(FutureWarning): + with pytest.raises(KeyError, match="Value based partial slicing on non-monotonic"): compare(slice("2010-12-30", "2011-01-15")) compare(slice("2011-01-01", "2011-01-16")) From d05207acb27a57b831bb117b50bf66c4af84301f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Nov 2022 11:36:13 -0800 Subject: [PATCH 22/39] TST/CI: Follow up fix test_write_fspath_all (#49621) --- pandas/tests/io/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 5b3b5602c95bc..1945012f93b42 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -361,7 +361,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): writer(string, **writer_kwargs) writer(mypath, **writer_kwargs) with open(string, "rb") as f_str, open(fspath, "rb") as f_path: - if writer == "to_excel": + if writer_name == "to_excel": # binary representation of excel contains time creation # data that causes flaky CI failures result = pd.read_excel(f_str, **writer_kwargs) From 74cd050f6c1c6db1131283c1905f6478b136a8b3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Nov 2022 11:39:19 -0800 Subject: [PATCH 23/39] CLN: test_nanops.py (#49423) --- pandas/tests/test_nanops.py | 487 +++++++++++++++++++++++++----------- 1 file changed, 334 insertions(+), 153 deletions(-) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 
0e64181bd46a7..ae8791a774ed5 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -29,6 +29,175 @@ def skipna(request): return request.param +@pytest.fixture +def disable_bottleneck(monkeypatch): + with monkeypatch.context() as m: + m.setattr(nanops, "_USE_BOTTLENECK", False) + yield + + +@pytest.fixture +def arr_shape(): + return 11, 7 + + +@pytest.fixture +def arr_float(arr_shape): + np.random.seed(11235) + return np.random.randn(*arr_shape) + + +@pytest.fixture +def arr_complex(arr_float): + return arr_float + arr_float * 1j + + +@pytest.fixture +def arr_int(arr_shape): + np.random.seed(11235) + return np.random.randint(-10, 10, arr_shape) + + +@pytest.fixture +def arr_bool(arr_shape): + np.random.seed(11235) + return np.random.randint(0, 2, arr_shape) == 0 + + +@pytest.fixture +def arr_str(arr_float): + return np.abs(arr_float).astype("S") + + +@pytest.fixture +def arr_utf(arr_float): + return np.abs(arr_float).astype("U") + + +@pytest.fixture +def arr_date(arr_shape): + np.random.seed(11235) + return np.random.randint(0, 20000, arr_shape).astype("M8[ns]") + + +@pytest.fixture +def arr_tdelta(arr_shape): + np.random.seed(11235) + return np.random.randint(0, 20000, arr_shape).astype("m8[ns]") + + +@pytest.fixture +def arr_nan(arr_shape): + return np.tile(np.nan, arr_shape) + + +@pytest.fixture +def arr_float_nan(arr_float, arr_nan): + return np.vstack([arr_float, arr_nan]) + + +@pytest.fixture +def arr_nan_float1(arr_nan, arr_float): + return np.vstack([arr_nan, arr_float]) + + +@pytest.fixture +def arr_nan_nan(arr_nan): + return np.vstack([arr_nan, arr_nan]) + + +@pytest.fixture +def arr_inf(arr_float): + return arr_float * np.inf + + +@pytest.fixture +def arr_float_inf(arr_float, arr_inf): + return np.vstack([arr_float, arr_inf]) + + +@pytest.fixture +def arr_nan_inf(arr_nan, arr_inf): + return np.vstack([arr_nan, arr_inf]) + + +@pytest.fixture +def arr_float_nan_inf(arr_float, arr_nan, arr_inf): + return np.vstack([arr_float, arr_nan, arr_inf]) + + +@pytest.fixture +def arr_nan_nan_inf(arr_nan, arr_inf): + return np.vstack([arr_nan, arr_nan, arr_inf]) + + +@pytest.fixture +def arr_obj( + arr_float, arr_int, arr_bool, arr_complex, arr_str, arr_utf, arr_date, arr_tdelta +): + return np.vstack( + [ + arr_float.astype("O"), + arr_int.astype("O"), + arr_bool.astype("O"), + arr_complex.astype("O"), + arr_str.astype("O"), + arr_utf.astype("O"), + arr_date.astype("O"), + arr_tdelta.astype("O"), + ] + ) + + +@pytest.fixture +def arr_nan_nanj(arr_nan): + with np.errstate(invalid="ignore"): + return arr_nan + arr_nan * 1j + + +@pytest.fixture +def arr_complex_nan(arr_complex, arr_nan_nanj): + with np.errstate(invalid="ignore"): + return np.vstack([arr_complex, arr_nan_nanj]) + + +@pytest.fixture +def arr_nan_infj(arr_inf): + with np.errstate(invalid="ignore"): + return arr_inf * 1j + + +@pytest.fixture +def arr_complex_nan_infj(arr_complex, arr_nan_infj): + with np.errstate(invalid="ignore"): + return np.vstack([arr_complex, arr_nan_infj]) + + +@pytest.fixture +def arr_float_1d(arr_float): + return arr_float[:, 0] + + +@pytest.fixture +def arr_nan_1d(arr_nan): + return arr_nan[:, 0] + + +@pytest.fixture +def arr_float_nan_1d(arr_float_nan): + return arr_float_nan[:, 0] + + +@pytest.fixture +def arr_float1_nan_1d(arr_float1_nan): + return arr_float1_nan[:, 0] + + +@pytest.fixture +def arr_nan_float1_1d(arr_nan_float1): + return arr_nan_float1[:, 0] + + class TestnanopsDataFrame: def setup_method(self): np.random.seed(11235) @@ -299,45 +468,6 @@ def test_nanmean(self, 
skipna): nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False ) - @pytest.mark.parametrize("val", [2**55, -(2**55), 20150515061816532]) - def test_nanmean_overflow(self, val): - # GH 10155 - # In the previous implementation mean can overflow for int dtypes, it - # is now consistent with numpy - - ser = Series(val, index=range(500), dtype=np.int64) - result = ser.mean() - np_result = ser.values.mean() - assert result == val - assert result == np_result - assert result.dtype == np.float64 - - @pytest.mark.parametrize( - "dtype", - [ - np.int16, - np.int32, - np.int64, - np.float32, - np.float64, - getattr(np, "float128", None), - ], - ) - def test_returned_dtype(self, dtype): - if dtype is None: - # no float128 available - return - - ser = Series(range(10), dtype=dtype) - group_a = ["mean", "std", "var", "skew", "kurt"] - group_b = ["min", "max"] - for method in group_a + group_b: - result = getattr(ser, method)() - if is_integer_dtype(dtype) and method in group_a: - assert result.dtype == np.float64 - else: - assert result.dtype == dtype - def test_nanmedian(self, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) @@ -623,124 +753,137 @@ def test_nancov(self): targ1 = np.cov(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] self.check_nancorr_nancov_1d(nanops.nancov, targ0, targ1) - @pytest.mark.parametrize( - "op,nanop", - [ - (operator.eq, nanops.naneq), - (operator.ne, nanops.nanne), - (operator.gt, nanops.nangt), - (operator.ge, nanops.nange), - (operator.lt, nanops.nanlt), - (operator.le, nanops.nanle), - ], - ) - def test_nan_comparison(self, op, nanop): - targ0 = op(self.arr_float, self.arr_float1) - arr_float = self.arr_float - arr_float1 = self.arr_float1 - arr_nan = self.arr_nan - arr_nan_nan = self.arr_nan_nan - arr_float_nan = self.arr_float_nan - arr_float1_nan = self.arr_float1_nan - arr_nan_float1 = self.arr_nan_float1 - - while targ0.ndim: - res0 = nanop(arr_float, arr_float1) - tm.assert_almost_equal(targ0, res0) - - if targ0.ndim > 1: - targ1 = np.vstack([targ0, arr_nan]) - else: - targ1 = np.hstack([targ0, arr_nan]) - res1 = nanop(arr_float_nan, arr_float1_nan) - tm.assert_numpy_array_equal(targ1, res1, check_dtype=False) - - targ2 = arr_nan_nan - res2 = nanop(arr_float_nan, arr_nan_float1) - tm.assert_numpy_array_equal(targ2, res2, check_dtype=False) - - # Lower dimension for next step in the loop - arr_float = np.take(arr_float, 0, axis=-1) - arr_float1 = np.take(arr_float1, 0, axis=-1) - arr_nan = np.take(arr_nan, 0, axis=-1) - arr_nan_nan = np.take(arr_nan_nan, 0, axis=-1) - arr_float_nan = np.take(arr_float_nan, 0, axis=-1) - arr_float1_nan = np.take(arr_float1_nan, 0, axis=-1) - arr_nan_float1 = np.take(arr_nan_float1, 0, axis=-1) - targ0 = np.take(targ0, 0, axis=-1) - @pytest.mark.parametrize( - "arr, correct", - [ - ("arr_complex", False), - ("arr_int", False), - ("arr_bool", False), - ("arr_str", False), - ("arr_utf", False), - ("arr_complex", False), - ("arr_complex_nan", False), - ("arr_nan_nanj", False), - ("arr_nan_infj", True), - ("arr_complex_nan_infj", True), - ], - ) - def test__has_infs_non_float(self, arr, correct): - val = getattr(self, arr) - while getattr(val, "ndim", True): - res0 = nanops._has_infs(val) - if correct: - assert res0 - else: - assert not res0 +@pytest.mark.parametrize( + "op,nanop", + [ + (operator.eq, nanops.naneq), + (operator.ne, nanops.nanne), + (operator.gt, nanops.nangt), + (operator.ge, nanops.nange), + (operator.lt, nanops.nanlt), + (operator.le, 
nanops.nanle), + ], +) +def test_nan_comparison(request, op, nanop, disable_bottleneck): + arr_float = request.getfixturevalue("arr_float") + arr_float1 = request.getfixturevalue("arr_float") + targ0 = op(arr_float, arr_float1) + arr_nan = request.getfixturevalue("arr_nan") + arr_nan_nan = request.getfixturevalue("arr_nan_nan") + arr_float_nan = request.getfixturevalue("arr_float_nan") + arr_float1_nan = request.getfixturevalue("arr_float_nan") + arr_nan_float1 = request.getfixturevalue("arr_nan_float1") + + while targ0.ndim: + res0 = nanop(arr_float, arr_float1) + tm.assert_almost_equal(targ0, res0) + + if targ0.ndim > 1: + targ1 = np.vstack([targ0, arr_nan]) + else: + targ1 = np.hstack([targ0, arr_nan]) + res1 = nanop(arr_float_nan, arr_float1_nan) + tm.assert_numpy_array_equal(targ1, res1, check_dtype=False) + + targ2 = arr_nan_nan + res2 = nanop(arr_float_nan, arr_nan_float1) + tm.assert_numpy_array_equal(targ2, res2, check_dtype=False) + + # Lower dimension for next step in the loop + arr_float = np.take(arr_float, 0, axis=-1) + arr_float1 = np.take(arr_float1, 0, axis=-1) + arr_nan = np.take(arr_nan, 0, axis=-1) + arr_nan_nan = np.take(arr_nan_nan, 0, axis=-1) + arr_float_nan = np.take(arr_float_nan, 0, axis=-1) + arr_float1_nan = np.take(arr_float1_nan, 0, axis=-1) + arr_nan_float1 = np.take(arr_nan_float1, 0, axis=-1) + targ0 = np.take(targ0, 0, axis=-1) - if not hasattr(val, "ndim"): - break - # Reduce dimension for next step in the loop - val = np.take(val, 0, axis=-1) +@pytest.mark.parametrize( + "arr, correct", + [ + ("arr_complex", False), + ("arr_int", False), + ("arr_bool", False), + ("arr_str", False), + ("arr_utf", False), + ("arr_complex", False), + ("arr_complex_nan", False), + ("arr_nan_nanj", False), + ("arr_nan_infj", True), + ("arr_complex_nan_infj", True), + ], +) +def test_has_infs_non_float(request, arr, correct, disable_bottleneck): + val = request.getfixturevalue(arr) + while getattr(val, "ndim", True): + res0 = nanops._has_infs(val) + if correct: + assert res0 + else: + assert not res0 + + if not hasattr(val, "ndim"): + break + + # Reduce dimension for next step in the loop + val = np.take(val, 0, axis=-1) - @pytest.mark.parametrize( - "arr, correct", - [ - ("arr_float", False), - ("arr_nan", False), - ("arr_float_nan", False), - ("arr_nan_nan", False), - ("arr_float_inf", True), - ("arr_inf", True), - ("arr_nan_inf", True), - ("arr_float_nan_inf", True), - ("arr_nan_nan_inf", True), - ], - ) - @pytest.mark.parametrize("astype", [None, "f4", "f2"]) - def test__has_infs_floats(self, arr, correct, astype): - val = getattr(self, arr) - if astype is not None: - val = val.astype(astype) - while getattr(val, "ndim", True): - res0 = nanops._has_infs(val) - if correct: - assert res0 - else: - assert not res0 - if not hasattr(val, "ndim"): - break +@pytest.mark.parametrize( + "arr, correct", + [ + ("arr_float", False), + ("arr_nan", False), + ("arr_float_nan", False), + ("arr_nan_nan", False), + ("arr_float_inf", True), + ("arr_inf", True), + ("arr_nan_inf", True), + ("arr_float_nan_inf", True), + ("arr_nan_nan_inf", True), + ], +) +@pytest.mark.parametrize("astype", [None, "f4", "f2"]) +def test_has_infs_floats(request, arr, correct, astype, disable_bottleneck): + val = request.getfixturevalue(arr) + if astype is not None: + val = val.astype(astype) + while getattr(val, "ndim", True): + res0 = nanops._has_infs(val) + if correct: + assert res0 + else: + assert not res0 - # Reduce dimension for next step in the loop - val = np.take(val, 0, axis=-1) + if not hasattr(val, 
"ndim"): + break - def test__bn_ok_dtype(self): - assert nanops._bn_ok_dtype(self.arr_float.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_complex.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_int.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_bool.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_str.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_utf.dtype, "test") - assert not nanops._bn_ok_dtype(self.arr_date.dtype, "test") - assert not nanops._bn_ok_dtype(self.arr_tdelta.dtype, "test") - assert not nanops._bn_ok_dtype(self.arr_obj.dtype, "test") + # Reduce dimension for next step in the loop + val = np.take(val, 0, axis=-1) + + +@pytest.mark.parametrize( + "fixture", ["arr_float", "arr_complex", "arr_int", "arr_bool", "arr_str", "arr_utf"] +) +def test_bn_ok_dtype(fixture, request, disable_bottleneck): + obj = request.getfixturevalue(fixture) + assert nanops._bn_ok_dtype(obj.dtype, "test") + + +@pytest.mark.parametrize( + "fixture", + [ + "arr_date", + "arr_tdelta", + "arr_obj", + ], +) +def test_bn_not_ok_dtype(fixture, request, disable_bottleneck): + obj = request.getfixturevalue(fixture) + assert not nanops._bn_ok_dtype(obj.dtype, "test") class TestEnsureNumeric: @@ -1111,7 +1254,7 @@ def test_nanops_independent_of_mask_param(operation): @pytest.mark.parametrize("min_count", [-1, 0]) -def test_check_below_min_count__negative_or_zero_min_count(min_count): +def test_check_below_min_count_negative_or_zero_min_count(min_count): # GH35227 result = nanops.check_below_min_count((21, 37), None, min_count) expected_result = False @@ -1122,7 +1265,7 @@ def test_check_below_min_count__negative_or_zero_min_count(min_count): "mask", [None, np.array([False, False, True]), np.array([True] + 9 * [False])] ) @pytest.mark.parametrize("min_count, expected_result", [(1, False), (101, True)]) -def test_check_below_min_count__positive_min_count(mask, min_count, expected_result): +def test_check_below_min_count_positive_min_count(mask, min_count, expected_result): # GH35227 shape = (10, 10) result = nanops.check_below_min_count(shape, mask, min_count) @@ -1132,7 +1275,7 @@ def test_check_below_min_count__positive_min_count(mask, min_count, expected_res @td.skip_if_windows @td.skip_if_32bit @pytest.mark.parametrize("min_count, expected_result", [(1, False), (2812191852, True)]) -def test_check_below_min_count__large_shape(min_count, expected_result): +def test_check_below_min_count_large_shape(min_count, expected_result): # GH35227 large shape used to show that the issue is fixed shape = (2244367, 1253) result = nanops.check_below_min_count(shape, mask=None, min_count=min_count) @@ -1143,3 +1286,41 @@ def test_check_below_min_count__large_shape(min_count, expected_result): def test_check_bottleneck_disallow(any_real_numpy_dtype, func): # GH 42878 bottleneck sometimes produces unreliable results for mean and sum assert not nanops._bn_ok_dtype(np.dtype(any_real_numpy_dtype).type, func) + + +@pytest.mark.parametrize("val", [2**55, -(2**55), 20150515061816532]) +def test_nanmean_overflow(disable_bottleneck, val): + # GH 10155 + # In the previous implementation mean can overflow for int dtypes, it + # is now consistent with numpy + + ser = Series(val, index=range(500), dtype=np.int64) + result = ser.mean() + np_result = ser.values.mean() + assert result == val + assert result == np_result + assert result.dtype == np.float64 + + +@pytest.mark.parametrize( + "dtype", + [ + np.int16, + np.int32, + np.int64, + np.float32, + np.float64, + getattr(np, "float128", None), + ], +) 
+@pytest.mark.parametrize("method", ["mean", "std", "var", "skew", "kurt", "min", "max"]) +def test_returned_dtype(disable_bottleneck, dtype, method): + if dtype is None: + pytest.skip("np.float128 not available") + + ser = Series(range(10), dtype=dtype) + result = getattr(ser, method)() + if is_integer_dtype(dtype) and method not in ["min", "max"]: + assert result.dtype == np.float64 + else: + assert result.dtype == dtype From 0533e094364a5b0889b173475d2ed55f63bec25b Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 10 Nov 2022 15:32:43 -0500 Subject: [PATCH 24/39] REGR: Better warning in pivot_table when dropping nuisance columns (#49615) * REGR: Better warning in pivot_table when dropping nuisance columns * type-hint fixups --- pandas/core/reshape/pivot.py | 14 ++++++- pandas/tests/reshape/test_pivot.py | 8 ++-- pandas/tests/util/test_rewrite_warning.py | 39 ++++++++++++++++++++ pandas/util/_exceptions.py | 45 +++++++++++++++++++++++ 4 files changed, 101 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/util/test_rewrite_warning.py diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 37e78c7dbf7a2..810a428098df2 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -21,6 +21,7 @@ Appender, Substitution, ) +from pandas.util._exceptions import rewrite_warning from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( @@ -164,7 +165,18 @@ def __internal_pivot_table( values = list(values) grouped = data.groupby(keys, observed=observed, sort=sort) - agged = grouped.agg(aggfunc) + msg = ( + "pivot_table dropped a column because it failed to aggregate. This behavior " + "is deprecated and will raise in a future version of pandas. Select only the " + "columns that can be aggregated." 
+ ) + with rewrite_warning( + target_message="The default value of numeric_only", + target_category=FutureWarning, + new_message=msg, + ): + agged = grouped.agg(aggfunc) + if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 14ea670fa6cf9..f9119ea43160b 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -147,7 +147,7 @@ def test_pivot_table_nocols(self): df = DataFrame( {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} ) - msg = "The default value of numeric_only" + msg = "pivot_table dropped a column because it failed to aggregate" with tm.assert_produces_warning(FutureWarning, match=msg): rs = df.pivot_table(columns="cols", aggfunc=np.sum) xp = df.pivot_table(index="cols", aggfunc=np.sum).T @@ -911,7 +911,7 @@ def test_no_col(self, data): # to help with a buglet data.columns = [k * 2 for k in data.columns] - msg = "The default value of numeric_only" + msg = "pivot_table dropped a column because it failed to aggregate" with tm.assert_produces_warning(FutureWarning, match=msg): table = data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) for value_col in table.columns: @@ -975,7 +975,7 @@ def test_margin_with_only_columns_defined( } ) - msg = "The default value of numeric_only" + msg = "pivot_table dropped a column because it failed to aggregate" with tm.assert_produces_warning(FutureWarning, match=msg): result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns) @@ -2004,7 +2004,7 @@ def test_pivot_string_func_vs_func(self, f, f_numpy, data): # GH #18713 # for consistency purposes - msg = "The default value of numeric_only" + msg = "pivot_table dropped a column because it failed to aggregate" with tm.assert_produces_warning(FutureWarning, match=msg): result = pivot_table(data, index="A", columns="B", aggfunc=f) expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy) diff --git a/pandas/tests/util/test_rewrite_warning.py b/pandas/tests/util/test_rewrite_warning.py new file mode 100644 index 0000000000000..f847a06d8ea8d --- /dev/null +++ b/pandas/tests/util/test_rewrite_warning.py @@ -0,0 +1,39 @@ +import warnings + +import pytest + +from pandas.util._exceptions import rewrite_warning + +import pandas._testing as tm + + +@pytest.mark.parametrize( + "target_category, target_message, hit", + [ + (FutureWarning, "Target message", True), + (FutureWarning, "Target", True), + (FutureWarning, "get mess", True), + (FutureWarning, "Missed message", False), + (DeprecationWarning, "Target message", False), + ], +) +@pytest.mark.parametrize( + "new_category", + [ + None, + DeprecationWarning, + ], +) +def test_rewrite_warning(target_category, target_message, hit, new_category): + new_message = "Rewritten message" + if hit: + expected_category = new_category if new_category else target_category + expected_message = new_message + else: + expected_category = FutureWarning + expected_message = "Target message" + with tm.assert_produces_warning(expected_category, match=expected_message): + with rewrite_warning( + target_message, target_category, new_message, new_category + ): + warnings.warn(message="Target message", category=FutureWarning) diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index f3a640feb46fc..1eefd06a133fb 100644 --- a/pandas/util/_exceptions.py +++ 
b/pandas/util/_exceptions.py @@ -3,7 +3,9 @@ import contextlib import inspect import os +import re from typing import Generator +import warnings @contextlib.contextmanager @@ -47,3 +49,46 @@ def find_stack_level() -> int: else: break return n + + +@contextlib.contextmanager +def rewrite_warning( + target_message: str, + target_category: type[Warning], + new_message: str, + new_category: type[Warning] | None = None, +) -> Generator[None, None, None]: + """ + Rewrite the message of a warning. + + Parameters + ---------- + target_message : str + Warning message to match. + target_category : Warning + Warning type to match. + new_message : str + New warning message to emit. + new_category : Warning or None, default None + New warning type to emit. When None, will be the same as target_category. + """ + if new_category is None: + new_category = target_category + with warnings.catch_warnings(record=True) as record: + yield + if len(record) > 0: + match = re.compile(target_message) + for warning in record: + if warning.category is target_category and re.search( + match, str(warning.message) + ): + category = new_category + message: Warning | str = new_message + else: + category, message = warning.category, warning.message + warnings.warn_explicit( + message=message, + category=category, + filename=warning.filename, + lineno=warning.lineno, + ) From 9d15690fda2d2892eb5acf2af589c7d5094cada0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 10 Nov 2022 21:33:22 +0100 Subject: [PATCH 25/39] REGR: MultiIndex.join does not work for ea dtypes (#49284) * REGR: MultiIndex.join does not work for ea dtypes * Update base.py --- doc/source/whatsnew/v1.5.2.rst | 1 + pandas/core/indexes/base.py | 6 ++-- pandas/tests/indexes/multi/test_join.py | 48 +++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.2.rst b/doc/source/whatsnew/v1.5.2.rst index e65be3bcecd76..572d6c74e767f 100644 --- a/doc/source/whatsnew/v1.5.2.rst +++ b/doc/source/whatsnew/v1.5.2.rst @@ -13,6 +13,7 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`MultiIndex.join` for extension array dtypes (:issue:`49277`) - Fixed regression in :meth:`Series.replace` raising ``RecursionError`` with numeric dtype and when specifying ``value=None`` (:issue:`45725`) - Fixed regression in :meth:`DataFrame.plot` preventing :class:`~matplotlib.colors.Colormap` instance from being passed using the ``colormap`` argument if Matplotlib 3.6+ is used (:issue:`49374`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4fbf162e4ae12..e7b1ad1a48bd6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4460,8 +4460,10 @@ def join( return self._join_non_unique(other, how=how) elif not self.is_unique or not other.is_unique: if self.is_monotonic_increasing and other.is_monotonic_increasing: - if self._can_use_libjoin: + if not is_interval_dtype(self.dtype): # otherwise we will fall through to _join_via_get_indexer + # GH#39133 + # go through object dtype for ea till engine is supported properly return self._join_monotonic(other, how=how) else: return self._join_non_unique(other, how=how) @@ -4836,7 +4838,7 @@ def _wrap_joined_index(self: _IndexT, joined: ArrayLike, other: _IndexT) -> _Ind return self._constructor(joined, name=name) # type: ignore[return-value] else: name = get_op_result_name(self, other) - return self._constructor._with_infer(joined, name=name) + return self._constructor._with_infer(joined, name=name, dtype=self.dtype) @cache_readonly def _can_use_libjoin(self) -> bool: diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 23d5325dde2bb..aa2f2ca5af7bd 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -6,6 +6,8 @@ Index, Interval, MultiIndex, + Series, + StringDtype, ) import pandas._testing as tm @@ -161,6 +163,52 @@ def test_join_overlapping_interval_level(): tm.assert_index_equal(result, expected) +def test_join_midx_ea(): + # GH#49277 + midx = MultiIndex.from_arrays( + [Series([1, 1, 3], dtype="Int64"), Series([1, 2, 3], dtype="Int64")], + names=["a", "b"], + ) + midx2 = MultiIndex.from_arrays( + [Series([1], dtype="Int64"), Series([3], dtype="Int64")], names=["a", "c"] + ) + result = midx.join(midx2, how="inner") + expected = MultiIndex.from_arrays( + [ + Series([1, 1], dtype="Int64"), + Series([1, 2], dtype="Int64"), + Series([3, 3], dtype="Int64"), + ], + names=["a", "b", "c"], + ) + tm.assert_index_equal(result, expected) + + +def test_join_midx_string(): + # GH#49277 + midx = MultiIndex.from_arrays( + [ + Series(["a", "a", "c"], dtype=StringDtype()), + Series(["a", "b", "c"], dtype=StringDtype()), + ], + names=["a", "b"], + ) + midx2 = MultiIndex.from_arrays( + [Series(["a"], dtype=StringDtype()), Series(["c"], dtype=StringDtype())], + names=["a", "c"], + ) + result = midx.join(midx2, how="inner") + expected = MultiIndex.from_arrays( + [ + Series(["a", "a"], dtype=StringDtype()), + Series(["a", "b"], dtype=StringDtype()), + Series(["c", "c"], dtype=StringDtype()), + ], + names=["a", "b", "c"], + ) + tm.assert_index_equal(result, expected) + + def test_join_multi_with_nan(): # GH29252 df1 = DataFrame( From a835fba2d2e927bbbb5b5976514343a82232caab Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 10 Nov 2022 15:34:48 -0500 Subject: [PATCH 26/39] BUG: groupby with sort=False still sorts an ordered categorical (#49613) * BUG: groupby with sort=False still sorts an ordered 
categorical * Add versionchanged --- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/core/groupby/categorical.py | 4 +- pandas/core/groupby/groupby.py | 4 +- pandas/core/groupby/grouper.py | 2 +- pandas/core/shared_docs.py | 6 + pandas/tests/groupby/test_categorical.py | 155 ++++++----------------- 6 files changed, 55 insertions(+), 118 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 2ae50cfe4e137..151d853166563 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -651,6 +651,8 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.resample` produces inconsistent results when passing empty DataFrame (:issue:`47705`) - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`) - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`) +- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` when grouping on categorical data would sort result values even when used with ``sort=False`` (:issue:`42482`) +- Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 0a8e12caead1c..20248cd69bfb9 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -53,7 +53,7 @@ def recode_for_groupby( unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] - if c.ordered or sort: + if sort: take_codes = np.sort(take_codes) # we recode according to the uniques @@ -75,7 +75,7 @@ def recode_for_groupby( all_codes = np.arange(c.categories.nunique()) # GH 38140: exclude nan from indexer for categories unique_notnan_codes = unique1d(c.codes[c.codes != -1]) - if c.ordered: + if sort: unique_notnan_codes = np.sort(unique_notnan_codes) if len(all_codes) > len(unique_notnan_codes): # GH 13179: All categories need to be present, even if missing from the data diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e0baaaeb3c8f9..f1c18b7762f66 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4116,7 +4116,9 @@ def _reindex_output( # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index" levels_list.append(qs) # type: ignore[arg-type] names = names + [None] - index, _ = MultiIndex.from_product(levels_list, names=names).sortlevel() + index = MultiIndex.from_product(levels_list, names=names) + if self.sort: + index = index.sortlevel()[0] if self.as_index: # Always holds for SeriesGroupBy unless GH#36507 is implemented diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7da7ea119cea3..688dcb44c31f3 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -655,7 +655,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if self._observed: ucodes = algorithms.unique1d(cat.codes) ucodes = ucodes[ucodes != -1] - if self._sort or cat.ordered: + if self._sort: ucodes = np.sort(ucodes) else: ucodes = np.arange(len(categories)) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index cfabe05ec9e3b..07dc203e556e8 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -119,6 +119,12 @@ Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. 
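
Concretely, the new ``sort=False`` behavior reads like this (an illustrative sketch mirroring the updated ``test_preserve_categories`` later in this patch; the data is made up):

    import pandas as pd

    cat = pd.Categorical(list("ba"), categories=list("abc"), ordered=True)
    df = pd.DataFrame({"A": cat, "v": [1, 2]})

    # Previously an ordered categorical grouper was sorted despite
    # sort=False; now the groups keep their order of appearance, with
    # unobserved categories appended: index ["b", "a", "c"].
    df.groupby("A", sort=False, observed=False)["v"].first()
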
Groupby preserves the order of rows within each group. + + .. versionchanged:: 2.0.0 + + Specifying ``sort=False`` with an ordered categorical grouper will no + longer sort the values. + group_keys : bool, optional When calling apply and the ``by`` argument produces a like-indexed (i.e. :ref:`a transform `) result, add group keys to diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8fe1dc010211a..1e2bcb58110dd 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -818,12 +818,14 @@ def test_preserve_categories(): # ordered=True df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)}) - index = CategoricalIndex(categories, categories, ordered=True, name="A") + sort_index = CategoricalIndex(categories, categories, ordered=True, name="A") + nosort_index = CategoricalIndex(list("bac"), categories, ordered=True, name="A") tm.assert_index_equal( - df.groupby("A", sort=True, observed=False).first().index, index + df.groupby("A", sort=True, observed=False).first().index, sort_index ) + # GH#42482 - don't sort result when sort=False, even when ordered=True tm.assert_index_equal( - df.groupby("A", sort=False, observed=False).first().index, index + df.groupby("A", sort=False, observed=False).first().index, nosort_index ) # ordered=False @@ -972,8 +974,11 @@ def test_sort(): tm.assert_series_equal(res, exp) -def test_sort2(): +@pytest.mark.parametrize("ordered", [True, False]) +def test_sort2(sort, ordered): # dataframe groupby sort was being ignored # GH 8868 + # GH#48749 - don't change order of categories + # GH#42482 - don't sort result when sort=False, even when ordered=True df = DataFrame( [ ["(7.5, 10]", 10, 10], @@ -986,53 +991,28 @@ def test_sort2(): ], columns=["range", "foo", "bar"], ) - df["range"] = Categorical(df["range"], ordered=True) - index = CategoricalIndex( - ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range", ordered=True - ) - expected_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index - ) - - col = "range" - result_sort = df.groupby(col, sort=True, observed=False).first() - tm.assert_frame_equal(result_sort, expected_sort) - - # when categories is ordered, group is ordered by category's order - expected_sort = result_sort - result_sort = df.groupby(col, sort=False, observed=False).first() - tm.assert_frame_equal(result_sort, expected_sort) + df["range"] = Categorical(df["range"], ordered=ordered) + result = df.groupby("range", sort=sort, observed=False).first() - df["range"] = Categorical(df["range"], ordered=False) - index = CategoricalIndex( - ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range" - ) - expected_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index - ) - - index = CategoricalIndex( - ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], - # GH#48749 - don't change order of categories - categories=["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], - name="range", - ) - expected_nosort = DataFrame( - [[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=["foo", "bar"] + if sort: + data_values = [[1, 60], [5, 30], [6, 40], [10, 10]] + index_values = ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"] + else: + data_values = [[10, 10], [5, 30], [6, 40], [1, 60]] + index_values = ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"] + expected = DataFrame( + data_values, + columns=["foo", "bar"], + index=CategoricalIndex(index_values, 
name="range", ordered=ordered), ) - col = "range" - - # this is an unordered categorical, but we allow this #### - result_sort = df.groupby(col, sort=True, observed=False).first() - tm.assert_frame_equal(result_sort, expected_sort) - - result_nosort = df.groupby(col, sort=False, observed=False).first() - tm.assert_frame_equal(result_nosort, expected_nosort) + tm.assert_frame_equal(result, expected) -def test_sort_datetimelike(): +@pytest.mark.parametrize("ordered", [True, False]) +def test_sort_datetimelike(sort, ordered): # GH10505 + # GH#42482 - don't sort result when sort=False, even when ordered=True # use same data as test_groupby_sort_categorical, which category is # corresponding to datetime.month @@ -1054,80 +1034,30 @@ def test_sort_datetimelike(): ) # ordered=True - df["dt"] = Categorical(df["dt"], ordered=True) - index = [ - datetime(2011, 1, 1), - datetime(2011, 2, 1), - datetime(2011, 5, 1), - datetime(2011, 7, 1), - ] - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"] - ) - result_sort.index = CategoricalIndex(index, name="dt", ordered=True) - - index = [ - datetime(2011, 7, 1), - datetime(2011, 2, 1), - datetime(2011, 5, 1), - datetime(2011, 1, 1), - ] - result_nosort = DataFrame( - [[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"] - ) - result_nosort.index = CategoricalIndex( - index, categories=index, name="dt", ordered=True - ) - - col = "dt" - tm.assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first() - ) - - # when categories is ordered, group is ordered by category's order - tm.assert_frame_equal( - result_sort, df.groupby(col, sort=False, observed=False).first() - ) - - # ordered = False - df["dt"] = Categorical(df["dt"], ordered=False) - sort_index = CategoricalIndex( - [ + df["dt"] = Categorical(df["dt"], ordered=ordered) + if sort: + data_values = [[1, 60], [5, 30], [6, 40], [10, 10]] + index_values = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1), - ], - name="dt", - ) - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=sort_index - ) - - nosort_index = CategoricalIndex( - [ + ] + else: + data_values = [[10, 10], [5, 30], [6, 40], [1, 60]] + index_values = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1), - ], - # GH#48749 - don't change order of categories - categories=sort_index.categories, - name="dt", - ) - result_nosort = DataFrame( - [[10, 10], [5, 30], [6, 40], [1, 60]], + ] + expected = DataFrame( + data_values, columns=["foo", "bar"], - index=nosort_index, - ) - - col = "dt" - tm.assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first() - ) - tm.assert_frame_equal( - result_nosort, df.groupby(col, sort=False, observed=False).first() + index=CategoricalIndex(index_values, name="dt", ordered=ordered), ) + result = df.groupby("dt", sort=sort, observed=False).first() + tm.assert_frame_equal(result, expected) def test_empty_sum(): @@ -2055,13 +1985,10 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) -def test_many_categories(request, as_index, sort, index_kind, ordered): +def test_many_categories(as_index, sort, index_kind, ordered): # GH#48749 - Test when the grouper has many categories if index_kind != "range" and not as_index: pytest.skip(reason="Result doesn't have categories, nothing to test") - if index_kind == "multi" 
and as_index and not sort and ordered: - msg = "GH#48749 - values are unsorted even though the Categorical is ordered" - request.node.add_marker(pytest.mark.xfail(reason=msg)) categories = np.arange(9999, -1, -1) grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered) df = DataFrame({"a": grouper, "b": range(4)}) @@ -2078,7 +2005,7 @@ def test_many_categories(request, as_index, sort, index_kind, ordered): result = gb.sum() # Test is setup so that data and index are the same values - data = [3, 2, 1] if sort or ordered else [2, 1, 3] + data = [3, 2, 1] if sort else [2, 1, 3] index = CategoricalIndex( data, categories=grouper.categories, ordered=ordered, name="a" From 07dba4f8c32a33024b620b0261a3f02513521498 Mon Sep 17 00:00:00 2001 From: Douglas Lohmann Date: Thu, 10 Nov 2022 17:36:39 -0300 Subject: [PATCH 27/39] BUG: date_range with freq="C" (business days) return value changed on 1.5.0 (#49610) BUG: Use naive wall time to perform offsets datetime64 conversion --- doc/source/whatsnew/v1.5.2.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 4 +++- .../tests/indexes/datetimes/test_date_range.py | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.2.rst b/doc/source/whatsnew/v1.5.2.rst index 572d6c74e767f..446235d1656dc 100644 --- a/doc/source/whatsnew/v1.5.2.rst +++ b/doc/source/whatsnew/v1.5.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression in :meth:`Series.replace` raising ``RecursionError`` with numeric dtype and when specifying ``value=None`` (:issue:`45725`) - Fixed regression in :meth:`DataFrame.plot` preventing :class:`~matplotlib.colors.Colormap` instance from being passed using the ``colormap`` argument if Matplotlib 3.6+ is used (:issue:`49374`) +- Fixed regression in :func:`date_range` returning an invalid set of periods for ``CustomBusinessDay`` frequency and ``start`` date with timezone (:issue:`49441`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 50d6a0a02b0cf..8e022ac662d21 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -254,7 +254,9 @@ cdef _to_dt64D(dt): if getattr(dt, 'tzinfo', None) is not None: # Get the nanosecond timestamp, # equiv `Timestamp(dt).value` or `dt.timestamp() * 10**9` - naive = dt.astimezone(None) + # The `naive` must be the `dt` naive wall time + # instead of the naive absolute time (GH#49441) + naive = dt.replace(tzinfo=None) dt = np.datetime64(naive, "D") else: dt = np.datetime64(dt) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 142679e292b38..adbf6c715fef6 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -1151,6 +1151,24 @@ def test_range_with_millisecond_resolution(self, start_end): expected = DatetimeIndex([start]) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "start,period,expected", + [ + ("2022-07-23 00:00:00+02:00", 1, ["2022-07-25 00:00:00+02:00"]), + ("2022-07-22 00:00:00+02:00", 1, ["2022-07-22 00:00:00+02:00"]), + ( + "2022-07-22 00:00:00+02:00", + 2, + ["2022-07-22 00:00:00+02:00", "2022-07-25 00:00:00+02:00"], + ), + ], + ) + def test_range_with_timezone_and_custombusinessday(self, start, period, expected): + # GH49441 + result = date_range(start=start, periods=period, freq="C") + expected = DatetimeIndex(expected) + tm.assert_index_equal(result, expected) + def test_date_range_with_custom_holidays(): # GH 30593 From 0daeb6ab952d0e74f3b11db5f5f01d90180ea6ff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Nov 2022 12:56:42 -0800 Subject: [PATCH 28/39] API: make Timestamp/Timedelta _as_unit public as_unit (#48819) * API: make Timestamp/Timedelta _as_unit public as_unit * update test * update test * update tests * fix pyi typo * fixup * fixup --- doc/source/reference/arrays.rst | 4 ++ pandas/_libs/tslib.pyx | 4 +- pandas/_libs/tslibs/nattype.pyi | 1 + pandas/_libs/tslibs/nattype.pyx | 16 +++++++ pandas/_libs/tslibs/timedeltas.pyi | 4 +- pandas/_libs/tslibs/timedeltas.pyx | 21 ++++++++- pandas/_libs/tslibs/timestamps.pyi | 4 +- pandas/_libs/tslibs/timestamps.pyx | 17 ++++++- pandas/core/arrays/datetimelike.py | 20 ++++---- pandas/core/arrays/datetimes.py | 10 ++-- pandas/core/arrays/timedeltas.py | 6 +-- pandas/core/dtypes/common.py | 2 +- pandas/core/dtypes/dtypes.py | 4 +- pandas/core/window/ewm.py | 2 +- pandas/tests/arrays/test_datetimes.py | 20 ++++---- pandas/tests/arrays/test_timedeltas.py | 2 +- .../tests/indexes/datetimes/test_indexing.py | 4 +- .../tests/indexes/timedeltas/test_indexing.py | 4 +- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/scalar/test_nat.py | 4 +- .../scalar/timedelta/test_constructors.py | 4 +- .../tests/scalar/timedelta/test_timedelta.py | 30 ++++++------ .../tests/scalar/timestamp/test_timestamp.py | 46 +++++++++---------- .../tests/scalar/timestamp/test_timezones.py | 8 ++-- .../tests/scalar/timestamp/test_unary_ops.py | 14 +++--- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/tseries/offsets/test_offsets.py | 4 +- 27 files changed, 156 insertions(+), 103 deletions(-) diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 33a611b15675d..5b41de4e12e6f 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ 
-139,6 +139,7 @@ Properties
     Timestamp.second
     Timestamp.tz
     Timestamp.tzinfo
+    Timestamp.unit
     Timestamp.value
     Timestamp.week
     Timestamp.weekofyear
@@ -149,6 +150,7 @@ Methods
 .. autosummary::
    :toctree: api/
 
+    Timestamp.as_unit
     Timestamp.astimezone
     Timestamp.ceil
     Timestamp.combine
@@ -242,6 +244,7 @@ Properties
     Timedelta.nanoseconds
     Timedelta.resolution
     Timedelta.seconds
+    Timedelta.unit
     Timedelta.value
     Timedelta.view
 
@@ -250,6 +253,7 @@ Methods
 .. autosummary::
    :toctree: api/
 
+    Timedelta.as_unit
     Timedelta.ceil
     Timedelta.floor
     Timedelta.isoformat
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 6d6e90673f030..d7c4c022a2556 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -551,7 +551,7 @@ cpdef array_to_datetime(
                         raise ValueError('Cannot mix tz-aware with '
                                          'tz-naive values')
                     if isinstance(val, _Timestamp):
-                        iresult[i] = val._as_unit("ns").value
+                        iresult[i] = val.as_unit("ns").value
                     else:
                         iresult[i] = pydatetime_to_dt64(val, &dts)
                         check_dts_bounds(&dts)
@@ -906,7 +906,7 @@ def array_to_datetime_with_tz(ndarray values, tzinfo tz):
             else:
                 # datetime64, tznaive pydatetime, int, float
                 ts = ts.tz_localize(tz)
-            ts = ts._as_unit("ns")
+            ts = ts.as_unit("ns")
             ival = ts.value
 
             # Analogous to: result[i] = ival
diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi
index e9ae46cee7aec..72f55bb50895a 100644
--- a/pandas/_libs/tslibs/nattype.pyi
+++ b/pandas/_libs/tslibs/nattype.pyi
@@ -127,3 +127,4 @@ class NaTType:
     __le__: _NatComparison
     __gt__: _NatComparison
     __ge__: _NatComparison
+    def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ...
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index e9fb40bbcdf85..dcb7358d8e69a 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -1195,6 +1195,22 @@ default 'raise'
     def tzinfo(self) -> None:
         return None
 
+    def as_unit(self, str unit, bint round_ok=True) -> "NaTType":
+        """
+        Convert the underlying int64 representation to the given unit.
+
+        Parameters
+        ----------
+        unit : {"ns", "us", "ms", "s"}
+        round_ok : bool, default True
+            If False and the conversion requires rounding, raise.
+
+        Returns
+        -------
+        Timestamp
+        """
+        return c_NaT
+
 
 c_NaT = NaTType()  # C-visible
 NaT = c_NaT  # Python-visible
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
index ef3cd6df167f4..f41bea11985f2 100644
--- a/pandas/_libs/tslibs/timedeltas.pyi
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -152,5 +152,5 @@ class Timedelta(timedelta):
     def to_numpy(self) -> np.timedelta64: ...
     def view(self, dtype: npt.DTypeLike = ...) -> object: ...
     @property
-    def _unit(self) -> str: ...
-    def _as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ...
+    def unit(self) -> str: ...
+    def as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ...
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 83b4f34bfb70b..071cfb7cf541a 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -339,7 +339,7 @@ cdef convert_to_timedelta64(object ts, str unit):
     elif isinstance(ts, _Timedelta):
         # already in the proper format
         if ts._creso != NPY_FR_ns:
-            ts = ts._as_unit("ns").asm8
+            ts = ts.as_unit("ns").asm8
         else:
             ts = np.timedelta64(ts.value, "ns")
     elif is_timedelta64_object(ts):
@@ -1081,6 +1081,10 @@ cdef class _Timedelta(timedelta):
         # TODO: add nanos/1e9?
        return self.days * 24 * 3600 + self.seconds + self.microseconds / 1_000_000
 
+    @property
+    def unit(self) -> str:
+        return npy_unit_to_abbrev(self._creso)
+
     def __hash__(_Timedelta self):
         if self._has_ns():
             # Note: this does *not* satisfy the invariance
@@ -1500,7 +1504,20 @@ cdef class _Timedelta(timedelta):
         # exposing as classmethod for testing
         return _timedelta_from_value_and_reso(value, reso)
 
-    def _as_unit(self, str unit, bint round_ok=True):
+    def as_unit(self, str unit, bint round_ok=True):
+        """
+        Convert the underlying int64 representation to the given unit.
+
+        Parameters
+        ----------
+        unit : {"ns", "us", "ms", "s"}
+        round_ok : bool, default True
+            If False and the conversion requires rounding, raise.
+
+        Returns
+        -------
+        Timedelta
+        """
         dtype = np.dtype(f"m8[{unit}]")
         reso = get_unit_from_dtype(dtype)
         return self._as_creso(reso, round_ok=round_ok)
diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi
index 77f02741aae48..3f795a2d08959 100644
--- a/pandas/_libs/tslibs/timestamps.pyi
+++ b/pandas/_libs/tslibs/timestamps.pyi
@@ -220,5 +220,5 @@ class Timestamp(datetime):
     @property
     def daysinmonth(self) -> int: ...
     @property
-    def _unit(self) -> str: ...
-    def _as_unit(self, unit: str, round_ok: bool = ...) -> Timestamp: ...
+    def unit(self) -> str: ...
+    def as_unit(self, unit: str, round_ok: bool = ...) -> Timestamp: ...
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
index ac8a6738a816c..b0208f9ca3296 100644
--- a/pandas/_libs/tslibs/timestamps.pyx
+++ b/pandas/_libs/tslibs/timestamps.pyx
@@ -233,7 +233,7 @@ cdef class _Timestamp(ABCTimestamp):
     resolution = MinMaxReso("resolution")  # GH#21336, GH#21365
 
     @property
-    def _unit(self) -> str:
+    def unit(self) -> str:
         """
         The abbreviation associated with self._creso.
         """
@@ -993,7 +993,20 @@ cdef class _Timestamp(ABCTimestamp):
             value = convert_reso(self.value, self._creso, reso, round_ok=round_ok)
         return type(self)._from_value_and_reso(value, reso=reso, tz=self.tzinfo)
 
-    def _as_unit(self, str unit, bint round_ok=True):
+    def as_unit(self, str unit, bint round_ok=True):
+        """
+        Convert the underlying int64 representation to the given unit.
+
+        Parameters
+        ----------
+        unit : {"ns", "us", "ms", "s"}
+        round_ok : bool, default True
+            If False and the conversion requires rounding, raise.
+
+        Returns
+        -------
+        Timestamp
+        """
         dtype = np.dtype(f"M8[{unit}]")
         reso = get_unit_from_dtype(dtype)
         try:
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index b1d9fba22b484..b36eeab70726e 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -816,7 +816,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
 
         if self.dtype.kind in ["m", "M"]:
             self = cast("DatetimeArray | TimedeltaArray", self)
-            values = values._as_unit(self._unit)
+            values = values.as_unit(self.unit)
 
         try:
             self._check_compatible_with(values)
@@ -1116,7 +1116,7 @@ def _add_datetimelike_scalar(self, other) -> DatetimeArray:
             # i.e.
np.datetime64("NaT") # In this case we specifically interpret NaT as a datetime, not # the timedelta interpretation we would get by returning self + NaT - result = self._ndarray + NaT.to_datetime64().astype(f"M8[{self._unit}]") + result = self._ndarray + NaT.to_datetime64().astype(f"M8[{self.unit}]") # Preserve our resolution return DatetimeArray._simple_new(result, dtype=result.dtype) @@ -1128,10 +1128,10 @@ def _add_datetimelike_scalar(self, other) -> DatetimeArray: result = checked_add_with_arr( self.asi8, other_i8, arr_mask=self._isnan, b_mask=o_mask ) - res_values = result.view(f"M8[{self._unit}]") + res_values = result.view(f"M8[{self.unit}]") - dtype = tz_to_dtype(tz=other.tz, unit=self._unit) - res_values = result.view(f"M8[{self._unit}]") + dtype = tz_to_dtype(tz=other.tz, unit=self.unit) + res_values = result.view(f"M8[{self.unit}]") new_freq = self._get_arithmetic_result_freq(other) return DatetimeArray._simple_new(res_values, dtype=dtype, freq=new_freq) @@ -1191,7 +1191,7 @@ def _sub_datetimelike(self, other: Timestamp | DatetimeArray) -> TimedeltaArray: res_values = checked_add_with_arr( self.asi8, -other_i8, arr_mask=self._isnan, b_mask=o_mask ) - res_m8 = res_values.view(f"timedelta64[{self._unit}]") + res_m8 = res_values.view(f"timedelta64[{self.unit}]") new_freq = self._get_arithmetic_result_freq(other) return TimedeltaArray._simple_new(res_m8, dtype=res_m8.dtype, freq=new_freq) @@ -1989,13 +1989,13 @@ def _creso(self) -> int: return get_unit_from_dtype(self._ndarray.dtype) @cache_readonly - def _unit(self) -> str: + def unit(self) -> str: # e.g. "ns", "us", "ms" # error: Argument 1 to "dtype_to_unit" has incompatible type # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]" return dtype_to_unit(self.dtype) # type: ignore[arg-type] - def _as_unit(self: TimelikeOpsT, unit: str) -> TimelikeOpsT: + def as_unit(self: TimelikeOpsT, unit: str) -> TimelikeOpsT: dtype = np.dtype(f"{self.dtype.kind}8[{unit}]") new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=True) @@ -2017,9 +2017,9 @@ def _ensure_matching_resos(self, other): if self._creso != other._creso: # Just as with Timestamp/Timedelta, we cast to the higher resolution if self._creso < other._creso: - self = self._as_unit(other._unit) + self = self.as_unit(other.unit) else: - other = other._as_unit(self._unit) + other = other.as_unit(self.unit) return self, other # -------------------------------------------------------------- diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6d31d0086d84b..d0a932ec378b9 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -351,9 +351,9 @@ def _from_sequence_not_strict( data_unit = np.datetime_data(subarr.dtype)[0] data_dtype = tz_to_dtype(tz, data_unit) result = cls._simple_new(subarr, freq=freq, dtype=data_dtype) - if unit is not None and unit != result._unit: + if unit is not None and unit != result.unit: # If unit was specified in user-passed dtype, cast to it here - result = result._as_unit(unit) + result = result.as_unit(unit) if inferred_freq is None and freq is not None: # this condition precludes `freq_infer` @@ -843,7 +843,7 @@ def tz_convert(self, tz) -> DatetimeArray: ) # No conversion since timestamps are all UTC to begin with - dtype = tz_to_dtype(tz, unit=self._unit) + dtype = tz_to_dtype(tz, unit=self.unit) return self._simple_new(self._ndarray, dtype=dtype, freq=self.freq) @dtl.ravel_compat @@ -1018,8 +1018,8 @@ def tz_localize( nonexistent=nonexistent, creso=self._creso, ) - 
new_dates = new_dates.view(f"M8[{self._unit}]") - dtype = tz_to_dtype(tz, unit=self._unit) + new_dates = new_dates.view(f"M8[{self.unit}]") + dtype = tz_to_dtype(tz, unit=self.unit) freq = None if timezones.is_utc(tz) or (len(self) == 1 and not isna(new_dates[0])): diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 65996b1df5e9a..fe7cade1711d0 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -268,10 +268,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None): ) if start is not None: - start = Timedelta(start)._as_unit("ns") + start = Timedelta(start).as_unit("ns") if end is not None: - end = Timedelta(end)._as_unit("ns") + end = Timedelta(end).as_unit("ns") left_closed, right_closed = validate_endpoints(closed) @@ -298,7 +298,7 @@ def _unbox_scalar(self, value) -> np.timedelta64: if value is NaT: return np.timedelta64(value.value, "ns") else: - return value._as_unit(self._unit).asm8 + return value.as_unit(self.unit).asm8 def _scalar_from_string(self, value) -> Timedelta | NaTType: return Timedelta(value) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e9d3721bbb5f5..6a366fa53e957 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -931,7 +931,7 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: else: return False return tipo == DT64NS_DTYPE or ( - isinstance(tipo, DatetimeTZDtype) and tipo._unit == "ns" + isinstance(tipo, DatetimeTZDtype) and tipo.unit == "ns" ) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e46e081c57d8a..15fcfe80b1915 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -676,7 +676,7 @@ def na_value(self) -> NaTType: # error: Signature of "str" incompatible with supertype "PandasExtensionDtype" @cache_readonly def str(self) -> str: # type: ignore[override] - return f"|M8[{self._unit}]" + return f"|M8[{self.unit}]" def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None: if isinstance(unit, DatetimeTZDtype): @@ -720,7 +720,7 @@ def _creso(self) -> int: "ms": dtypes.NpyDatetimeUnit.NPY_FR_ms, "us": dtypes.NpyDatetimeUnit.NPY_FR_us, "ns": dtypes.NpyDatetimeUnit.NPY_FR_ns, - }[self._unit] + }[self.unit] return reso.value @property diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index c5c401d415ad0..b53e9fc3c55ec 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -123,7 +123,7 @@ def _calculate_deltas( """ _times = np.asarray(times.view(np.int64), dtype=np.float64) # TODO: generalize to non-nano? - _halflife = float(Timedelta(halflife)._as_unit("ns").value) + _halflife = float(Timedelta(halflife).as_unit("ns").value) return np.diff(_times) / _halflife diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 564194ed4a9d3..166362a9a8c30 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -215,12 +215,12 @@ def test_add_mismatched_reso_doesnt_downcast(self): # https://github.com/pandas-dev/pandas/pull/48748#issuecomment-1260181008 td = pd.Timedelta(microseconds=1) dti = pd.date_range("2016-01-01", periods=3) - td - dta = dti._data._as_unit("us") + dta = dti._data.as_unit("us") - res = dta + td._as_unit("us") + res = dta + td.as_unit("us") # even though the result is an even number of days # (so we _could_ downcast to unit="s"), we do not. 
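# Illustrative sketch (hedged; not part of the diff). The renames above give
# DatetimeArray/TimedeltaArray the same public pair; ``dti._data`` is an
# internal attribute, reached here only the way these tests do:
#
#     import pandas as pd
#
#     dti = pd.date_range("2016-01-01", periods=3)
#     dta = dti._data           # DatetimeArray, "ns" unit by default
#     dta.unit                  # -> "ns"
#     dta.as_unit("us").unit    # -> "us"; values rescaled overflow-safely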
- assert res._unit == "us" + assert res.unit == "us" @pytest.mark.parametrize( "scalar", @@ -240,32 +240,32 @@ def test_add_timedeltalike_scalar_mismatched_reso(self, dta_dti, scalar): exp_reso = max(dta._creso, td._creso) exp_unit = npy_unit_to_abbrev(exp_reso) - expected = (dti + td)._data._as_unit(exp_unit) + expected = (dti + td)._data.as_unit(exp_unit) result = dta + scalar tm.assert_extension_array_equal(result, expected) result = scalar + dta tm.assert_extension_array_equal(result, expected) - expected = (dti - td)._data._as_unit(exp_unit) + expected = (dti - td)._data.as_unit(exp_unit) result = dta - scalar tm.assert_extension_array_equal(result, expected) def test_sub_datetimelike_scalar_mismatch(self): dti = pd.date_range("2016-01-01", periods=3) - dta = dti._data._as_unit("us") + dta = dti._data.as_unit("us") - ts = dta[0]._as_unit("s") + ts = dta[0].as_unit("s") result = dta - ts - expected = (dti - dti[0])._data._as_unit("us") + expected = (dti - dti[0])._data.as_unit("us") assert result.dtype == "m8[us]" tm.assert_extension_array_equal(result, expected) def test_sub_datetime64_reso_mismatch(self): dti = pd.date_range("2016-01-01", periods=3) - left = dti._data._as_unit("s") - right = left._as_unit("ms") + left = dti._data.as_unit("s") + right = left.as_unit("ms") result = left - right exp_values = np.array([0, 0, 0], dtype="m8[ms]") diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index f5d50465fee10..2fd7ccc9cf338 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -104,7 +104,7 @@ def test_add_pdnat(self, tda): def test_add_datetimelike_scalar(self, tda, tz_naive_fixture): ts = pd.Timestamp("2016-01-01", tz=tz_naive_fixture) - expected = tda._as_unit("ns") + ts + expected = tda.as_unit("ns") + ts res = tda + ts tm.assert_extension_array_equal(res, expected) res = ts + tda diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 1e7cc86616afc..887766dd3fc29 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -388,7 +388,7 @@ def test_take_fill_value_with_timezone(self): class TestGetLoc: def test_get_loc_key_unit_mismatch(self): idx = date_range("2000-01-01", periods=3) - key = idx[1]._as_unit("ms") + key = idx[1].as_unit("ms") loc = idx.get_loc(key) assert loc == 1 assert key in idx @@ -396,7 +396,7 @@ def test_get_loc_key_unit_mismatch(self): def test_get_loc_key_unit_mismatch_not_castable(self): dta = date_range("2000-01-01", periods=3)._data.astype("M8[s]") dti = DatetimeIndex(dta) - key = dta[0]._as_unit("ns") + pd.Timedelta(1) + key = dta[0].as_unit("ns") + pd.Timedelta(1) with pytest.raises( KeyError, match=r"Timestamp\('2000-01-01 00:00:00.000000001'\)" diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 4b7140b112bd9..12aece23738ec 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -77,14 +77,14 @@ def test_timestamp_invalid_key(self, key): class TestGetLoc: def test_get_loc_key_unit_mismatch(self): idx = to_timedelta(["0 days", "1 days", "2 days"]) - key = idx[1]._as_unit("ms") + key = idx[1].as_unit("ms") loc = idx.get_loc(key) assert loc == 1 def test_get_loc_key_unit_mismatch_not_castable(self): tdi = to_timedelta(["0 days", "1 days", "2 days"]).astype("m8[s]") assert tdi.dtype == "m8[s]" - key = 
tdi[0]._as_unit("ns") + Timedelta(1) + key = tdi[0].as_unit("ns") + Timedelta(1) with pytest.raises(KeyError, match=r"Timedelta\('0 days 00:00:00.000000001'\)"): tdi.get_loc(key) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 749075b8637cf..2f3fc4d0fcba8 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -970,7 +970,7 @@ def test_mixed_timedelta_datetime(self): ts = Timestamp("20130101") frame = DataFrame({"a": [td, ts]}, dtype=object) - expected = DataFrame({"a": [pd.Timedelta(td)._as_unit("ns").value, ts.value]}) + expected = DataFrame({"a": [pd.Timedelta(td).as_unit("ns").value, ts.value]}) result = read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"}) tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 7ecca562d4996..e310506935729 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -184,7 +184,7 @@ def test_nat_iso_format(get_nat): @pytest.mark.parametrize( "klass,expected", [ - (Timestamp, ["normalize", "to_julian_date", "to_period"]), + (Timestamp, ["normalize", "to_julian_date", "to_period", "unit"]), ( Timedelta, [ @@ -192,6 +192,7 @@ def test_nat_iso_format(get_nat): "resolution_string", "to_pytimedelta", "to_timedelta64", + "unit", "view", ], ), @@ -254,6 +255,7 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): ( Timestamp, [ + "as_unit", "astimezone", "ceil", "combine", diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index 7540813fd302b..dd671e3c9e094 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -313,7 +313,7 @@ def test_construction_out_of_bounds_td64ns(val, unit): assert td.asm8.dtype == "m8[s]" msg = r"Cannot cast 1067\d\d days .* to unit='ns' without overflow" with pytest.raises(OutOfBoundsTimedelta, match=msg): - td._as_unit("ns") + td.as_unit("ns") # But just back in bounds and we are OK assert Timedelta(td64 - 1) == td64 - 1 @@ -324,7 +324,7 @@ def test_construction_out_of_bounds_td64ns(val, unit): td2 = Timedelta(td64) msg = r"Cannot cast -1067\d\d days .* to unit='ns' without overflow" with pytest.raises(OutOfBoundsTimedelta, match=msg): - td2._as_unit("ns") + td2.as_unit("ns") # But just back in bounds and we are OK assert Timedelta(td64 + 1) == td64 + 1 diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 68b4eea28e367..924f756edb233 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -29,29 +29,29 @@ class TestAsUnit: def test_as_unit(self): td = Timedelta(days=1) - assert td._as_unit("ns") is td + assert td.as_unit("ns") is td - res = td._as_unit("us") + res = td.as_unit("us") assert res.value == td.value // 1000 assert res._creso == NpyDatetimeUnit.NPY_FR_us.value - rt = res._as_unit("ns") + rt = res.as_unit("ns") assert rt.value == td.value assert rt._creso == td._creso - res = td._as_unit("ms") + res = td.as_unit("ms") assert res.value == td.value // 1_000_000 assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value - rt = res._as_unit("ns") + rt = res.as_unit("ns") assert rt.value == td.value assert rt._creso == td._creso - res = td._as_unit("s") + res = td.as_unit("s") assert res.value == td.value // 1_000_000_000 assert res._creso == 
NpyDatetimeUnit.NPY_FR_s.value - rt = res._as_unit("ns") + rt = res.as_unit("ns") assert rt.value == td.value assert rt._creso == td._creso @@ -62,15 +62,15 @@ def test_as_unit_overflows(self): msg = "Cannot cast 106752 days 00:00:00 to unit='ns' without overflow" with pytest.raises(OutOfBoundsTimedelta, match=msg): - td._as_unit("ns") + td.as_unit("ns") - res = td._as_unit("ms") + res = td.as_unit("ms") assert res.value == us // 1000 assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value def test_as_unit_rounding(self): td = Timedelta(microseconds=1500) - res = td._as_unit("ms") + res = td.as_unit("ms") expected = Timedelta(milliseconds=1) assert res == expected @@ -79,18 +79,18 @@ def test_as_unit_rounding(self): assert res.value == 1 with pytest.raises(ValueError, match="Cannot losslessly convert units"): - td._as_unit("ms", round_ok=False) + td.as_unit("ms", round_ok=False) def test_as_unit_non_nano(self): # case where we are going neither to nor from nano - td = Timedelta(days=1)._as_unit("ms") + td = Timedelta(days=1).as_unit("ms") assert td.days == 1 assert td.value == 86_400_000 assert td.components.days == 1 assert td._d == 1 assert td.total_seconds() == 86400 - res = td._as_unit("us") + res = td.as_unit("us") assert res.value == 86_400_000_000 assert res.components.days == 1 assert res.components.hours == 0 @@ -260,7 +260,7 @@ def test_floordiv_numeric(self, td): def test_addsub_mismatched_reso(self, td): # need to cast to since td is out of bounds for ns, so # so we would raise OverflowError without casting - other = Timedelta(days=1)._as_unit("us") + other = Timedelta(days=1).as_unit("us") # td is out of bounds for ns result = td + other @@ -754,7 +754,7 @@ def test_round_sanity(self, val, method): @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_round_non_nano(self, unit): - td = Timedelta("1 days 02:34:57")._as_unit(unit) + td = Timedelta("1 days 02:34:57").as_unit(unit) res = td.round("min") assert res == Timedelta("1 days 02:35:00") diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 2d9deff13322b..f5b9a35a53a24 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -756,7 +756,7 @@ def test_cmp_cross_reso(self): # subtracting 3600*24 gives a datetime64 that _can_ fit inside the # nanosecond implementation bounds. 
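# Illustrative sketch (hedged; not part of the diff), summarizing the
# Timedelta conversions asserted in TestAsUnit above:
#
#     import pandas as pd
#
#     td = pd.Timedelta(days=1)               # "ns" resolution
#     td.as_unit("s").value                   # -> 86_400
#     td.as_unit("s").as_unit("ns") == td     # -> True; lossless round trip
#     pd.Timedelta(microseconds=1500).as_unit("ms")  # rounds, as tested above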
- other = Timestamp(dt64 - 3600 * 24)._as_unit("ns") + other = Timestamp(dt64 - 3600 * 24).as_unit("ns") assert other < ts assert other.asm8 > ts.asm8 # <- numpy gets this wrong assert ts > other @@ -870,7 +870,7 @@ def test_sub_datetimelike_mismatched_reso(self, ts_tz): NpyDatetimeUnit.NPY_FR_ms.value: "s", NpyDatetimeUnit.NPY_FR_s.value: "us", }[ts._creso] - other = ts._as_unit(unit) + other = ts.as_unit(unit) assert other._creso != ts._creso result = ts - other @@ -886,7 +886,7 @@ def test_sub_datetimelike_mismatched_reso(self, ts_tz): if ts._creso < other._creso: # Case where rounding is lossy other2 = other + Timedelta._from_value_and_reso(1, other._creso) - exp = ts._as_unit(other._unit) - other2 + exp = ts.as_unit(other.unit) - other2 res = ts - other2 assert res == exp @@ -897,7 +897,7 @@ def test_sub_datetimelike_mismatched_reso(self, ts_tz): assert res._creso == max(ts._creso, other._creso) else: ts2 = ts + Timedelta._from_value_and_reso(1, ts._creso) - exp = ts2 - other._as_unit(ts2._unit) + exp = ts2 - other.as_unit(ts2.unit) res = ts2 - other assert res == exp @@ -918,7 +918,7 @@ def test_sub_timedeltalike_mismatched_reso(self, ts_tz): NpyDatetimeUnit.NPY_FR_ms.value: "s", NpyDatetimeUnit.NPY_FR_s.value: "us", }[ts._creso] - other = Timedelta(0)._as_unit(unit) + other = Timedelta(0).as_unit(unit) assert other._creso != ts._creso result = ts + other @@ -934,7 +934,7 @@ def test_sub_timedeltalike_mismatched_reso(self, ts_tz): if ts._creso < other._creso: # Case where rounding is lossy other2 = other + Timedelta._from_value_and_reso(1, other._creso) - exp = ts._as_unit(other._unit) + other2 + exp = ts.as_unit(other.unit) + other2 res = ts + other2 assert res == exp assert res._creso == max(ts._creso, other._creso) @@ -943,7 +943,7 @@ def test_sub_timedeltalike_mismatched_reso(self, ts_tz): assert res._creso == max(ts._creso, other._creso) else: ts2 = ts + Timedelta._from_value_and_reso(1, ts._creso) - exp = ts2 + other._as_unit(ts2._unit) + exp = ts2 + other.as_unit(ts2.unit) res = ts2 + other assert res == exp @@ -954,8 +954,8 @@ def test_sub_timedeltalike_mismatched_reso(self, ts_tz): def test_addition_doesnt_downcast_reso(self): # https://github.com/pandas-dev/pandas/pull/48748#pullrequestreview-1122635413 - ts = Timestamp(year=2022, month=1, day=1, microsecond=999999)._as_unit("us") - td = Timedelta(microseconds=1)._as_unit("us") + ts = Timestamp(year=2022, month=1, day=1, microsecond=999999).as_unit("us") + td = Timedelta(microseconds=1).as_unit("us") res = ts + td assert res._creso == ts._creso @@ -963,7 +963,7 @@ def test_sub_timedelta64_mismatched_reso(self, ts_tz): ts = ts_tz res = ts + np.timedelta64(1, "ns") - exp = ts._as_unit("ns") + np.timedelta64(1, "ns") + exp = ts.as_unit("ns") + np.timedelta64(1, "ns") assert exp == res assert exp._creso == NpyDatetimeUnit.NPY_FR_ns.value @@ -1001,29 +1001,29 @@ class TestAsUnit: def test_as_unit(self): ts = Timestamp("1970-01-01") - assert ts._as_unit("ns") is ts + assert ts.as_unit("ns") is ts - res = ts._as_unit("us") + res = ts.as_unit("us") assert res.value == ts.value // 1000 assert res._creso == NpyDatetimeUnit.NPY_FR_us.value - rt = res._as_unit("ns") + rt = res.as_unit("ns") assert rt.value == ts.value assert rt._creso == ts._creso - res = ts._as_unit("ms") + res = ts.as_unit("ms") assert res.value == ts.value // 1_000_000 assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value - rt = res._as_unit("ns") + rt = res.as_unit("ns") assert rt.value == ts.value assert rt._creso == ts._creso - res = ts._as_unit("s") + res = 
ts.as_unit("s") assert res.value == ts.value // 1_000_000_000 assert res._creso == NpyDatetimeUnit.NPY_FR_s.value - rt = res._as_unit("ns") + rt = res.as_unit("ns") assert rt.value == ts.value assert rt._creso == ts._creso @@ -1034,15 +1034,15 @@ def test_as_unit_overflows(self): msg = "Cannot cast 2262-04-12 00:00:00 to unit='ns' without overflow" with pytest.raises(OutOfBoundsDatetime, match=msg): - ts._as_unit("ns") + ts.as_unit("ns") - res = ts._as_unit("ms") + res = ts.as_unit("ms") assert res.value == us // 1000 assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value def test_as_unit_rounding(self): ts = Timestamp(1_500_000) # i.e. 1500 microseconds - res = ts._as_unit("ms") + res = ts.as_unit("ms") expected = Timestamp(1_000_000) # i.e. 1 millisecond assert res == expected @@ -1051,17 +1051,17 @@ def test_as_unit_rounding(self): assert res.value == 1 with pytest.raises(ValueError, match="Cannot losslessly convert units"): - ts._as_unit("ms", round_ok=False) + ts.as_unit("ms", round_ok=False) def test_as_unit_non_nano(self): # case where we are going neither to nor from nano - ts = Timestamp("1970-01-02")._as_unit("ms") + ts = Timestamp("1970-01-02").as_unit("ms") assert ts.year == 1970 assert ts.month == 1 assert ts.day == 2 assert ts.hour == ts.minute == ts.second == ts.microsecond == ts.nanosecond == 0 - res = ts._as_unit("s") + res = ts.as_unit("s") assert res.value == 24 * 3600 assert res.year == 1970 assert res.month == 1 diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index a05da73ac3031..912b7d9232abe 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -62,7 +62,7 @@ def test_tz_localize_pushes_out_of_bounds(self): def test_tz_localize_ambiguous_bool(self, unit): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 - ts = Timestamp("2015-11-01 01:00:03")._as_unit(unit) + ts = Timestamp("2015-11-01 01:00:03").as_unit(unit) expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") @@ -257,7 +257,7 @@ def test_timestamp_tz_localize_nonexistent_shift( tz = tz_type + tz if isinstance(shift, str): shift = "shift_" + shift - ts = Timestamp(start_ts)._as_unit(unit) + ts = Timestamp(start_ts).as_unit(unit) result = ts.tz_localize(tz, nonexistent=shift) expected = Timestamp(end_ts).tz_localize(tz) @@ -286,7 +286,7 @@ def test_timestamp_tz_localize_nonexistent_shift_invalid(self, offset, tz_type): @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_timestamp_tz_localize_nonexistent_NaT(self, tz, unit): # GH 8917 - ts = Timestamp("2015-03-29 02:20:00")._as_unit(unit) + ts = Timestamp("2015-03-29 02:20:00").as_unit(unit) result = ts.tz_localize(tz, nonexistent="NaT") assert result is NaT @@ -294,7 +294,7 @@ def test_timestamp_tz_localize_nonexistent_NaT(self, tz, unit): @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_timestamp_tz_localize_nonexistent_raise(self, tz, unit): # GH 8917 - ts = Timestamp("2015-03-29 02:20:00")._as_unit(unit) + ts = Timestamp("2015-03-29 02:20:00").as_unit(unit) msg = "2015-03-29 02:20:00" with pytest.raises(pytz.NonExistentTimeError, match=msg): ts.tz_localize(tz, nonexistent="raise") diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 6d9cfa51d2210..1c1f3acc8331f 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ 
b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -150,7 +150,7 @@ def test_round_minute_freq(self, test_input, freq, expected, rounder): @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_ceil(self, unit): - dt = Timestamp("20130101 09:10:11")._as_unit(unit) + dt = Timestamp("20130101 09:10:11").as_unit(unit) result = dt.ceil("D") expected = Timestamp("20130102") assert result == expected @@ -158,7 +158,7 @@ def test_ceil(self, unit): @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_floor(self, unit): - dt = Timestamp("20130101 09:10:11")._as_unit(unit) + dt = Timestamp("20130101 09:10:11").as_unit(unit) result = dt.floor("D") expected = Timestamp("20130101") assert result == expected @@ -172,7 +172,7 @@ def test_floor(self, unit): def test_round_dst_border_ambiguous(self, method, unit): # GH 18946 round near "fall back" DST ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") - ts = ts._as_unit(unit) + ts = ts.as_unit(unit) # result = getattr(ts, method)("H", ambiguous=True) assert result == ts @@ -206,7 +206,7 @@ def test_round_dst_border_ambiguous(self, method, unit): ) def test_round_dst_border_nonexistent(self, method, ts_str, freq, unit): # GH 23324 round near "spring forward" DST - ts = Timestamp(ts_str, tz="America/Chicago")._as_unit(unit) + ts = Timestamp(ts_str, tz="America/Chicago").as_unit(unit) result = getattr(ts, method)(freq, nonexistent="shift_forward") expected = Timestamp("2018-03-11 03:00:00", tz="America/Chicago") assert result == expected @@ -486,7 +486,7 @@ def test_replace_across_dst(self, tz, normalize): @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_replace_dst_border(self, unit): # Gh 7825 - t = Timestamp("2013-11-3", tz="America/Chicago")._as_unit(unit) + t = Timestamp("2013-11-3", tz="America/Chicago").as_unit(unit) result = t.replace(hour=3) expected = Timestamp("2013-11-3 03:00:00", tz="America/Chicago") assert result == expected @@ -498,7 +498,7 @@ def test_replace_dst_border(self, unit): def test_replace_dst_fold(self, fold, tz, unit): # GH 25017 d = datetime(2019, 10, 27, 2, 30) - ts = Timestamp(d, tz=tz)._as_unit(unit) + ts = Timestamp(d, tz=tz).as_unit(unit) result = ts.replace(hour=1, fold=fold) expected = Timestamp(datetime(2019, 10, 27, 1, 30)).tz_localize( tz, ambiguous=not fold @@ -513,7 +513,7 @@ def test_replace_dst_fold(self, fold, tz, unit): @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) def test_normalize(self, tz_naive_fixture, arg, unit): tz = tz_naive_fixture - ts = Timestamp(arg, tz=tz)._as_unit(unit) + ts = Timestamp(arg, tz=tz).as_unit(unit) result = ts.normalize() expected = Timestamp("2013-11-30", tz=tz) assert result == expected diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 37348bb743537..d69809f42b5a1 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1582,7 +1582,7 @@ def test_convert_non_ns(self): assert ser.dtype == arr.dtype tdi = timedelta_range("00:00:01", periods=3, freq="s") - tda = tdi._data._as_unit("s") + tda = tdi._data.as_unit("s") expected = Series(tda) assert expected.dtype == arr.dtype tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 29b82f27234a5..63594c2b2c48a 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -583,8 +583,8 @@ def 
test_add_dt64_ndarray_non_nano(self, offset_types, unit, request): exp_unit = unit if isinstance(off, Tick) and off._creso > dta._creso: # cast to higher reso like we would with Timedelta scalar - exp_unit = Timedelta(off)._unit - expected = expected._as_unit(exp_unit) + exp_unit = Timedelta(off).unit + expected = expected.as_unit(exp_unit) if len(w): # PerformanceWarning was issued bc _apply_array raised, so we From 678b46aaca90fd1ad25ea7123fa5bc8ea7b077bb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Nov 2022 13:59:11 -0800 Subject: [PATCH 29/39] DEPR: Enforce Series(float_with_nan, dtype=inty) (#49605) * DEPR: Enforce Series(float_with_nan, dtype=inty) * update asv * troubleshoot asv * suggested asv edit --- asv_bench/benchmarks/groupby.py | 36 +++++++++-------- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/construction.py | 13 +----- pandas/tests/frame/test_constructors.py | 50 ++++++++++-------------- pandas/tests/series/test_constructors.py | 37 +++++++++--------- pandas/tests/test_downstream.py | 13 +++--- 6 files changed, 68 insertions(+), 82 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6dff4a017e2a9..6f0bb3091133f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -600,31 +600,35 @@ def time_frame_agg(self, dtype, method): class Cumulative: - param_names = ["dtype", "method"] + param_names = ["dtype", "method", "with_nans"] params = [ ["float64", "int64", "Float64", "Int64"], ["cummin", "cummax", "cumsum"], + [True, False], ] - def setup(self, dtype, method): + def setup(self, dtype, method, with_nans): + if with_nans and dtype == "int64": + raise NotImplementedError("Construction of df would raise") + N = 500_000 - vals = np.random.randint(-10, 10, (N, 5)) - null_vals = vals.astype(float, copy=True) - null_vals[::2, :] = np.nan - null_vals[::3, :] = np.nan - df = DataFrame(vals, columns=list("abcde"), dtype=dtype) - null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) keys = np.random.randint(0, 100, size=N) - df["key"] = keys - null_df["key"] = keys - self.df = df - self.null_df = null_df + vals = np.random.randint(-10, 10, (N, 5)) - def time_frame_transform(self, dtype, method): - self.df.groupby("key").transform(method) + if with_nans: + null_vals = vals.astype(float, copy=True) + null_vals[::2, :] = np.nan + null_vals[::3, :] = np.nan + df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) + df["key"] = keys + self.df = df + else: + df = DataFrame(vals, columns=list("abcde")).astype(dtype, copy=False) + df["key"] = keys + self.df = df - def time_frame_transform_many_nulls(self, dtype, method): - self.null_df.groupby("key").transform(method) + def time_frame_transform(self, dtype, method, with_nans): + self.df.groupby("key").transform(method) class RankWithTies: diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 151d853166563..f4a6a6277b6a1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -492,6 +492,7 @@ Removal of prior version deprecations/changes - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`) - Changed behavior of setitem-like operations (``__setitem__``, ``fillna``, ``where``, ``mask``, ``replace``, ``insert``, fill_value for ``shift``) on an object with :class:`DatetimeTZDtype` when using a value with a non-matching timezone, the value will be cast to the 
object's timezone instead of casting both to object-dtype (:issue:`44243`) - Changed behavior of :class:`Index`, :class:`Series`, :class:`DataFrame` constructors with floating-dtype data and a :class:`DatetimeTZDtype`, the data are now interpreted as UTC-times instead of wall-times, consistent with how integer-dtype data are treated (:issue:`45573`) +- Changed behavior of :class:`Series` and :class:`DataFrame` constructors with integer dtype and floating-point data containing ``NaN``, this now raises ``IntCastingNaNError`` (:issue:`40110`) - Removed the deprecated ``base`` and ``loffset`` arguments from :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` and :class:`pandas.Grouper`. Use ``offset`` or ``origin`` instead (:issue:`31809`) - Changed behavior of :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype and an incompatible ``fill_value``; this now casts to ``object`` dtype instead of raising, consistent with the behavior with other dtypes (:issue:`45746`) - Change the default argument of ``regex`` for :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal. (:issue:`36695`, :issue:`24804`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 3ae509e74074e..d40c334ab1840 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -14,7 +14,6 @@ cast, overload, ) -import warnings import numpy as np from numpy import ma @@ -29,7 +28,6 @@ T, ) from pandas.errors import IntCastingNaNError -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( ExtensionDtype, @@ -577,16 +575,7 @@ def sanitize_array( subarr = maybe_cast_to_integer_array(data, dtype) except IntCastingNaNError: - warnings.warn( - "In a future version, passing float-dtype values containing NaN " - "and an integer dtype will raise IntCastingNaNError " - "(subclass of ValueError) instead of silently ignoring the " - "passed dtype. To retain the old behavior, call Series(arr) or " - "DataFrame(arr) without passing a dtype.", - FutureWarning, - stacklevel=find_stack_level(), - ) - subarr = np.array(data, copy=copy) + raise except ValueError: # Pre-2.0, we would have different behavior for Series vs DataFrame. 
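# Illustrative sketch (hedged; not part of the diff). With the deprecation
# enforced here, float data containing NaN combined with an integer dtype now
# raises instead of warning and silently falling back to float:
#
#     import numpy as np
#     import pandas as pd
#     from pandas.errors import IntCastingNaNError
#
#     try:
#         pd.Series([1.0, np.nan], dtype="i8")
#     except IntCastingNaNError:
#         pass  # "Cannot convert non-finite values (NA or inf) to integer"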
# DataFrame would call np.array(data, dtype=dtype, copy=copy), diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 810b7f6eaf2a6..2952db7febea1 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -18,6 +18,7 @@ import pytest import pytz +from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -105,16 +106,13 @@ def test_constructor_dict_with_tzaware_scalar(self): def test_construct_ndarray_with_nas_and_int_dtype(self): # GH#26919 match Series by not casting np.nan to meaningless int arr = np.array([[1, np.nan], [2, 3]]) - with tm.assert_produces_warning(FutureWarning): - df = DataFrame(arr, dtype="i8") - assert df.values.dtype == arr.dtype - assert isna(df.iloc[0, 1]) + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): + DataFrame(arr, dtype="i8") # check this matches Series behavior - with tm.assert_produces_warning(FutureWarning): - ser = Series(arr[0], dtype="i8", name=0) - expected = df.iloc[0] - tm.assert_series_equal(ser, expected) + with pytest.raises(IntCastingNaNError, match=msg): + Series(arr[0], dtype="i8", name=0) def test_construct_from_list_of_datetimes(self): df = DataFrame([datetime.now(), datetime.now()]) @@ -966,21 +964,16 @@ def _check_basic_constructor(self, empty): assert len(frame.index) == 3 assert len(frame.columns) == 1 - warn = None if empty is np.ones else FutureWarning - with tm.assert_produces_warning(warn): + if empty is not np.ones: + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): + DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) + return + else: frame = DataFrame( mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64 ) - if empty is np.ones: - # passing dtype casts assert frame.values.dtype == np.int64 - else: - # i.e. ma.masked_all - # Since we have NaNs, refuse to cast to int dtype, which would take NaN - # to meaningless integers. This matches Series behavior. GH#26919 - assert frame.isna().all().all() - assert frame.values.dtype == np.float64 - assert isna(frame.values).all() # wrong size axis labels msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" @@ -1741,11 +1734,10 @@ def test_constructor_mix_series_nonseries(self, float_frame): DataFrame({"A": float_frame["A"], "B": list(float_frame["B"])[:-2]}) def test_constructor_miscast_na_int_dtype(self): - msg = "float-dtype values containing NaN and an integer dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): - df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64) - expected = DataFrame([[np.nan, 1], [1, 0]]) - tm.assert_frame_equal(df, expected) + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + + with pytest.raises(IntCastingNaNError, match=msg): + DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64) def test_constructor_column_duplicates(self): # it works! 
#2079 @@ -2722,16 +2714,16 @@ def test_floating_values_integer_dtype(self): # with NaNs, we go through a different path with a different warning arr[0, 0] = np.nan - msg = "passing float-dtype values containing NaN" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): DataFrame(arr, dtype="i8") - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(IntCastingNaNError, match=msg): Series(arr[0], dtype="i8") # The future (raising) behavior matches what we would get via astype: msg = r"Cannot convert non-finite values \(NA or inf\) to integer" - with pytest.raises(ValueError, match=msg): + with pytest.raises(IntCastingNaNError, match=msg): DataFrame(arr).astype("i8") - with pytest.raises(ValueError, match=msg): + with pytest.raises(IntCastingNaNError, match=msg): Series(arr[0]).astype("i8") diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index d69809f42b5a1..826ad20dfd54e 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -15,6 +15,7 @@ lib, ) from pandas.compat import is_numpy_dev +from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( @@ -670,10 +671,9 @@ def test_constructor_sanitize(self): s = Series(np.array([1.0, 1.0, 8.0]), dtype="i8") assert s.dtype == np.dtype("i8") - msg = "float-dtype values containing NaN and an integer dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8") - assert ser.dtype == np.dtype("f8") + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): + Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8") def test_constructor_copy(self): # GH15125 @@ -809,18 +809,17 @@ def test_constructor_floating_data_int_dtype(self, frame_or_series): res = frame_or_series(list(arr), dtype="i8") tm.assert_equal(res, expected) - # When we have NaNs, we silently ignore the integer dtype + # pre-2.0, when we had NaNs, we silently ignored the integer dtype arr[0] = np.nan expected = frame_or_series(arr) - msg = "passing float-dtype values containing NaN and an integer dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): - obj = frame_or_series(arr, dtype="i8") - tm.assert_equal(obj, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): + frame_or_series(arr, dtype="i8") + + with pytest.raises(IntCastingNaNError, match=msg): # same behavior if we pass list instead of the ndarray - obj = frame_or_series(list(arr), dtype="i8") - tm.assert_equal(obj, expected) + frame_or_series(list(arr), dtype="i8") # float array that can be losslessly cast to integers arr = np.array([1.0, 2.0], dtype="float64") @@ -854,13 +853,13 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp # Updated: make sure we treat this list the same as we would treat the # equivalent ndarray vals = [1, 2, np.nan] - msg = "In a future version, passing float-dtype values containing NaN" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = Series(vals, dtype=any_int_numpy_dtype) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = 
Series(np.array(vals), dtype=any_int_numpy_dtype)

     def test_constructor_dtype_no_cast(self):
         # see gh-1572
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index a7f4269fa62b1..1396ab262a79a 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -8,6 +8,7 @@
 import numpy as np
 import pytest

+from pandas.errors import IntCastingNaNError
 import pandas.util._test_decorators as td

 import pandas as pd
@@ -100,13 +101,13 @@ def test_construct_dask_float_array_int_dtype_match_ndarray():
     expected = Series(arr, dtype="i8")
     tm.assert_series_equal(res, expected)

-    msg = "In a future version, passing float-dtype values containing NaN"
+    msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
     arr[2] = np.nan
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        res = Series(darr, dtype="i8")
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        expected = Series(arr, dtype="i8")
-    tm.assert_series_equal(res, expected)
+    with pytest.raises(IntCastingNaNError, match=msg):
+        Series(darr, dtype="i8")
+    # which is the same as we get with a numpy input
+    with pytest.raises(IntCastingNaNError, match=msg):
+        Series(arr, dtype="i8")


 def test_xarray(df):

From 56cef58915e3fe6df2a40b96872237f49e3c340a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 10 Nov 2022 16:46:30 -0800
Subject: [PATCH 30/39] DEPR: Disallow missing nested label when indexing
 MultiIndex level (#49628)

DEPR: Disallow missing nested label when indexing MultiIndex level
---
 doc/source/whatsnew/v2.0.0.rst               |  1 +
 pandas/core/indexes/multi.py                 | 38 ++++++--------------
 pandas/tests/indexing/multiindex/test_loc.py | 13 +++----
 pandas/tests/io/formats/style/test_style.py  | 30 ++++++++--------
 4 files changed, 31 insertions(+), 51 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index f4a6a6277b6a1..f81660ba2136b 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -452,6 +452,7 @@ Removal of prior version deprecations/changes
 - Enforced disallowing using ``usecols`` with out of bounds indices for ``read_csv`` with ``engine="c"`` (:issue:`25623`)
 - Enforced disallowing the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
 - Enforced disallowing a tuple of column labels into :meth:`.DataFrameGroupBy.__getitem__` (:issue:`30546`)
+- Enforced disallowing missing labels when indexing with a sequence of labels on a level of a :class:`MultiIndex`. This now raises a ``KeyError`` (:issue:`42351`)
 - Enforced disallowing setting values with ``.loc`` using a positional slice.
Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`) - Enforced disallowing positional indexing with a ``float`` key even if that key is a round number, manually cast to integer instead (:issue:`34193`) - Enforced disallowing using a :class:`DataFrame` indexer with ``.iloc``, use ``.loc`` instead for automatic alignment (:issue:`39022`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 41aefd46e0af6..b3378fb9abacd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3258,34 +3258,18 @@ def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]: if not is_hashable(x): # e.g. slice raise err - try: - item_indexer = self._get_level_indexer( - x, level=i, indexer=indexer - ) - except KeyError: - # ignore not founds; see discussion in GH#39424 - warnings.warn( - "The behavior of indexing on a MultiIndex with a " - "nested sequence of labels is deprecated and will " - "change in a future version. " - "`series.loc[label, sequence]` will raise if any " - "members of 'sequence' or not present in " - "the index's second level. To retain the old " - "behavior, use `series.index.isin(sequence, level=1)`", - # TODO: how to opt in to the future behavior? - # TODO: how to handle IntervalIndex level? - # (no test cases) - FutureWarning, - stacklevel=find_stack_level(), - ) - continue + # GH 39424: Ignore not founds + # GH 42351: No longer ignore not founds & enforced in 2.0 + # TODO: how to handle IntervalIndex level? (no test cases) + item_indexer = self._get_level_indexer( + x, level=i, indexer=indexer + ) + if lvl_indexer is None: + lvl_indexer = _to_bool_indexer(item_indexer) + elif isinstance(item_indexer, slice): + lvl_indexer[item_indexer] = True # type: ignore[index] else: - if lvl_indexer is None: - lvl_indexer = _to_bool_indexer(item_indexer) - elif isinstance(item_indexer, slice): - lvl_indexer[item_indexer] = True # type: ignore[index] - else: - lvl_indexer |= item_indexer + lvl_indexer |= item_indexer if lvl_indexer is None: # no matches we are done diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index c25866c4f09e2..ac4bb1093d84a 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -443,15 +443,12 @@ def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, pos): if expected.size == 0 and indexer != []: with pytest.raises(KeyError, match=str(indexer)): ser.loc[indexer] + elif indexer == (slice(None), ["foo", "bah"]): + # "bah" is not in idx.levels[1], raising KeyError enforced in 2.0 + with pytest.raises(KeyError, match="'bah'"): + ser.loc[indexer] else: - warn = None - msg = "MultiIndex with a nested sequence" - if indexer == (slice(None), ["foo", "bah"]): - # "bah" is not in idx.levels[1], so is ignored, will raise KeyError - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=msg): - result = ser.loc[indexer] + result = ser.loc[indexer] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index c97505eacd4c4..32ab0336aa93f 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -1,3 +1,4 @@ +import contextlib import copy import re from textwrap import dedent @@ -701,26 +702,26 @@ def test_applymap_subset(self, slice_, df): def test_applymap_subset_multiindex(self, slice_): # GH 19861 # edited for GH 33562 - warn = None - msg = 
"indexing on a MultiIndex with a nested sequence of labels" if ( isinstance(slice_[-1], tuple) and isinstance(slice_[-1][-1], list) and "C" in slice_[-1][-1] ): - warn = FutureWarning + ctx = pytest.raises(KeyError, match="C") # noqa: PDF010 elif ( isinstance(slice_[0], tuple) and isinstance(slice_[0][1], list) and 3 in slice_[0][1] ): - warn = FutureWarning + ctx = pytest.raises(KeyError, match="3") # noqa: PDF010 + else: + ctx = contextlib.nullcontext() idx = MultiIndex.from_product([["a", "b"], [1, 2]]) col = MultiIndex.from_product([["x", "y"], ["A", "B"]]) df = DataFrame(np.random.rand(4, 4), columns=col, index=idx) - with tm.assert_produces_warning(warn, match=msg): + with ctx: df.style.applymap(lambda x: "color: red;", subset=slice_).to_html() def test_applymap_subset_multiindex_code(self): @@ -1390,7 +1391,7 @@ def test_non_reducing_slice_on_multiindex(self): IndexSlice[:, IndexSlice["a", :, "e"]], IndexSlice[:, IndexSlice[:, "c", "e"]], IndexSlice[:, IndexSlice["a", ["c", "d"], :]], # check list - IndexSlice[:, IndexSlice["a", ["c", "d", "-"], :]], # allow missing + IndexSlice[:, IndexSlice["a", ["c", "d", "-"], :]], # don't allow missing IndexSlice[:, IndexSlice["a", ["c", "d", "-"], "e"]], # no slice # check rows IndexSlice[IndexSlice[["U"]], :], # inferred deeper need list @@ -1399,7 +1400,7 @@ def test_non_reducing_slice_on_multiindex(self): IndexSlice[IndexSlice["U", :, "Y"], :], IndexSlice[IndexSlice[:, "W", "Y"], :], IndexSlice[IndexSlice[:, "W", ["Y", "Z"]], :], # check list - IndexSlice[IndexSlice[:, "W", ["Y", "Z", "-"]], :], # allow missing + IndexSlice[IndexSlice[:, "W", ["Y", "Z", "-"]], :], # don't allow missing IndexSlice[IndexSlice["U", "W", ["Y", "Z", "-"]], :], # no slice # check simultaneous IndexSlice[IndexSlice[:, "W", "Y"], IndexSlice["a", "c", :]], @@ -1411,21 +1412,18 @@ def test_non_reducing_multi_slice_on_multiindex(self, slice_): idxs = MultiIndex.from_product([["U", "V"], ["W", "X"], ["Y", "Z"]]) df = DataFrame(np.arange(64).reshape(8, 8), columns=cols, index=idxs) - msg = "indexing on a MultiIndex with a nested sequence of labels" - warn = None for lvl in [0, 1]: key = slice_[lvl] if isinstance(key, tuple): for subkey in key: if isinstance(subkey, list) and "-" in subkey: - # not present in the index level, ignored, will raise in future - warn = FutureWarning - - with tm.assert_produces_warning(warn, match=msg): - expected = df.loc[slice_] + # not present in the index level, raises KeyError since 2.0 + with pytest.raises(KeyError, match="-"): + df.loc[slice_] + return - with tm.assert_produces_warning(warn, match=msg): - result = df.loc[non_reducing_slice(slice_)] + expected = df.loc[slice_] + result = df.loc[non_reducing_slice(slice_)] tm.assert_frame_equal(result, expected) From 36936a32ba121c6485162500876153adf023f170 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 10 Nov 2022 20:39:41 -0500 Subject: [PATCH 31/39] BUG: groupby.nth should be a filter (#49262) --- doc/source/user_guide/groupby.rst | 38 +++--- doc/source/whatsnew/v2.0.0.rst | 56 ++++++++- pandas/core/groupby/base.py | 2 +- pandas/core/groupby/groupby.py | 131 +++++++------------- pandas/tests/groupby/test_categorical.py | 6 +- pandas/tests/groupby/test_function.py | 1 - pandas/tests/groupby/test_grouping.py | 2 + pandas/tests/groupby/test_nth.py | 151 +++++++++-------------- 8 files changed, 179 insertions(+), 208 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 
dae42dd4f1118..d8a36b1711b6e 100644
--- a/doc/source/user_guide/groupby.rst
+++ b/doc/source/user_guide/groupby.rst
@@ -1354,9 +1354,14 @@ This shows the first or last n rows from each group.
 Taking the nth row of each group
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-To select from a DataFrame or Series the nth item, use
-:meth:`~pd.core.groupby.DataFrameGroupBy.nth`. This is a reduction method, and
-will return a single row (or no row) per group if you pass an int for n:
+To select the nth item from each group, use :meth:`.DataFrameGroupBy.nth` or
+:meth:`.SeriesGroupBy.nth`. Arguments supplied can be any integer, lists of integers,
+slices, or lists of slices; see below for examples. When the nth element of a group
+does not exist, an error is *not* raised; instead no corresponding rows are returned.
+
+In general this operation acts as a filtration. In certain cases it will also return
+one row per group, making it also a reduction. However, because in general it can
+return zero or multiple rows per group, pandas treats it as a filtration in all cases.

 .. ipython:: python

@@ -1367,6 +1372,14 @@ will return a single row (or no row) per group if you pass an int for n:
     g.nth(-1)
     g.nth(1)

+If the nth element of a group does not exist, then no corresponding row is included
+in the result. In particular, if the specified ``n`` is larger than any group, the
+result will be an empty DataFrame.
+
+.. ipython:: python
+
+    g.nth(5)
+
 If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna:

 .. ipython:: python
@@ -1376,21 +1389,11 @@ If you want to select the nth not-null item, use the ``dropna`` kwarg. For a Dat

     g.first()

     # nth(-1) is the same as g.last()
-    g.nth(-1, dropna="any")  # NaNs denote group exhausted when using dropna
+    g.nth(-1, dropna="any")

     g.last()

     g.B.nth(0, dropna="all")

-As with other methods, passing ``as_index=False``, will achieve a filtration, which returns the grouped row.
-
-.. ipython:: python
-
-    df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
-    g = df.groupby("A", as_index=False)
-
-    g.nth(0)
-    g.nth(-1)
-
 You can also select multiple rows from each group by specifying multiple nth values as a list of ints.

 .. ipython:: python
@@ -1400,6 +1403,13 @@ You can also select multiple rows from each group by specifying multiple nth val
     # get the first, 4th, and last date index for each month
     df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])

+You may also use slices or lists of slices.
+
+.. ipython:: python
+
+    df.groupby([df.index.year, df.index.month]).nth[1:]
+    df.groupby([df.index.year, df.index.month]).nth[1:, :-1]
+
 Enumerate group items
 ~~~~~~~~~~~~~~~~~~~~~

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index f81660ba2136b..715ba95eb950b 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -72,7 +72,7 @@ Notable bug fixes

 These are bug fixes that might have notable behavior changes.

-.. _whatsnew_200.notable_bug_fixes.notable_bug_fix1:
+.. _whatsnew_200.notable_bug_fixes.cumsum_cumprod_overflow:

 :meth:`.GroupBy.cumsum` and :meth:`.GroupBy.cumprod` overflow instead of lossy casting to float
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -102,10 +102,58 @@ We return incorrect results with the 6th value.

 We overflow with the 7th value, but the 6th value is still correct.

-.. _whatsnew_200.notable_bug_fixes.notable_bug_fix2:
+.. _whatsnew_200.notable_bug_fixes.groupby_nth_filter:

-notable_bug_fix2
-^^^^^^^^^^^^^^^^
+:meth:`.DataFrameGroupBy.nth` and :meth:`.SeriesGroupBy.nth` now behave as filtrations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions of pandas, :meth:`.DataFrameGroupBy.nth` and
+:meth:`.SeriesGroupBy.nth` acted as if they were aggregations. However, for most
+inputs ``n``, they may return either zero or multiple rows per group. This means
+that they are filtrations, similar to e.g. :meth:`.DataFrameGroupBy.head`. pandas
+now treats them as filtrations (:issue:`13666`).
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 1, 2, 1, 2], "b": [np.nan, 2.0, 3.0, 4.0, 5.0]})
+    gb = df.groupby("a")
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [5]: gb.nth(n=1)
+    Out[5]:
+       A    B
+    1  1  2.0
+    4  2  5.0
+
+*New Behavior*
+
+.. ipython:: python
+
+    gb.nth(n=1)
+
+In particular, the index of the result is derived from the input by selecting
+the appropriate rows. Also, when ``n`` is larger than the group, no rows are
+returned instead of ``NaN``.
+
+*Old Behavior*
+
+.. code-block:: ipython
+
+    In [5]: gb.nth(n=3, dropna="any")
+    Out[5]:
+        B
+    A
+    1 NaN
+    2 NaN
+
+*New Behavior*
+
+.. ipython:: python
+
+    gb.nth(n=3, dropna="any")

 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.api_breaking:
diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py
index a2e9c059cbcc9..0f6d39be7d32f 100644
--- a/pandas/core/groupby/base.py
+++ b/pandas/core/groupby/base.py
@@ -37,7 +37,6 @@ class OutputKey:
     "mean",
     "median",
     "min",
-    "nth",
     "nunique",
     "prod",
     # as long as `quantile`'s signature accepts only
@@ -100,6 +99,7 @@ class OutputKey:
     "indices",
     "ndim",
     "ngroups",
+    "nth",
     "ohlc",
     "pipe",
     "plot",
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index f1c18b7762f66..d10931586d5e0 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2978,97 +2978,68 @@ def nth(
         ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
         >>> g = df.groupby('A')
         >>> g.nth(0)
-        B
-        A
-        1 NaN
-        2 3.0
+           A    B
+        0  1  NaN
+        2  2  3.0
         >>> g.nth(1)
-        B
-        A
-        1 2.0
-        2 5.0
+           A    B
+        1  1  2.0
+        4  2  5.0
         >>> g.nth(-1)
-        B
-        A
-        1 4.0
-        2 5.0
+           A    B
+        3  1  4.0
+        4  2  5.0
         >>> g.nth([0, 1])
-        B
-        A
-        1  NaN
-        1  2.0
-        2  3.0
-        2  5.0
+           A    B
+        0  1  NaN
+        1  1  2.0
+        2  2  3.0
+        4  2  5.0
         >>> g.nth(slice(None, -1))
-        B
-        A
-        1  NaN
-        1  2.0
-        2  3.0
+           A    B
+        0  1  NaN
+        1  1  2.0
+        2  2  3.0

         Index notation may also be used

         >>> g.nth[0, 1]
-        B
-        A
-        1  NaN
-        1  2.0
-        2  3.0
-        2  5.0
+           A    B
+        0  1  NaN
+        1  1  2.0
+        2  2  3.0
+        4  2  5.0
         >>> g.nth[:-1]
-        B
-        A
-        1  NaN
-        1  2.0
-        2  3.0
+           A    B
+        0  1  NaN
+        1  1  2.0
+        2  2  3.0

-        Specifying `dropna` allows count ignoring ``NaN``
+        Specifying `dropna` allows ignoring ``NaN`` values

         >>> g.nth(0, dropna='any')
-        B
-        A
-        1 2.0
-        2 3.0
+           A    B
+        1  1  2.0
+        2  2  3.0

-        NaNs denote group exhausted when using dropna
+        When the specified ``n`` is larger than any of the groups, an
+        empty DataFrame is returned

         >>> g.nth(3, dropna='any')
-        B
-        A
-        1 NaN
-        2 NaN
-
-        Specifying `as_index=False` in `groupby` keeps the original index.
- - >>> df.groupby('A', as_index=False).nth(1) - A B - 1 1 2.0 - 4 2 5.0 + Empty DataFrame + Columns: [A, B] + Index: [] """ if not dropna: - with self._group_selection_context(): - mask = self._make_mask_from_positional_indexer(n) + mask = self._make_mask_from_positional_indexer(n) - ids, _, _ = self.grouper.group_info + ids, _, _ = self.grouper.group_info - # Drop NA values in grouping - mask = mask & (ids != -1) + # Drop NA values in grouping + mask = mask & (ids != -1) - out = self._mask_selected_obj(mask) - if not self.as_index: - return out - - result_index = self.grouper.result_index - if self.axis == 0: - out.index = result_index[ids[mask]] - if not self.observed and isinstance(result_index, CategoricalIndex): - out = out.reindex(result_index) - - out = self._reindex_output(out) - else: - out.columns = result_index[ids[mask]] - - return out.sort_index(axis=self.axis) if self.sort else out + out = self._mask_selected_obj(mask) + return out # dropna is truthy if not is_integer(n): @@ -3085,7 +3056,6 @@ def nth( # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf n = cast(int, n) - max_len = n if n >= 0 else -1 - n dropped = self.obj.dropna(how=dropna, axis=self.axis) # get a new grouper for our dropped obj @@ -3115,22 +3085,7 @@ def nth( grb = dropped.groupby( grouper, as_index=self.as_index, sort=self.sort, axis=self.axis ) - sizes, result = grb.size(), grb.nth(n) - mask = (sizes < max_len)._values - - # set the results which don't meet the criteria - if len(result) and mask.any(): - result.loc[mask] = np.nan - - # reset/reindex to the original groups - if len(self.obj) == len(dropped) or len(result) == len( - self.grouper.result_index - ): - result.index = self.grouper.result_index - else: - result = result.reindex(self.grouper.result_index) - - return result + return grb.nth(n) @final def quantile( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1e2bcb58110dd..ca794d4ae5a3e 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -563,11 +563,7 @@ def test_observed_nth(): df = DataFrame({"cat": cat, "ser": ser}) result = df.groupby("cat", observed=False)["ser"].nth(0) - - index = Categorical(["a", "b", "c"], categories=["a", "b", "c"]) - expected = Series([1, np.nan, np.nan], index=index, name="ser") - expected.index.name = "cat" - + expected = df["ser"].iloc[[0]] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 5383a4d28c8ce..f05874c3286c7 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -405,7 +405,6 @@ def test_median_empty_bins(observed): ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), - ("nth", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}], "args": [1]}), ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}), ], ) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 1c8b8e3d33ecf..e3b7ad8f78750 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -851,6 +851,8 @@ def test_groupby_with_single_column(self): exp = DataFrame(index=Index(["a", "b", "s"], name="a")) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) + + 
exp = df.iloc[[3, 4, 5]] tm.assert_frame_equal(df.groupby("a").nth(1), exp) def test_gb_key_len_equal_axis_len(self): diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 187c80075f36b..de5025b998b30 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -23,6 +23,7 @@ def test_first_last_nth(df): tm.assert_frame_equal(first, expected) nth = grouped.nth(0) + expected = df.loc[[0, 1]] tm.assert_frame_equal(nth, expected) last = grouped.last() @@ -31,12 +32,11 @@ def test_first_last_nth(df): tm.assert_frame_equal(last, expected) nth = grouped.nth(-1) + expected = df.iloc[[5, 7]] tm.assert_frame_equal(nth, expected) nth = grouped.nth(1) - expected = df.loc[[2, 3], ["B", "C", "D"]].copy() - expected.index = Index(["foo", "bar"], name="A") - expected = expected.sort_index() + expected = df.iloc[[2, 3]] tm.assert_frame_equal(nth, expected) # it works! @@ -47,7 +47,7 @@ def test_first_last_nth(df): df.loc[df["A"] == "foo", "B"] = np.nan assert isna(grouped["B"].first()["foo"]) assert isna(grouped["B"].last()["foo"]) - assert isna(grouped["B"].nth(0)["foo"]) + assert isna(grouped["B"].nth(0).iloc[0]) # v0.14.0 whatsnew df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) @@ -56,7 +56,7 @@ def test_first_last_nth(df): expected = df.iloc[[1, 2]].set_index("A") tm.assert_frame_equal(result, expected) - expected = df.iloc[[1, 2]].set_index("A") + expected = df.iloc[[1, 2]] result = g.nth(0, dropna="any") tm.assert_frame_equal(result, expected) @@ -82,18 +82,10 @@ def test_first_last_with_na_object(method, nulls_fixture): @pytest.mark.parametrize("index", [0, -1]) def test_nth_with_na_object(index, nulls_fixture): # https://github.com/pandas-dev/pandas/issues/32123 - groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a") + df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}) + groups = df.groupby("a") result = groups.nth(index) - - if index == 0: - values = [1, 3] - else: - values = [2, nulls_fixture] - - values = np.array(values, dtype=result["b"].dtype) - idx = Index([1, 2], name="a") - expected = DataFrame({"b": values}, index=idx) - + expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]] tm.assert_frame_equal(result, expected) @@ -149,9 +141,7 @@ def test_first_last_nth_dtypes(df_mixed_floats): tm.assert_frame_equal(last, expected) nth = grouped.nth(1) - expected = df.loc[[3, 2], ["B", "C", "D", "E", "F"]] - expected.index = Index(["bar", "foo"], name="A") - expected = expected.sort_index() + expected = df.iloc[[2, 3]] tm.assert_frame_equal(nth, expected) # GH 2763, first/last shifting dtypes @@ -166,11 +156,13 @@ def test_first_last_nth_dtypes(df_mixed_floats): def test_first_last_nth_nan_dtype(): # GH 33591 df = DataFrame({"data": ["A"], "nans": Series([np.nan], dtype=object)}) - grouped = df.groupby("data") + expected = df.set_index("data").nans tm.assert_series_equal(grouped.nans.first(), expected) tm.assert_series_equal(grouped.nans.last(), expected) + + expected = df.nans tm.assert_series_equal(grouped.nans.nth(-1), expected) tm.assert_series_equal(grouped.nans.nth(0), expected) @@ -198,23 +190,21 @@ def test_nth(): df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) g = df.groupby("A") - tm.assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index("A")) - tm.assert_frame_equal(g.nth(1), df.iloc[[1]].set_index("A")) - tm.assert_frame_equal(g.nth(2), df.loc[[]].set_index("A")) - tm.assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index("A")) - 
tm.assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index("A")) - tm.assert_frame_equal(g.nth(-3), df.loc[[]].set_index("A")) - tm.assert_series_equal(g.B.nth(0), df.set_index("A").B.iloc[[0, 2]]) - tm.assert_series_equal(g.B.nth(1), df.set_index("A").B.iloc[[1]]) - tm.assert_frame_equal(g[["B"]].nth(0), df.loc[[0, 2], ["A", "B"]].set_index("A")) + tm.assert_frame_equal(g.nth(0), df.iloc[[0, 2]]) + tm.assert_frame_equal(g.nth(1), df.iloc[[1]]) + tm.assert_frame_equal(g.nth(2), df.loc[[]]) + tm.assert_frame_equal(g.nth(-1), df.iloc[[1, 2]]) + tm.assert_frame_equal(g.nth(-2), df.iloc[[0]]) + tm.assert_frame_equal(g.nth(-3), df.loc[[]]) + tm.assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]]) + tm.assert_series_equal(g.B.nth(1), df.B.iloc[[1]]) + tm.assert_frame_equal(g[["B"]].nth(0), df[["B"]].iloc[[0, 2]]) - exp = df.set_index("A") - tm.assert_frame_equal(g.nth(0, dropna="any"), exp.iloc[[1, 2]]) - tm.assert_frame_equal(g.nth(-1, dropna="any"), exp.iloc[[1, 2]]) + tm.assert_frame_equal(g.nth(0, dropna="any"), df.iloc[[1, 2]]) + tm.assert_frame_equal(g.nth(-1, dropna="any"), df.iloc[[1, 2]]) - exp["B"] = np.nan - tm.assert_frame_equal(g.nth(7, dropna="any"), exp.iloc[[1, 2]]) - tm.assert_frame_equal(g.nth(2, dropna="any"), exp.iloc[[1, 2]]) + tm.assert_frame_equal(g.nth(7, dropna="any"), df.iloc[:0]) + tm.assert_frame_equal(g.nth(2, dropna="any"), df.iloc[:0]) # out of bounds, regression from 0.13.1 # GH 6621 @@ -263,13 +253,6 @@ def test_nth(): assert expected.iloc[0] == v assert expected2.iloc[0] == v - # this is NOT the same as .first (as sorted is default!) - # as it keeps the order in the series (and not the group order) - # related GH 7287 - expected = s.groupby(g, sort=False).first() - result = s.groupby(g, sort=False).nth(0, dropna="all") - tm.assert_series_equal(result, expected) - with pytest.raises(ValueError, match="For a DataFrame"): s.groupby(g, sort=False).nth(0, dropna=True) @@ -277,21 +260,21 @@ def test_nth(): df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) g = df.groupby("A") result = g.B.nth(0, dropna="all") - expected = g.B.first() + expected = df.B.iloc[[1, 2]] tm.assert_series_equal(result, expected) # test multiple nth values df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) g = df.groupby("A") - tm.assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index("A")) - tm.assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index("A")) - tm.assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index("A")) - tm.assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index("A")) - tm.assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) - tm.assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) - tm.assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index("A")) - tm.assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index("A")) + tm.assert_frame_equal(g.nth(0), df.iloc[[0, 3]]) + tm.assert_frame_equal(g.nth([0]), df.iloc[[0, 3]]) + tm.assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]]) + tm.assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]]) + tm.assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]]) + tm.assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]]) + tm.assert_frame_equal(g.nth([2]), df.iloc[[2]]) + tm.assert_frame_equal(g.nth([3, 4]), df.loc[[]]) business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") df = DataFrame(1, index=business_dates, columns=["a", "b"]) @@ -318,12 +301,12 @@ def test_nth(): 
tm.assert_frame_equal(result, expected) -def test_nth_multi_index(three_group): +def test_nth_multi_grouper(three_group): # PR 9090, related to issue 8979 - # test nth on MultiIndex, should match .first() + # test nth on multiple groupers grouped = three_group.groupby(["A", "B"]) result = grouped.nth(0) - expected = grouped.first() + expected = three_group.iloc[[0, 3, 4, 7]] tm.assert_frame_equal(result, expected) @@ -504,13 +487,7 @@ def test_nth_multi_index_as_expected(): ) grouped = three_group.groupby(["A", "B"]) result = grouped.nth(0) - expected = DataFrame( - {"C": ["dull", "dull", "dull", "dull"]}, - index=MultiIndex.from_arrays( - [["bar", "bar", "foo", "foo"], ["one", "two", "one", "two"]], - names=["A", "B"], - ), - ) + expected = three_group.iloc[[0, 3, 4, 7]] tm.assert_frame_equal(result, expected) @@ -567,7 +544,7 @@ def test_groupby_head_tail_axis_1(op, n, expected_cols): def test_group_selection_cache(): # GH 12839 nth, head, and tail should return same result consistently df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) - expected = df.iloc[[0, 2]].set_index("A") + expected = df.iloc[[0, 2]] g = df.groupby("A") result1 = g.head(n=2) @@ -598,13 +575,11 @@ def test_nth_empty(): # GH 16064 df = DataFrame(index=[0], columns=["a", "b", "c"]) result = df.groupby("a").nth(10) - expected = DataFrame(index=Index([], name="a"), columns=["b", "c"]) + expected = df.iloc[:0] tm.assert_frame_equal(result, expected) result = df.groupby(["a", "b"]).nth(10) - expected = DataFrame( - index=MultiIndex([[], []], [[], []], names=["a", "b"]), columns=["c"] - ) + expected = df.iloc[:0] tm.assert_frame_equal(result, expected) @@ -616,15 +591,11 @@ def test_nth_column_order(): columns=["A", "C", "B"], ) result = df.groupby("A").nth(0) - expected = DataFrame( - [["b", 100.0], ["c", 200.0]], columns=["C", "B"], index=Index([1, 2], name="A") - ) + expected = df.iloc[[0, 3]] tm.assert_frame_equal(result, expected) result = df.groupby("A").nth(-1, dropna="any") - expected = DataFrame( - [["a", 50.0], ["d", 150.0]], columns=["C", "B"], index=Index([1, 2], name="A") - ) + expected = df.iloc[[1, 4]] tm.assert_frame_equal(result, expected) @@ -636,9 +607,7 @@ def test_nth_nan_in_grouper(dropna): columns=list("abc"), ) result = df.groupby("a").nth(0, dropna=dropna) - expected = DataFrame( - [[2, 3], [6, 7]], columns=list("bc"), index=Index(["abc", "def"], name="a") - ) + expected = df.iloc[[1, 3]] tm.assert_frame_equal(result, expected) @@ -772,29 +741,21 @@ def test_groupby_nth_with_column_axis(): columns=["C", "B", "A"], ) result = df.groupby(df.iloc[1], axis=1).nth(0) - expected = DataFrame( - [ - [6, 4], - [7, 8], - ], - index=["z", "y"], - columns=[7, 8], - ) - expected.columns.name = "y" + expected = df.iloc[:, [0, 2]] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "start, stop, expected_values, expected_columns", [ - (None, None, [0, 1, 2, 3, 4], [5, 5, 5, 6, 6]), - (None, 1, [0, 3], [5, 6]), - (None, 9, [0, 1, 2, 3, 4], [5, 5, 5, 6, 6]), - (None, -1, [0, 1, 3], [5, 5, 6]), - (1, None, [1, 2, 4], [5, 5, 6]), - (1, -1, [1], [5]), - (-1, None, [2, 4], [5, 6]), - (-1, 2, [4], [6]), + (None, None, [0, 1, 2, 3, 4], list("ABCDE")), + (None, 1, [0, 3], list("AD")), + (None, 9, [0, 1, 2, 3, 4], list("ABCDE")), + (None, -1, [0, 1, 3], list("ABD")), + (1, None, [1, 2, 4], list("BCE")), + (1, -1, [1], list("B")), + (-1, None, [2, 4], list("CE")), + (-1, 2, [4], list("E")), ], ) @pytest.mark.parametrize("method", ["call", "index"]) @@ -807,7 +768,7 @@ def 
test_nth_slices_with_column_axis( "call": lambda start, stop: gb.nth(slice(start, stop)), "index": lambda start, stop: gb.nth[start:stop], }[method](start, stop) - expected = DataFrame([expected_values], columns=expected_columns) + expected = DataFrame([expected_values], columns=[expected_columns]) tm.assert_frame_equal(result, expected) @@ -824,7 +785,7 @@ def test_head_tail_dropna_true(): result = df.groupby(["X", "Y"]).tail(n=1) tm.assert_frame_equal(result, expected) - result = df.groupby(["X", "Y"]).nth(n=0).reset_index() + result = df.groupby(["X", "Y"]).nth(n=0) tm.assert_frame_equal(result, expected) @@ -839,5 +800,5 @@ def test_head_tail_dropna_false(): result = df.groupby(["X", "Y"], dropna=False).tail(n=1) tm.assert_frame_equal(result, expected) - result = df.groupby(["X", "Y"], dropna=False).nth(n=0).reset_index() + result = df.groupby(["X", "Y"], dropna=False).nth(n=0) tm.assert_frame_equal(result, expected) From 0c55e1888b4de0185ce4b95c9744d4ae0ec05222 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 11 Nov 2022 10:06:42 +0700 Subject: [PATCH 32/39] CI: Updating website sync to new server (#49614) --- .github/workflows/docbuild-and-upload.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 48a08d4febbaf..1db8fb9a70254 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -64,22 +64,22 @@ jobs: mkdir -m 700 -p ~/.ssh echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa - echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts + echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFjYkJBk7sos+r7yATODogQc3jUdW1aascGpyOD4bohj8dWjzwLJv/OJ/fyOQ5lmj81WKDk67tGtqNJYGL9acII=" > ~/.ssh/known_hosts if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) - name: Copy cheatsheets into site directory run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/ - name: Upload web - run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas + run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ web@${{ secrets.server_ip }}:/var/www/html if: github.event_name == 'push' && github.ref == 'refs/heads/main' - name: Upload dev docs - run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev + run: rsync -az --delete doc/build/html/ web@${{ secrets.server_ip }}:/var/www/html/pandas-docs/dev if: github.event_name == 'push' && github.ref == 'refs/heads/main' - name: Upload prod docs - run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/version/${GITHUB_REF_NAME:1} + run: rsync -az --delete doc/build/html/ web@${{ secrets.server_ip }}:/var/www/html/pandas-docs/version/${GITHUB_REF_NAME:1} if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') - name: Move docs into site directory From 4cd9b6a7608d0af2ec616845f7da225a7153e84a Mon Sep 17 00:00:00 2001 From: ram vikram singh Date: Fri, 11 Nov 2022 15:54:36 +0530 Subject: [PATCH 33/39] for #49638 updated the doc (#49639) added the `git fetch upstream` command --- 
 doc/source/development/contributing.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 6ed8dee044d1f..b230516c9c3e5 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -137,6 +137,7 @@ want to clone your fork to your machine::
     git clone https://github.com/your-user-name/pandas.git pandas-yourname
     cd pandas-yourname
     git remote add upstream https://github.com/pandas-dev/pandas.git
+    git fetch upstream

 This creates the directory ``pandas-yourname`` and connects your repository to
 the upstream (main project) *pandas* repository.

From 72b92d3d4a6ee0bd344034e8effb985e056f8ab5 Mon Sep 17 00:00:00 2001
From: ram vikram singh
Date: Fri, 11 Nov 2022 23:15:17 +0530
Subject: [PATCH 34/39] DOC: for gh-49508, describe str, Series and Mapping inputs to DataFrame.astype (#49556)

* for #49508: document that a Series can also be passed as the ``dtype``
  argument of ``DataFrame.astype``
---
 pandas/core/generic.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index d26a11eae9f7f..09ecb2d331d70 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6004,11 +6004,12 @@ def astype(

     Parameters
     ----------
-    dtype : data type, or dict of column name -> data type
-        Use a numpy.dtype or Python type to cast entire pandas object to
-        the same type. Alternatively, use {col: dtype, ...}, where col is a
-        column label and dtype is a numpy.dtype or Python type to cast one
-        or more of the DataFrame's columns to column-specific types.
+    dtype : str, data type, Series or Mapping of column name -> data type
+        Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to
+        cast the entire pandas object to the same type. Alternatively, use a
+        mapping, e.g. {col: dtype, ...}, where col is a column label and
+        dtype is a numpy.dtype or Python type to cast one or more of the
+        DataFrame's columns to column-specific types.
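
As a quick illustration of the parameter description above (an editorial
sketch, not part of the patch; the frame and dtypes are hypothetical), the
dict form and the newly documented Series form of ``dtype`` behave alike:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

    # one dtype for the whole frame
    df.astype("float64")

    # per-column casting: a dict of column label -> dtype ...
    df.astype({"a": "int32"})

    # ... or, equivalently, a Series indexed by column labels
    df.astype(pd.Series({"a": "int32", "b": "float32"}))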
copy : bool, default True Return a copy when ``copy=True`` (be very careful setting ``copy=False`` as changes to values then may propagate to other From f52331f7ac546fcead32982606b54b811181b05a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 11 Nov 2022 09:48:45 -0800 Subject: [PATCH 35/39] DEPR: Remove df.reduction(level) (#49611) * DEPR: Remove df.reduction(level) * test_*_consistency * Fix asv * Add issue ref --- asv_bench/benchmarks/frame_methods.py | 4 +- asv_bench/benchmarks/stat_ops.py | 20 +- doc/source/whatsnew/v0.15.2.rst | 10 +- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/frame.py | 64 +------ pandas/core/generic.py | 174 +++--------------- pandas/core/groupby/generic.py | 5 - pandas/core/series.py | 43 +---- .../test_count_with_level_deprecated.py | 123 ------------- pandas/tests/frame/test_reductions.py | 74 -------- pandas/tests/frame/test_subclass.py | 5 +- pandas/tests/generic/test_finalize.py | 6 - pandas/tests/groupby/test_allowlist.py | 19 +- pandas/tests/groupby/test_api_consistency.py | 20 +- pandas/tests/groupby/test_groupby.py | 4 - pandas/tests/reductions/test_reductions.py | 56 +----- .../tests/reductions/test_stat_reductions.py | 9 +- pandas/tests/series/methods/test_count.py | 68 ------- pandas/tests/series/test_reductions.py | 14 +- pandas/tests/test_multilevel.py | 129 +------------ 20 files changed, 70 insertions(+), 778 deletions(-) delete mode 100644 pandas/tests/frame/methods/test_count_with_level_deprecated.py diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 5bb87b8bb2663..9a5fc1c607f6a 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -454,10 +454,10 @@ def setup(self, axis): ) def time_count_level_multi(self, axis): - self.df.count(axis=axis, level=1) + self.df.count(axis=axis) def time_count_level_mixed_dtypes_multi(self, axis): - self.df_mixed.count(axis=axis, level=1) + self.df_mixed.count(axis=axis) class Apply: diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 19fa7f7a06cf2..09244b31fbba7 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -23,10 +23,10 @@ def time_op(self, op, dtype, axis): class FrameMultiIndexOps: - params = ([0, 1, [0, 1]], ops) - param_names = ["level", "op"] + params = [ops] + param_names = ["op"] - def setup(self, level, op): + def setup(self, op): levels = [np.arange(10), np.arange(100), np.arange(100)] codes = [ np.arange(10).repeat(10000), @@ -37,8 +37,8 @@ def setup(self, level, op): df = pd.DataFrame(np.random.randn(len(index), 4), index=index) self.df_func = getattr(df, op) - def time_op(self, level, op): - self.df_func(level=level) + def time_op(self, op): + self.df_func() class SeriesOps: @@ -56,10 +56,10 @@ def time_op(self, op, dtype): class SeriesMultiIndexOps: - params = ([0, 1, [0, 1]], ops) - param_names = ["level", "op"] + params = [ops] + param_names = ["op"] - def setup(self, level, op): + def setup(self, op): levels = [np.arange(10), np.arange(100), np.arange(100)] codes = [ np.arange(10).repeat(10000), @@ -70,8 +70,8 @@ def setup(self, level, op): s = pd.Series(np.random.randn(len(index)), index=index) self.s_func = getattr(s, op) - def time_op(self, level, op): - self.s_func(level=level) + def time_op(self, op): + self.s_func() class Rank: diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index fd4946c9765e1..bb7beef449d93 100644 --- 
a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -154,11 +154,13 @@ Other enhancements: - ``Series.all`` and ``Series.any`` now support the ``level`` and ``skipna`` parameters (:issue:`8302`): - .. ipython:: python - :okwarning: + .. code-block:: python - s = pd.Series([False, True, False], index=[0, 0, 1]) - s.any(level=0) + >>> s = pd.Series([False, True, False], index=[0, 0, 1]) + >>> s.any(level=0) + 0 True + 1 False + dtype: bool - ``Panel`` now supports the ``all`` and ``any`` aggregation functions. (:issue:`8302`): diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 715ba95eb950b..a2a20956e42bd 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -469,6 +469,7 @@ Removal of prior version deprecations/changes - Removed :meth:`.Rolling.validate`, :meth:`.Expanding.validate`, and :meth:`.ExponentialMovingWindow.validate` (:issue:`43665`) - Removed :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`) - Removed :attr:`Rolling.is_datetimelike` (:issue:`38963`) +- Removed the ``level`` keyword in :class:`DataFrame` and :class:`Series` aggregations; use ``groupby`` instead (:issue:`39983`) - Removed deprecated :meth:`Timedelta.delta`, :meth:`Timedelta.is_populated`, and :attr:`Timedelta.freq` (:issue:`46430`, :issue:`46476`) - Removed deprecated :attr:`NaT.freq` (:issue:`45071`) - Removed deprecated :meth:`Categorical.replace`, use :meth:`Series.replace` instead (:issue:`44929`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1627a7add25ed..507e14c5616a2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -118,7 +118,6 @@ maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( - ensure_platform_int, infer_dtype_from_object, is_1d_only_ea_dtype, is_bool_dtype, @@ -10331,7 +10330,7 @@ def c(x): # ---------------------------------------------------------------------- # ndarray-like stats methods - def count(self, axis: Axis = 0, level: Level = None, numeric_only: bool = False): + def count(self, axis: Axis = 0, numeric_only: bool = False): """ Count non-NA cells for each column or row. @@ -10343,10 +10342,6 @@ def count(self, axis: Axis = 0, level: Level = None, numeric_only: bool = False) axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index' counts are generated for each column. If 1 or 'columns' counts are generated for each row. - level : int or str, optional - If the axis is a `MultiIndex` (hierarchical), count along a - particular `level`, collapsing into a `DataFrame`. - A `str` specifies the level name. numeric_only : bool, default False Include only `float`, `int` or `boolean` data. @@ -10400,16 +10395,6 @@ def count(self, axis: Axis = 0, level: Level = None, numeric_only: bool = False) dtype: int64 """ axis = self._get_axis_number(axis) - if level is not None: - warnings.warn( - "Using the level keyword in DataFrame and Series aggregations is " - "deprecated and will be removed in a future version. Use groupby " - "instead. 
df.count(level=1) should use df.groupby(level=1).count().", - FutureWarning, - stacklevel=find_stack_level(), - ) - res = self._count_level(level, axis=axis, numeric_only=numeric_only) - return res.__finalize__(self, method="count") if numeric_only: frame = self._get_numeric_data() @@ -10434,53 +10419,6 @@ def count(self, axis: Axis = 0, level: Level = None, numeric_only: bool = False) return result.astype("int64").__finalize__(self, method="count") - def _count_level(self, level: Level, axis: AxisInt = 0, numeric_only: bool = False): - if numeric_only: - frame = self._get_numeric_data() - else: - frame = self - - count_axis = frame._get_axis(axis) - agg_axis = frame._get_agg_axis(axis) - - if not isinstance(count_axis, MultiIndex): - raise TypeError( - f"Can only count levels on hierarchical {self._get_axis_name(axis)}." - ) - - # Mask NaNs: Mask rows or columns where the index level is NaN, and all - # values in the DataFrame that are NaN - if frame._is_mixed_type: - # Since we have mixed types, calling notna(frame.values) might - # upcast everything to object - values_mask = notna(frame).values - else: - # But use the speedup when we have homogeneous dtypes - values_mask = notna(frame.values) - - index_mask = notna(count_axis.get_level_values(level=level)) - if axis == 1: - mask = index_mask & values_mask - else: - mask = index_mask.reshape(-1, 1) & values_mask - - if isinstance(level, int): - level_number = level - else: - level_number = count_axis._get_level_number(level) - - level_name = count_axis._names[level_number] - level_index = count_axis.levels[level_number]._rename(name=level_name) - level_codes = ensure_platform_int(count_axis.codes[level_number]) - counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=axis) - - if axis == 1: - result = self._constructor(counts, index=agg_axis, columns=level_index) - else: - result = self._constructor(counts, index=level_index, columns=agg_axis) - - return result - def _reduce( self, op, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 09ecb2d331d70..fa6ede80b6676 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10775,25 +10775,6 @@ def pct_change( rs = rs.reindex_like(data) return rs.__finalize__(self, method="pct_change") - @final - def _agg_by_level( - self, - name: str, - axis: Axis = 0, - level: Level = 0, - skipna: bool_t = True, - **kwargs, - ): - if axis is None: - raise ValueError("Must specify 'axis' when aggregating by level.") - grouped = self.groupby(level=level, axis=axis, sort=False) - if hasattr(grouped, name) and skipna: - return getattr(grouped, name)(**kwargs) - axis = self._get_axis_number(axis) - method = getattr(type(self), name) - applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs) - return grouped.aggregate(applyf) - @final def _logical_func( self, @@ -10802,24 +10783,10 @@ def _logical_func( axis: Axis = 0, bool_only: bool_t = False, skipna: bool_t = True, - level: Level | None = None, **kwargs, ) -> Series | bool_t: nv.validate_logical_func((), kwargs, fname=name) validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if level is not None: - warnings.warn( - "Using the level keyword in DataFrame and Series aggregations is " - "deprecated and will be removed in a future version. Use groupby " - "instead. df.any(level=1) should use df.groupby(level=1).any()", - FutureWarning, - stacklevel=find_stack_level(), - ) - if bool_only: - raise NotImplementedError( - "Option bool_only is not implemented with option level." 
- ) - return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) if self.ndim > 1 and axis is None: # Reduce along one dimension then the other, to simplify DataFrame._reduce @@ -10856,11 +10823,10 @@ def any( axis: Axis = 0, bool_only: bool_t = False, skipna: bool_t = True, - level: Level | None = None, **kwargs, ) -> DataFrame | Series | bool_t: return self._logical_func( - "any", nanops.nanany, axis, bool_only, skipna, level, **kwargs + "any", nanops.nanany, axis, bool_only, skipna, **kwargs ) def all( @@ -10868,11 +10834,10 @@ def all( axis: Axis = 0, bool_only: bool_t = False, skipna: bool_t = True, - level: Level | None = None, **kwargs, ) -> Series | bool_t: return self._logical_func( - "all", nanops.nanall, axis, bool_only, skipna, level, **kwargs + "all", nanops.nanall, axis, bool_only, skipna, **kwargs ) @final @@ -10931,7 +10896,6 @@ def _stat_function_ddof( func, axis: Axis | None = None, skipna: bool_t = True, - level: Level | None = None, ddof: int = 1, numeric_only: bool_t = False, **kwargs, @@ -10940,17 +10904,7 @@ def _stat_function_ddof( validate_bool_kwarg(skipna, "skipna", none_allowed=False) if axis is None: axis = self._stat_axis_number - if level is not None: - warnings.warn( - "Using the level keyword in DataFrame and Series aggregations is " - "deprecated and will be removed in a future version. Use groupby " - "instead. df.var(level=1) should use df.groupby(level=1).var().", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._agg_by_level( - name, axis=axis, level=level, skipna=skipna, ddof=ddof - ) + return self._reduce( func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof ) @@ -10959,39 +10913,36 @@ def sem( self, axis: Axis | None = None, skipna: bool_t = True, - level: Level | None = None, ddof: int = 1, numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function_ddof( - "sem", nanops.nansem, axis, skipna, level, ddof, numeric_only, **kwargs + "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs ) def var( self, axis: Axis | None = None, skipna: bool_t = True, - level: Level | None = None, ddof: int = 1, numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function_ddof( - "var", nanops.nanvar, axis, skipna, level, ddof, numeric_only, **kwargs + "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs ) def std( self, axis: Axis | None = None, skipna: bool_t = True, - level: Level | None = None, ddof: int = 1, numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function_ddof( - "std", nanops.nanstd, axis, skipna, level, ddof, numeric_only, **kwargs + "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs ) @final @@ -11001,7 +10952,6 @@ def _stat_function( func, axis: Axis | None | lib.NoDefault = None, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, **kwargs, ): @@ -11012,7 +10962,7 @@ def _stat_function( validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if axis is None and level is None and self.ndim > 1: + if axis is None and self.ndim > 1: # user must have explicitly passed axis=None # GH#21597 warnings.warn( @@ -11027,17 +10977,7 @@ def _stat_function( if axis is None: axis = self._stat_axis_number - if level is not None: - warnings.warn( - "Using the level keyword in DataFrame and Series aggregations is " - "deprecated and will be removed in a future version. Use groupby " - "instead. 
df.median(level=1) should use df.groupby(level=1).median().", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._agg_by_level( - name, axis=axis, level=level, skipna=skipna, numeric_only=numeric_only - ) + return self._reduce( func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only ) @@ -11046,7 +10986,6 @@ def min( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, **kwargs, ): @@ -11055,7 +10994,6 @@ def min( nanops.nanmin, axis, skipna, - level, numeric_only, **kwargs, ) @@ -11064,7 +11002,6 @@ def max( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, **kwargs, ): @@ -11073,7 +11010,6 @@ def max( nanops.nanmax, axis, skipna, - level, numeric_only, **kwargs, ) @@ -11082,48 +11018,44 @@ def mean( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function( - "mean", nanops.nanmean, axis, skipna, level, numeric_only, **kwargs + "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs ) def median( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function( - "median", nanops.nanmedian, axis, skipna, level, numeric_only, **kwargs + "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs ) def skew( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function( - "skew", nanops.nanskew, axis, skipna, level, numeric_only, **kwargs + "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs ) def kurt( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, **kwargs, ) -> Series | float: return self._stat_function( - "kurt", nanops.nankurt, axis, skipna, level, numeric_only, **kwargs + "kurt", nanops.nankurt, axis, skipna, numeric_only, **kwargs ) kurtosis = kurt @@ -11135,7 +11067,6 @@ def _min_count_stat_function( func, axis: Axis | None = None, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, min_count: int = 0, **kwargs, @@ -11151,22 +11082,6 @@ def _min_count_stat_function( if axis is None: axis = self._stat_axis_number - if level is not None: - warnings.warn( - "Using the level keyword in DataFrame and Series aggregations is " - "deprecated and will be removed in a future version. Use groupby " - "instead. 
df.sum(level=1) should use df.groupby(level=1).sum().", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._agg_by_level( - name, - axis=axis, - level=level, - skipna=skipna, - min_count=min_count, - numeric_only=numeric_only, - ) return self._reduce( func, @@ -11181,20 +11096,18 @@ def sum( self, axis: Axis | None = None, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, min_count: int = 0, **kwargs, ): return self._min_count_stat_function( - "sum", nanops.nansum, axis, skipna, level, numeric_only, min_count, **kwargs + "sum", nanops.nansum, axis, skipna, numeric_only, min_count, **kwargs ) def prod( self, axis: Axis | None = None, skipna: bool_t = True, - level: Level | None = None, numeric_only: bool_t = False, min_count: int = 0, **kwargs, @@ -11204,7 +11117,6 @@ def prod( nanops.nanprod, axis, skipna, - level, numeric_only, min_count, **kwargs, @@ -11235,7 +11147,6 @@ def any( axis: Axis = 0, bool_only=None, skipna: bool_t = True, - level=None, **kwargs, ): return NDFrame.any( @@ -11243,7 +11154,6 @@ def any( axis=axis, bool_only=bool_only, skipna=skipna, - level=level, **kwargs, ) @@ -11264,10 +11174,9 @@ def all( axis: Axis = 0, bool_only=None, skipna: bool_t = True, - level=None, **kwargs, ): - return NDFrame.all(self, axis, bool_only, skipna, level, **kwargs) + return NDFrame.all(self, axis, bool_only, skipna, **kwargs) setattr(cls, "all", all) @@ -11286,12 +11195,11 @@ def sem( self, axis: Axis | None = None, skipna: bool_t = True, - level=None, ddof: int = 1, numeric_only: bool_t = False, **kwargs, ): - return NDFrame.sem(self, axis, skipna, level, ddof, numeric_only, **kwargs) + return NDFrame.sem(self, axis, skipna, ddof, numeric_only, **kwargs) setattr(cls, "sem", sem) @@ -11309,12 +11217,11 @@ def var( self, axis: Axis | None = None, skipna: bool_t = True, - level=None, ddof: int = 1, numeric_only: bool_t = False, **kwargs, ): - return NDFrame.var(self, axis, skipna, level, ddof, numeric_only, **kwargs) + return NDFrame.var(self, axis, skipna, ddof, numeric_only, **kwargs) setattr(cls, "var", var) @@ -11333,12 +11240,11 @@ def std( self, axis: Axis | None = None, skipna: bool_t = True, - level=None, ddof: int = 1, numeric_only: bool_t = False, **kwargs, ): - return NDFrame.std(self, axis, skipna, level, ddof, numeric_only, **kwargs) + return NDFrame.std(self, axis, skipna, ddof, numeric_only, **kwargs) setattr(cls, "std", std) @@ -11422,14 +11328,11 @@ def sum( self, axis: Axis | None = None, skipna: bool_t = True, - level=None, numeric_only: bool_t = False, min_count: int = 0, **kwargs, ): - return NDFrame.sum( - self, axis, skipna, level, numeric_only, min_count, **kwargs - ) + return NDFrame.sum(self, axis, skipna, numeric_only, min_count, **kwargs) setattr(cls, "sum", sum) @@ -11447,14 +11350,11 @@ def prod( self, axis: Axis | None = None, skipna: bool_t = True, - level=None, numeric_only: bool_t = False, min_count: int = 0, **kwargs, ): - return NDFrame.prod( - self, axis, skipna, level, numeric_only, min_count, **kwargs - ) + return NDFrame.prod(self, axis, skipna, numeric_only, min_count, **kwargs) setattr(cls, "prod", prod) cls.product = prod @@ -11473,11 +11373,10 @@ def mean( self, axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level=None, numeric_only: bool_t = False, **kwargs, ): - return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs) + return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs) setattr(cls, "mean", mean) @@ -11495,11 +11394,10 @@ def skew( 
self, axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level=None, numeric_only: bool_t = False, **kwargs, ): - return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs) + return NDFrame.skew(self, axis, skipna, numeric_only, **kwargs) setattr(cls, "skew", skew) @@ -11520,11 +11418,10 @@ def kurt( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level=None, numeric_only: bool_t = False, **kwargs, ): - return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs) + return NDFrame.kurt(self, axis, skipna, numeric_only, **kwargs) setattr(cls, "kurt", kurt) cls.kurtosis = kurt @@ -11543,11 +11440,10 @@ def median( self, axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level=None, numeric_only: bool_t = False, **kwargs, ): - return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs) + return NDFrame.median(self, axis, skipna, numeric_only, **kwargs) setattr(cls, "median", median) @@ -11567,11 +11463,10 @@ def max( self, axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level=None, numeric_only: bool_t = False, **kwargs, ): - return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs) + return NDFrame.max(self, axis, skipna, numeric_only, **kwargs) setattr(cls, "max", max) @@ -11591,11 +11486,10 @@ def min( self, axis: AxisInt | None | lib.NoDefault = lib.no_default, skipna: bool_t = True, - level=None, numeric_only: bool_t = False, **kwargs, ): - return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs) + return NDFrame.min(self, axis, skipna, numeric_only, **kwargs) setattr(cls, "min", min) @@ -11821,12 +11715,6 @@ def _doc_params(cls): For `Series` this parameter is unused and defaults to 0. skipna : bool, default True Exclude NA/null values when computing the result. -level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a {name1}. - - .. deprecated:: 1.3.0 - The level keyword is deprecated. Use groupby instead. numeric_only : bool, default False Include only float, int, boolean columns. Not implemented for Series. @@ -11851,12 +11739,6 @@ def _doc_params(cls): skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. -level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a {name1}. - - .. deprecated:: 1.3.0 - The level keyword is deprecated. Use groupby instead. ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. @@ -11954,12 +11836,6 @@ def _doc_params(cls): True, then the result will be {empty_value}, as for an empty row/column. If skipna is False, then NA are treated as True, because these are not equal to zero. -level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a {name1}. - - .. deprecated:: 1.3.0 - The level keyword is deprecated. Use groupby instead. **kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. 
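
For readers tracking this removal, a minimal editorial sketch of the migration
implied by the diff above (not part of the patch; the index and data are
hypothetical, and pandas 2.0 semantics are assumed):

.. code-block:: python

    import pandas as pd

    idx = pd.MultiIndex.from_arrays(
        [["a", "a", "b"], [0, 1, 0]], names=["outer", "inner"]
    )
    ser = pd.Series([1.0, 2.0, 3.0], index=idx)

    # pandas < 2.0 (now removed):  ser.sum(level="outer")
    # pandas >= 2.0: group by the index level instead
    ser.groupby(level="outer").sum()

    # the same pattern replaces DataFrame reductions with level=
    df = ser.to_frame("x")
    df.groupby(level="outer").sum()   # replaces df.sum(level="outer")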
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec9c8564ab549..571559dc838f5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -40,7 +40,6 @@ CorrelationMethod, FillnaOptions, IndexLabel, - Level, Manager, Manager2D, SingleManager, @@ -864,7 +863,6 @@ def skew( self, axis: Axis | lib.NoDefault = lib.no_default, skipna: bool = True, - level: Level | None = None, numeric_only: bool | None = None, **kwargs, ) -> Series: @@ -872,7 +870,6 @@ def skew( "skew", axis=axis, skipna=skipna, - level=level, numeric_only=numeric_only, **kwargs, ) @@ -2242,7 +2239,6 @@ def skew( self, axis: Axis | None | lib.NoDefault = lib.no_default, skipna: bool = True, - level: Level | None = None, numeric_only: bool | lib.NoDefault = lib.no_default, **kwargs, ) -> DataFrame: @@ -2250,7 +2246,6 @@ def skew( "skew", axis=axis, skipna=skipna, - level=level, numeric_only=numeric_only, **kwargs, ) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2664988a7b8d4..44732b9060ff9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1992,16 +1992,10 @@ def groupby( # Statistics, overridden ndarray methods # TODO: integrate bottleneck - def count(self, level: Level = None): + def count(self): """ Return number of non-NA/null observations in the Series. - Parameters - ---------- - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a smaller Series. - Returns ------- int or Series (if level specified) @@ -2017,40 +2011,7 @@ def count(self, level: Level = None): >>> s.count() 2 """ - if level is None: - return notna(self._values).sum().astype("int64") - else: - warnings.warn( - "Using the level keyword in DataFrame and Series aggregations is " - "deprecated and will be removed in a future version. Use groupby " - "instead. 
ser.count(level=1) should use ser.groupby(level=1).count().", - FutureWarning, - stacklevel=find_stack_level(), - ) - if not isinstance(self.index, MultiIndex): - raise ValueError("Series.count level is only valid with a MultiIndex") - - index = self.index - assert isinstance(index, MultiIndex) # for mypy - - if isinstance(level, str): - level = index._get_level_number(level) - - lev = index.levels[level] - level_codes = np.array(index.codes[level], subok=False, copy=True) - - mask = level_codes == -1 - if mask.any(): - level_codes[mask] = cnt = len(lev) - lev = lev.insert(cnt, lev._na_value) - - obs = level_codes[notna(self._values)] - # error: Argument "minlength" to "bincount" has incompatible type - # "Optional[int]"; expected "SupportsIndex" - out = np.bincount(obs, minlength=len(lev) or None) # type: ignore[arg-type] - return self._constructor(out, index=lev, dtype="int64").__finalize__( - self, method="count" - ) + return notna(self._values).sum().astype("int64") def mode(self, dropna: bool = True) -> Series: """ diff --git a/pandas/tests/frame/methods/test_count_with_level_deprecated.py b/pandas/tests/frame/methods/test_count_with_level_deprecated.py deleted file mode 100644 index f6fbc281c7a8e..0000000000000 --- a/pandas/tests/frame/methods/test_count_with_level_deprecated.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import pytest - -from pandas import ( - DataFrame, - Index, - Series, -) -import pandas._testing as tm - - -class TestDataFrameCount: - def test_count_multiindex(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - - frame = frame.copy() - frame.index.names = ["a", "b"] - - with tm.assert_produces_warning(FutureWarning): - result = frame.count(level="b") - with tm.assert_produces_warning(FutureWarning): - expected = frame.count(level=1) - tm.assert_frame_equal(result, expected, check_names=False) - - with tm.assert_produces_warning(FutureWarning): - result = frame.count(level="a") - with tm.assert_produces_warning(FutureWarning): - expected = frame.count(level=0) - tm.assert_frame_equal(result, expected, check_names=False) - - msg = "Level x not found" - with pytest.raises(KeyError, match=msg): - with tm.assert_produces_warning(FutureWarning): - frame.count(level="x") - - def test_count_level_corner(self, multiindex_dataframe_random_data): - frame = multiindex_dataframe_random_data - - ser = frame["A"][:0] - with tm.assert_produces_warning(FutureWarning): - result = ser.count(level=0) - expected = Series(0, index=ser.index.levels[0], name="A") - tm.assert_series_equal(result, expected) - - df = frame[:0] - with tm.assert_produces_warning(FutureWarning): - result = df.count(level=0) - expected = ( - DataFrame( - index=ser.index.levels[0].set_names(["first"]), columns=df.columns - ) - .fillna(0) - .astype(np.int64) - ) - tm.assert_frame_equal(result, expected) - - def test_count_index_with_nan(self): - # https://github.com/pandas-dev/pandas/issues/21824 - df = DataFrame( - { - "Person": ["John", "Myla", None, "John", "Myla"], - "Age": [24.0, 5, 21.0, 33, 26], - "Single": [False, True, True, True, False], - } - ) - - # count on row labels - with tm.assert_produces_warning(FutureWarning): - res = df.set_index(["Person", "Single"]).count(level="Person") - expected = DataFrame( - index=Index(["John", "Myla"], name="Person"), - columns=Index(["Age"]), - data=[2, 2], - ) - tm.assert_frame_equal(res, expected) - - # count on column labels - with tm.assert_produces_warning(FutureWarning): - res = df.set_index(["Person", 
"Single"]).T.count(level="Person", axis=1) - expected = DataFrame( - columns=Index(["John", "Myla"], name="Person"), - index=Index(["Age"]), - data=[[2, 2]], - ) - tm.assert_frame_equal(res, expected) - - def test_count_level( - self, - multiindex_year_month_day_dataframe_random_data, - multiindex_dataframe_random_data, - ): - ymd = multiindex_year_month_day_dataframe_random_data - frame = multiindex_dataframe_random_data - - def _check_counts(frame, axis=0): - index = frame._get_axis(axis) - for i in range(index.nlevels): - with tm.assert_produces_warning(FutureWarning): - result = frame.count(axis=axis, level=i) - expected = frame.groupby(axis=axis, level=i).count() - expected = expected.reindex_like(result).astype("i8") - tm.assert_frame_equal(result, expected) - - frame.iloc[1, [1, 2]] = np.nan - frame.iloc[7, [0, 1]] = np.nan - ymd.iloc[1, [1, 2]] = np.nan - ymd.iloc[7, [0, 1]] = np.nan - - _check_counts(frame) - _check_counts(ymd) - _check_counts(frame.T, axis=1) - _check_counts(ymd.T, axis=1) - - # can't call with level on regular DataFrame - df = tm.makeTimeDataFrame() - with pytest.raises(TypeError, match="hierarchical"): - with tm.assert_produces_warning(FutureWarning): - df.count(level=0) - - frame["D"] = "foo" - with tm.assert_produces_warning(FutureWarning): - result = frame.count(level=0, numeric_only=True) - tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp")) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 0e5c6057b9a61..b7474060a7e8a 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -18,7 +18,6 @@ Categorical, DataFrame, Index, - MultiIndex, Series, Timestamp, date_range, @@ -493,21 +492,6 @@ def test_sem(self, datetime_frame): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() - @td.skip_if_no_scipy - def test_kurt(self): - index = MultiIndex( - levels=[["bar"], ["one", "two", "three"], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], - ) - df = DataFrame(np.random.randn(6, 3), index=index) - - kurt = df.kurt() - with tm.assert_produces_warning(FutureWarning): - kurt2 = df.kurt(level=0).xs("bar") - tm.assert_series_equal(kurt, kurt2, check_names=False) - assert kurt.name is None - assert kurt2.name == "bar" - @pytest.mark.parametrize( "dropna, expected", [ @@ -1316,19 +1300,6 @@ def test_any_all_object_bool_only(self): assert df.any(bool_only=True, axis=None) - @pytest.mark.parametrize("method", ["any", "all"]) - def test_any_all_level_axis_none_raises(self, method): - df = DataFrame( - {"A": 1}, - index=MultiIndex.from_product( - [["A", "B"], ["a", "b"]], names=["out", "in"] - ), - ) - xpr = "Must specify 'axis' when aggregating by level." 
- with pytest.raises(ValueError, match=xpr): - with tm.assert_produces_warning(FutureWarning): - getattr(df, method)(axis=None, level="out") - # --------------------------------------------------------------------- # Unsorted @@ -1440,25 +1411,6 @@ def test_preserve_timezone(self, initial: str, method): result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) - def test_frame_any_all_with_level(self): - df = DataFrame( - {"data": [False, False, True, False, True, False, True]}, - index=[ - ["one", "one", "two", "one", "two", "two", "two"], - [0, 1, 0, 2, 1, 2, 3], - ], - ) - - with tm.assert_produces_warning(FutureWarning, match="Using the level"): - result = df.any(level=0) - ex = DataFrame({"data": [False, True]}, index=["one", "two"]) - tm.assert_frame_equal(result, ex) - - with tm.assert_produces_warning(FutureWarning, match="Using the level"): - result = df.all(level=0) - ex = DataFrame({"data": [False, False]}, index=["one", "two"]) - tm.assert_frame_equal(result, ex) - def test_frame_any_with_timedelta(self): # GH#17667 df = DataFrame( @@ -1476,16 +1428,6 @@ def test_frame_any_with_timedelta(self): expected = Series(data=[False, True]) tm.assert_series_equal(result, expected) - def test_reductions_deprecation_level_argument( - self, frame_or_series, reduction_functions - ): - # GH#39983 - obj = frame_or_series( - [1, 2, 3], index=MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]]) - ) - with tm.assert_produces_warning(FutureWarning, match="level"): - getattr(obj, reduction_functions)(level=0) - def test_reductions_skipna_none_raises( self, request, frame_or_series, reduction_functions ): @@ -1638,22 +1580,6 @@ def test_minmax_extensionarray(method, numeric_only): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("meth", ["max", "min", "sum", "mean", "median"]) -def test_groupby_regular_arithmetic_equivalent(meth): - # GH#40660 - df = DataFrame( - {"a": [pd.Timedelta(hours=6), pd.Timedelta(hours=7)], "b": [12.1, 13.3]} - ) - expected = df.copy() - - with tm.assert_produces_warning(FutureWarning): - result = getattr(df, meth)(level=0) - tm.assert_frame_equal(result, expected) - - result = getattr(df.groupby(level=0), meth)(numeric_only=False) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT]) def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index b385091c9ff51..a06304af7a2d0 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -610,9 +610,8 @@ def test_subclassed_count(self): list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] ), ) - with tm.assert_produces_warning(FutureWarning): - result = df.count(level=1) - assert isinstance(result, tm.SubclassedDataFrame) + result = df.count() + assert isinstance(result, tm.SubclassedSeries) df = tm.SubclassedDataFrame() result = df.count() diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 6bd9b8af766c3..a7551af68bc2b 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -190,12 +190,6 @@ pytest.param( (pd.DataFrame, frame_data, operator.methodcaller("count")), ), - pytest.param( - (pd.DataFrame, frame_mi_data, operator.methodcaller("count", level="A")), - marks=[ - pytest.mark.filterwarnings("ignore:Using the level keyword:FutureWarning"), - ], - ), pytest.param( (pd.DataFrame, frame_data, 
operator.methodcaller("nunique")), ), diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index 034514cb0bcfb..f7af4892635bb 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -70,13 +70,11 @@ def raw_frame(multiindex_dataframe_random_data): @pytest.mark.parametrize("op", AGG_FUNCTIONS) -@pytest.mark.parametrize("level", [0, 1]) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.filterwarnings("ignore:Using the level keyword:FutureWarning") @pytest.mark.filterwarnings("ignore:The default value of numeric_only:FutureWarning") -def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort): +def test_regression_allowlist_methods(raw_frame, op, axis, skipna, sort): # GH6944 # GH 17537 # explicitly test the allowlist methods @@ -86,19 +84,14 @@ def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort): frame = raw_frame.T if op in AGG_FUNCTIONS_WITH_SKIPNA: - grouped = frame.groupby(level=level, axis=axis, sort=sort) + grouped = frame.groupby("first", axis=axis, sort=sort) result = getattr(grouped, op)(skipna=skipna) - expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna) - if sort: - expected = expected.sort_index(axis=axis, level=level) - tm.assert_frame_equal(result, expected) else: - grouped = frame.groupby(level=level, axis=axis, sort=sort) + grouped = frame.groupby("first", axis=axis, sort=sort) result = getattr(grouped, op)() - expected = getattr(frame, op)(level=level, axis=axis) - if sort: - expected = expected.sort_index(axis=axis, level=level) - tm.assert_frame_equal(result, expected) + # Previously compared to frame.op(level=...), but level removed in 2.0 + # TODO(GH 49629): Assert something better + assert isinstance(result, DataFrame) def test_groupby_blocklist(df_letters): diff --git a/pandas/tests/groupby/test_api_consistency.py b/pandas/tests/groupby/test_api_consistency.py index 155f86c23e106..bd29f29719494 100644 --- a/pandas/tests/groupby/test_api_consistency.py +++ b/pandas/tests/groupby/test_api_consistency.py @@ -37,19 +37,19 @@ def test_frame_consistency(request, groupby_func): # Some of these may be purposeful inconsistencies between the APIs exclude_expected, exclude_result = set(), set() if groupby_func in ("any", "all"): - exclude_expected = {"kwargs", "bool_only", "level", "axis"} + exclude_expected = {"kwargs", "bool_only", "axis"} elif groupby_func in ("count",): - exclude_expected = {"numeric_only", "level", "axis"} + exclude_expected = {"numeric_only", "axis"} elif groupby_func in ("nunique",): exclude_expected = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "level", "skipna"} + exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"min_count", "engine", "engine_kwargs"} elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "level", "skipna"} + exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "level", "skipna"} + exclude_expected = {"axis", "kwargs", "skipna"} elif groupby_func in ("backfill", "bfill", "ffill", "pad"): exclude_expected = {"downcast", "inplace", "axis"} elif groupby_func in ("cummax", "cummin"): @@ -95,19 +95,17 @@ def test_series_consistency(request, groupby_func): # Some of these may 
be purposeful inconsistencies between the APIs exclude_expected, exclude_result = set(), set() if groupby_func in ("any", "all"): - exclude_expected = {"kwargs", "bool_only", "level", "axis"} - elif groupby_func in ("count",): - exclude_expected = {"level"} + exclude_expected = {"kwargs", "bool_only", "axis"} elif groupby_func in ("diff",): exclude_result = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "level", "skipna"} + exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"min_count", "engine", "engine_kwargs"} elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "level", "skipna"} + exclude_expected = {"axis", "kwargs", "skipna"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "level", "skipna"} + exclude_expected = {"axis", "kwargs", "skipna"} elif groupby_func in ("backfill", "bfill", "ffill", "pad"): exclude_expected = {"downcast", "inplace", "axis"} elif groupby_func in ("cummax", "cummin"): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 392910bd9e598..7fd52d3cf5bb8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1095,10 +1095,6 @@ def test_groupby_complex(): result = a.groupby(level=0).sum() tm.assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning): - result = a.sum(level=0) - tm.assert_series_equal(result, expected) - def test_groupby_complex_numbers(): # GH 17927 diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 8888e2687621d..fb4cba5ea40b3 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -672,30 +672,6 @@ def test_empty(self, method, unit, use_bottleneck, dtype): result = getattr(s, method)(min_count=2) assert isna(result) - @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) - def test_empty_multi(self, method, unit): - s = Series( - [1, np.nan, np.nan, np.nan], - index=pd.MultiIndex.from_product([("a", "b"), (0, 1)]), - ) - # 1 / 0 by default - with tm.assert_produces_warning(FutureWarning): - result = getattr(s, method)(level=0) - expected = Series([1, unit], index=["a", "b"]) - tm.assert_series_equal(result, expected) - - # min_count=0 - with tm.assert_produces_warning(FutureWarning): - result = getattr(s, method)(level=0, min_count=0) - expected = Series([1, unit], index=["a", "b"]) - tm.assert_series_equal(result, expected) - - # min_count=1 - with tm.assert_produces_warning(FutureWarning): - result = getattr(s, method)(level=0, min_count=1) - expected = Series([1, np.nan], index=["a", "b"]) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("method", ["mean", "var"]) @pytest.mark.parametrize("dtype", ["Float64", "Int64", "boolean"]) def test_ops_consistency_on_empty_nullable(self, method, dtype): @@ -932,7 +908,7 @@ def test_numpy_all_any(self, index_or_series): idx = Index([1, 2, 3]) assert np.all(idx) - def test_all_any_params(self): + def test_all_any_skipna(self): # Check skipna, with implicit 'object' dtype. s1 = Series([np.nan, True]) s2 = Series([np.nan, False]) @@ -941,20 +917,8 @@ def test_all_any_params(self): assert s2.any(skipna=False) assert not s2.any(skipna=True) - # Check level. 
+ def test_all_any_bool_only(self): s = Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2]) - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(s.all(level=0), Series([False, True, False])) - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(s.any(level=0), Series([False, True, True])) - - msg = "Option bool_only is not implemented with option level" - with pytest.raises(NotImplementedError, match=msg): - with tm.assert_produces_warning(FutureWarning): - s.any(bool_only=True, level=0) - with pytest.raises(NotImplementedError, match=msg): - with tm.assert_produces_warning(FutureWarning): - s.all(bool_only=True, level=0) # GH#47500 - test bool_only works assert s.any(bool_only=True) @@ -1009,22 +973,6 @@ def test_any_all_nullable_kleene_logic( result = getattr(ser, bool_agg_func)(skipna=skipna) assert (result is pd.NA and expected is pd.NA) or result == expected - @pytest.mark.parametrize( - "bool_agg_func,expected", - [("all", [False, True, False]), ("any", [False, True, True])], - ) - def test_any_all_boolean_level(self, bool_agg_func, expected): - # GH#33449 - ser = Series( - [False, False, True, True, False, True], - index=[0, 0, 1, 1, 2, 2], - dtype="boolean", - ) - with tm.assert_produces_warning(FutureWarning): - result = getattr(ser, bool_agg_func)(level=0) - expected = Series(expected, dtype="boolean") - tm.assert_series_equal(result, expected) - def test_any_axis1_bool_only(self): # GH#32432 df = DataFrame({"A": [True, False], "B": [1, 2]}) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index be40d7ca631eb..0dc68d78eebc9 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -256,14 +256,7 @@ def test_kurt(self): alt = lambda x: kurtosis(x, bias=False) self._check_stat_op("kurt", alt, string_series) - index = pd.MultiIndex( - levels=[["bar"], ["one", "two", "three"], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], - ) - s = Series(np.random.randn(6), index=index) - with tm.assert_produces_warning(FutureWarning): - tm.assert_almost_equal(s.kurt(), s.kurt(level=0)["bar"]) - + def test_kurt_corner(self): # test corner cases, kurt() returns NaN unless there's at least 4 # values min_N = 4 diff --git a/pandas/tests/series/methods/test_count.py b/pandas/tests/series/methods/test_count.py index 29fb6aa32bc7c..dfc531f63614f 100644 --- a/pandas/tests/series/methods/test_count.py +++ b/pandas/tests/series/methods/test_count.py @@ -1,69 +1,13 @@ import numpy as np -import pytest import pandas as pd from pandas import ( Categorical, - MultiIndex, Series, ) -import pandas._testing as tm class TestSeriesCount: - def test_count_level_series(self): - index = MultiIndex( - levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], - codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]], - ) - - ser = Series(np.random.randn(len(index)), index=index) - - with tm.assert_produces_warning(FutureWarning): - result = ser.count(level=0) - expected = ser.groupby(level=0).count() - tm.assert_series_equal( - result.astype("f8"), expected.reindex(result.index).fillna(0) - ) - - with tm.assert_produces_warning(FutureWarning): - result = ser.count(level=1) - expected = ser.groupby(level=1).count() - tm.assert_series_equal( - result.astype("f8"), expected.reindex(result.index).fillna(0) - ) - - def test_count_multiindex(self, series_with_multilevel_index): - ser = series_with_multilevel_index - - 
series = ser.copy() - series.index.names = ["a", "b"] - - with tm.assert_produces_warning(FutureWarning): - result = series.count(level="b") - with tm.assert_produces_warning(FutureWarning): - expect = ser.count(level=1).rename_axis("b") - tm.assert_series_equal(result, expect) - - with tm.assert_produces_warning(FutureWarning): - result = series.count(level="a") - with tm.assert_produces_warning(FutureWarning): - expect = ser.count(level=0).rename_axis("a") - tm.assert_series_equal(result, expect) - - msg = "Level x not found" - with pytest.raises(KeyError, match=msg): - with tm.assert_produces_warning(FutureWarning): - series.count("x") - - def test_count_level_without_multiindex(self): - ser = Series(range(3)) - - msg = "Series.count level is only valid with a MultiIndex" - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning): - ser.count(level=1) - def test_count(self, datetime_series): assert datetime_series.count() == len(datetime_series) @@ -71,18 +15,6 @@ def test_count(self, datetime_series): assert datetime_series.count() == np.isfinite(datetime_series).sum() - mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, np.nan, 1, 2]]) - ts = Series(np.arange(len(mi)), index=mi) - - with tm.assert_produces_warning(FutureWarning): - left = ts.count(level=1) - right = Series([2, 3, 1], index=[1, 2, np.nan]) - tm.assert_series_equal(left, right) - - ts.iloc[[0, 3, 5]] = np.nan - with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(ts.count(level=1), right - 1) - # GH#29478 with pd.option_context("use_inf_as_na", True): assert Series([pd.Timestamp("1990/1/1")]).count() == 1 diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index a552d9d84329f..e9d2877148c2b 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -2,10 +2,7 @@ import pytest import pandas as pd -from pandas import ( - MultiIndex, - Series, -) +from pandas import Series import pandas._testing as tm @@ -83,15 +80,6 @@ def test_prod_numpy16_bug(): assert not isinstance(result, Series) -def test_sum_with_level(): - obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) - - with tm.assert_produces_warning(FutureWarning): - result = obj.sum(level=0) - expected = Series([10.0], index=[2]) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("func", [np.any, np.all]) @pytest.mark.parametrize("kwargs", [{"keepdims": True}, {"out": object()}]) def test_validate_any_all_out_keepdims_raises(kwargs, func): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 7e428821a2d50..023411f486c6a 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -15,8 +15,7 @@ def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data): # axis=0 ymd = multiindex_year_month_day_dataframe_random_data - with tm.assert_produces_warning(FutureWarning): - month_sums = ymd.sum(level="month") + month_sums = ymd.groupby("month").sum() result = month_sums.reindex(ymd.index, level=1) expected = ymd.groupby(level="month").transform(np.sum) @@ -28,35 +27,11 @@ def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data): tm.assert_series_equal(result, expected, check_names=False) # axis=1 - with tm.assert_produces_warning(FutureWarning): - month_sums = ymd.T.sum(axis=1, level="month") + month_sums = ymd.T.groupby("month", axis=1).sum() result = month_sums.reindex(columns=ymd.index, level=1) expected = 
ymd.groupby(level="month").transform(np.sum).T tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("opname", ["sub", "add", "mul", "div"]) - def test_binops_level( - self, opname, multiindex_year_month_day_dataframe_random_data - ): - ymd = multiindex_year_month_day_dataframe_random_data - - op = getattr(DataFrame, opname) - with tm.assert_produces_warning(FutureWarning): - month_sums = ymd.sum(level="month") - result = op(ymd, month_sums, level="month") - - broadcasted = ymd.groupby(level="month").transform(np.sum) - expected = op(ymd, broadcasted) - tm.assert_frame_equal(result, expected) - - # Series - op = getattr(Series, opname) - result = op(ymd["A"], month_sums["A"], level="month") - broadcasted = ymd["A"].groupby(level="month").transform(np.sum) - expected = op(ymd["A"], broadcasted) - expected.name = "A" - tm.assert_series_equal(result, expected) - def test_reindex(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data @@ -155,106 +130,6 @@ def test_alignment(self): exp = x.reindex(exp_index) - y.reindex(exp_index) tm.assert_series_equal(res, exp) - @pytest.mark.parametrize("level", [0, 1]) - @pytest.mark.parametrize("skipna", [True, False]) - @pytest.mark.parametrize("sort", [True, False]) - def test_series_group_min_max( - self, all_numeric_reductions, level, skipna, sort, series_with_multilevel_index - ): - # GH 17537 - ser = series_with_multilevel_index - op = all_numeric_reductions - - grouped = ser.groupby(level=level, sort=sort) - # skipna=True - leftside = grouped.agg(lambda x: getattr(x, op)(skipna=skipna)) - with tm.assert_produces_warning(FutureWarning): - rightside = getattr(ser, op)(level=level, skipna=skipna) - if sort: - rightside = rightside.sort_index(level=level) - tm.assert_series_equal(leftside, rightside) - - @pytest.mark.parametrize("level", [0, 1]) - @pytest.mark.parametrize("axis", [0, 1]) - @pytest.mark.parametrize("skipna", [True, False]) - @pytest.mark.parametrize("sort", [True, False]) - def test_frame_group_ops( - self, - all_numeric_reductions, - level, - axis, - skipna, - sort, - multiindex_dataframe_random_data, - ): - # GH 17537 - frame = multiindex_dataframe_random_data - - frame.iloc[1, [1, 2]] = np.nan - frame.iloc[7, [0, 1]] = np.nan - - level_name = frame.index.names[level] - - if axis == 0: - frame = frame - else: - frame = frame.T - - grouped = frame.groupby(level=level, axis=axis, sort=sort) - - pieces = [] - op = all_numeric_reductions - - def aggf(x): - pieces.append(x) - return getattr(x, op)(skipna=skipna, axis=axis) - - leftside = grouped.agg(aggf) - with tm.assert_produces_warning(FutureWarning): - rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna) - if sort: - rightside = rightside.sort_index(level=level, axis=axis) - frame = frame.sort_index(level=level, axis=axis) - - # for good measure, groupby detail - level_index = frame._get_axis(axis).levels[level].rename(level_name) - - tm.assert_index_equal(leftside._get_axis(axis), level_index) - tm.assert_index_equal(rightside._get_axis(axis), level_index) - - tm.assert_frame_equal(leftside, rightside) - - @pytest.mark.parametrize("meth", ["var", "std"]) - def test_std_var_pass_ddof(self, meth): - index = MultiIndex.from_arrays( - [np.arange(5).repeat(10), np.tile(np.arange(10), 5)] - ) - df = DataFrame(np.random.randn(len(index), 5), index=index) - - ddof = 4 - alt = lambda x: getattr(x, meth)(ddof=ddof) - - with tm.assert_produces_warning(FutureWarning): - result = getattr(df[0], meth)(level=0, ddof=ddof) - expected = 
df[0].groupby(level=0).agg(alt) - tm.assert_series_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = getattr(df, meth)(level=0, ddof=ddof) - expected = df.groupby(level=0).agg(alt) - tm.assert_frame_equal(result, expected) - - def test_agg_multiple_levels( - self, multiindex_year_month_day_dataframe_random_data, frame_or_series - ): - ymd = multiindex_year_month_day_dataframe_random_data - ymd = tm.get_obj(ymd, frame_or_series) - - with tm.assert_produces_warning(FutureWarning): - result = ymd.sum(level=["year", "month"]) - expected = ymd.groupby(level=["year", "month"]).sum() - tm.assert_equal(result, expected) - def test_groupby_multilevel(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data From a38a34f97510894112832acfb313205374b3c53a Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 11 Nov 2022 13:30:14 -0500 Subject: [PATCH 36/39] DEPR: Enforce default of numeric_only=False in DataFrame methods (#49622) * DEPR: Enforce default of numeric_only=False * Remove unused functions * Add versionchanged * Add Series.rank to whatsnew * newline in docs --- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/core/common.py | 64 ---------------- pandas/core/frame.py | 83 +++++++-------------- pandas/core/generic.py | 40 ++-------- pandas/tests/apply/test_frame_transform.py | 46 ++++-------- pandas/tests/apply/test_series_apply.py | 4 +- pandas/tests/frame/methods/test_cov_corr.py | 33 ++++---- pandas/tests/frame/methods/test_quantile.py | 31 -------- pandas/tests/frame/methods/test_rank.py | 15 ++-- pandas/tests/frame/test_reductions.py | 39 +--------- 10 files changed, 74 insertions(+), 283 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index a2a20956e42bd..032bcf09244e5 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -555,6 +555,8 @@ Removal of prior version deprecations/changes - Removed ``na_sentinel`` argument from :func:`factorize`, :meth:`.Index.factorize`, and :meth:`.ExtensionArray.factorize` (:issue:`47157`) - Changed behavior of :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` so that ``group_keys`` is respected even if a transformer is detected (:issue:`34998`) - Enforced deprecation ``numeric_only=None`` (the default) in DataFrame reductions that would silently drop columns that raised; ``numeric_only`` now defaults to ``False`` (:issue:`41480`) +- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`) +- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/common.py b/pandas/core/common.py index c73c31c2a103b..8764ee0ea6ed7 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -25,7 +25,6 @@ cast, overload, ) -import warnings import numpy as np @@ -37,7 +36,6 @@ RandomState, T, ) -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -631,65 +629,3 @@ def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]: list of column names with the None values replaced. 
""" return [f"level_{i}" if name is None else name for i, name in enumerate(names)] - - -def resolve_numeric_only(numeric_only: bool | None | lib.NoDefault) -> bool: - """Determine the Boolean value of numeric_only. - - See GH#46560 for details on the deprecation. - - Parameters - ---------- - numeric_only : bool, None, or lib.no_default - Value passed to the method. - - Returns - ------- - Resolved value of numeric_only. - """ - if numeric_only is lib.no_default: - # Methods that behave like numeric_only=True and only got the numeric_only - # arg in 1.5.0 default to lib.no_default - result = True - elif numeric_only is None: - # Methods that had the numeric_only arg prior to 1.5.0 and try all columns - # first default to None - result = False - else: - result = numeric_only - return result - - -def deprecate_numeric_only_default( - cls: type, name: str, deprecate_none: bool = False -) -> None: - """Emit FutureWarning message for deprecation of numeric_only. - - See GH#46560 for details on the deprecation. - - Parameters - ---------- - cls : type - pandas type that is generating the warning. - name : str - Name of the method that is generating the warning. - deprecate_none : bool, default False - Whether to also warn about the deprecation of specifying ``numeric_only=None``. - """ - if name in ["all", "any"]: - arg_name = "bool_only" - else: - arg_name = "numeric_only" - - msg = ( - f"The default value of {arg_name} in {cls.__name__}.{name} is " - "deprecated. In a future version, it will default to False. " - ) - if deprecate_none: - msg += f"In addition, specifying '{arg_name}=None' is deprecated. " - msg += ( - f"Select only valid columns or specify the value of {arg_name} to silence " - "this warning." - ) - - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 507e14c5616a2..efad2edddf360 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -132,7 +132,6 @@ is_integer_dtype, is_iterator, is_list_like, - is_numeric_dtype, is_object_dtype, is_scalar, is_sequence, @@ -9938,7 +9937,7 @@ def corr( self, method: CorrelationMethod = "pearson", min_periods: int = 1, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. @@ -9959,14 +9958,13 @@ def corr( Minimum number of observations required per pair of columns to have a valid result. Currently only available for Pearson and Spearman correlation. - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 - .. deprecated:: 1.5.0 - The default value of ``numeric_only`` will be ``False`` in a future - version of pandas. + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. 
Returns ------- @@ -10006,11 +10004,7 @@ def corr( dogs 1.0 NaN cats NaN 1.0 """ # noqa:E501 - numeric_only_bool = com.resolve_numeric_only(numeric_only) - data = self._get_numeric_data() if numeric_only_bool else self - if numeric_only is lib.no_default and len(data.columns) < len(self.columns): - com.deprecate_numeric_only_default(type(self), "corr") - + data = self._get_numeric_data() if numeric_only else self cols = data.columns idx = cols.copy() mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) @@ -10057,7 +10051,7 @@ def cov( self, min_periods: int | None = None, ddof: int | None = 1, - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> DataFrame: """ Compute pairwise covariance of columns, excluding NA/null values. @@ -10089,14 +10083,13 @@ def cov( .. versionadded:: 1.1.0 - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 - .. deprecated:: 1.5.0 - The default value of ``numeric_only`` will be ``False`` in a future - version of pandas. + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. Returns ------- @@ -10167,11 +10160,7 @@ def cov( b NaN 1.248003 0.191417 c -0.150812 0.191417 0.895202 """ - numeric_only_bool = com.resolve_numeric_only(numeric_only) - data = self._get_numeric_data() if numeric_only_bool else self - if numeric_only is lib.no_default and len(data.columns) < len(self.columns): - com.deprecate_numeric_only_default(type(self), "cov") - + data = self._get_numeric_data() if numeric_only else self cols = data.columns idx = cols.copy() mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) @@ -10195,7 +10184,7 @@ def corrwith( axis: Axis = 0, drop: bool = False, method: CorrelationMethod = "pearson", - numeric_only: bool | lib.NoDefault = lib.no_default, + numeric_only: bool = False, ) -> Series: """ Compute pairwise correlation. @@ -10223,14 +10212,13 @@ def corrwith( * callable: callable with input two 1d ndarrays and returning a float. - numeric_only : bool, default True + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. .. versionadded:: 1.5.0 - .. deprecated:: 1.5.0 - The default value of ``numeric_only`` will be ``False`` in a future - version of pandas. + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. 
Returns ------- @@ -10263,15 +10251,12 @@ def corrwith( dtype: float64 """ # noqa:E501 axis = self._get_axis_number(axis) - numeric_only_bool = com.resolve_numeric_only(numeric_only) - this = self._get_numeric_data() if numeric_only_bool else self - if numeric_only is lib.no_default and len(this.columns) < len(self.columns): - com.deprecate_numeric_only_default(type(self), "corrwith") + this = self._get_numeric_data() if numeric_only else self if isinstance(other, Series): return this.apply(lambda x: other.corr(x, method=method), axis=axis) - if numeric_only_bool: + if numeric_only: other = other._get_numeric_data() left, right = this.align(other, join="inner", copy=False) @@ -10285,14 +10270,14 @@ def corrwith( right = right + left * 0 # demeaned data - ldem = left - left.mean(numeric_only=numeric_only_bool) - rdem = right - right.mean(numeric_only=numeric_only_bool) + ldem = left - left.mean(numeric_only=numeric_only) + rdem = right - right.mean(numeric_only=numeric_only) num = (ldem * rdem).sum() dom = ( (left.count() - 1) - * left.std(numeric_only=numeric_only_bool) - * right.std(numeric_only=numeric_only_bool) + * left.std(numeric_only=numeric_only) + * right.std(numeric_only=numeric_only) ) correl = num / dom @@ -10484,12 +10469,6 @@ def _get_data() -> DataFrame: # float64, see test_apply_funcs_over_empty out = out.astype(np.float64) - if numeric_only is None and out.shape[0] != df.shape[1]: - # columns have been dropped GH#41480 - com.deprecate_numeric_only_default( - type(self), name, deprecate_none=True - ) - return out assert not numeric_only and axis == 1 @@ -10739,7 +10718,7 @@ def quantile( self, q: float = ..., axis: Axis = ..., - numeric_only: bool | lib.NoDefault = ..., + numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., ) -> Series: ... @@ -10749,7 +10728,7 @@ def quantile( self, q: AnyArrayLike | Sequence[float], axis: Axis = ..., - numeric_only: bool | lib.NoDefault = ..., + numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., ) -> Series | DataFrame: ... @@ -10759,7 +10738,7 @@ def quantile( self, q: float | AnyArrayLike | Sequence[float] = ..., axis: Axis = ..., - numeric_only: bool | lib.NoDefault = ..., + numeric_only: bool = ..., interpolation: QuantileInterpolation = ..., ) -> Series | DataFrame: ... @@ -10768,7 +10747,7 @@ def quantile( self, q: float | AnyArrayLike | Sequence[float] = 0.5, axis: Axis = 0, - numeric_only: bool | lib.NoDefault = no_default, + numeric_only: bool = False, interpolation: QuantileInterpolation = "linear", method: Literal["single", "table"] = "single", ) -> Series | DataFrame: @@ -10781,13 +10760,11 @@ def quantile( Value between 0 <= q <= 1, the quantile(s) to compute. axis : {0 or 'index', 1 or 'columns'}, default 0 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - numeric_only : bool, default True - If False, the quantile of datetime and timedelta data will be - computed as well. + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. - .. deprecated:: 1.5.0 - The default value of ``numeric_only`` will be ``False`` in a future - version of pandas. + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. 
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} This optional parameter specifies the interpolation method to use, @@ -10859,10 +10836,6 @@ def quantile( """ validate_percentile(q) axis = self._get_axis_number(axis) - any_not_numeric = any(not is_numeric_dtype(x) for x in self.dtypes) - if numeric_only is no_default and any_not_numeric: - com.deprecate_numeric_only_default(type(self), "quantile") - numeric_only = com.resolve_numeric_only(numeric_only) if not is_list_like(q): # BlockManager.quantile expects listlike, so we wrap and unwrap here diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fa6ede80b6676..b55bee3175744 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8854,7 +8854,7 @@ def rank( self: NDFrameT, axis: Axis = 0, method: str = "average", - numeric_only: bool_t | None | lib.NoDefault = lib.no_default, + numeric_only: bool_t = False, na_option: str = "keep", ascending: bool_t = True, pct: bool_t = False, @@ -8879,8 +8879,12 @@ def rank( * first: ranks assigned in order they appear in the array * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : bool, optional + numeric_only : bool, default False For DataFrame objects, rank only numeric columns if set to True. + + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. + na_option : {'keep', 'top', 'bottom'}, default 'keep' How to rank NaN values: @@ -8954,20 +8958,6 @@ def rank( 3 spider 8.0 4.0 4.0 4.0 1.000 4 snake NaN NaN NaN 5.0 NaN """ - warned = False - if numeric_only is None: - # GH#45036 - warnings.warn( - f"'numeric_only=None' in {type(self).__name__}.rank is deprecated " - "and will raise in a future version. Pass either 'True' or " - "'False'. 'False' will be the default.", - FutureWarning, - stacklevel=find_stack_level(), - ) - warned = True - elif numeric_only is lib.no_default: - numeric_only = None - axis_int = self._get_axis_number(axis) if na_option not in {"keep", "top", "bottom"}: @@ -9003,24 +8993,6 @@ def ranker(data): ranks_obj = self._constructor(ranks, **data._construct_axes_dict()) return ranks_obj.__finalize__(self, method="rank") - # if numeric_only is None, and we can't get anything, we try with - # numeric_only=True - if numeric_only is None: - try: - return ranker(self) - except TypeError: - numeric_only = True - if not warned: - # Only warn here if we didn't already issue a warning above - # GH#45036 - warnings.warn( - f"Dropping of nuisance columns in {type(self).__name__}.rank " - "is deprecated; in a future version this will raise TypeError. 
" - "Select only valid columns before calling rank.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if numeric_only: if self.ndim == 1 and not is_numeric_dtype(self.dtype): # GH#47500 diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 73a52534dd0d2..8e385de0b48e0 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -130,10 +130,6 @@ def func(x): frame_kernels_raise = [x for x in frame_transform_kernels if x not in wont_fail] -@pytest.mark.filterwarnings( - "ignore:Calling Series.rank with numeric_only:FutureWarning" -) -@pytest.mark.filterwarnings("ignore:Dropping of nuisance:FutureWarning") @pytest.mark.parametrize("op", [*frame_kernels_raise, lambda x: x + 1]) def test_transform_bad_dtype(op, frame_or_series, request): # GH 35964 @@ -144,17 +140,13 @@ def test_transform_bad_dtype(op, frame_or_series, request): obj = DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms obj = tm.get_obj(obj, frame_or_series) - if op == "rank": - error = ValueError - msg = "Transform function failed" - else: - error = TypeError - msg = "|".join( - [ - "not supported between instances of 'type' and 'type'", - "unsupported operand type", - ] - ) + error = TypeError + msg = "|".join( + [ + "not supported between instances of 'type' and 'type'", + "unsupported operand type", + ] + ) with pytest.raises(error, match=msg): obj.transform(op) @@ -166,12 +158,6 @@ def test_transform_bad_dtype(op, frame_or_series, request): obj.transform({"A": [op]}) -@pytest.mark.filterwarnings( - "ignore:Dropping of nuisance columns in Series.rank:FutureWarning" -) -@pytest.mark.filterwarnings( - "ignore:Calling Series.rank with numeric_only:FutureWarning" -) @pytest.mark.parametrize("op", frame_kernels_raise) def test_transform_failure_typeerror(request, op): # GH 35964 @@ -183,17 +169,13 @@ def test_transform_failure_typeerror(request, op): # Using object makes most transform kernels fail df = DataFrame({"A": 3 * [object], "B": [1, 2, 3]}) - if op == "rank": - error = ValueError - msg = "Transform function failed" - else: - error = TypeError - msg = "|".join( - [ - "not supported between instances of 'type' and 'type'", - "unsupported operand type", - ] - ) + error = TypeError + msg = "|".join( + [ + "not supported between instances of 'type' and 'type'", + "unsupported operand type", + ] + ) with pytest.raises(error, match=msg): df.transform([op]) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index e0d3510ac3865..5986f1f6cf51d 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -275,8 +275,6 @@ def test_transform(string_series): tm.assert_series_equal(result.reindex_like(expected), expected) -@pytest.mark.filterwarnings("ignore:Calling Series.rank:FutureWarning") -@pytest.mark.filterwarnings("ignore:Dropping of nuisance:FutureWarning") @pytest.mark.parametrize("op", series_transform_kernels) def test_transform_partial_failure(op, request): # GH 35964 @@ -288,7 +286,7 @@ def test_transform_partial_failure(op, request): # Using object makes most transform kernels fail ser = Series(3 * [object]) - if op in ("fillna", "ngroup", "rank"): + if op in ("fillna", "ngroup"): error = ValueError msg = "Transform function failed" else: diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 445b90327ed2c..d7333ce03c215 100644 --- 
a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -40,11 +40,10 @@ def test_cov(self, float_frame, float_string_frame): expected = frame["A"].cov(frame["C"]) tm.assert_almost_equal(result["A"]["C"], expected) - # exclude non-numeric types - with tm.assert_produces_warning( - FutureWarning, match="The default value of numeric_only" - ): - result = float_string_frame.cov() + # fails on non-numeric types + with pytest.raises(ValueError, match="could not convert string to float"): + float_string_frame.cov() + result = float_string_frame.cov(numeric_only=True) expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() tm.assert_frame_equal(result, expected) @@ -118,11 +117,9 @@ def test_corr_scipy_method(self, float_frame, method): # --------------------------------------------------------------------- def test_corr_non_numeric(self, float_string_frame): - # exclude non-numeric types - with tm.assert_produces_warning( - FutureWarning, match="The default value of numeric_only" - ): - result = float_string_frame.corr() + with pytest.raises(ValueError, match="could not convert string to float"): + float_string_frame.corr() + result = float_string_frame.corr(numeric_only=True) expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() tm.assert_frame_equal(result, expected) @@ -218,7 +215,7 @@ def test_corr_item_cache(self): ser = df["A"] # populate item_cache assert len(df._mgr.arrays) == 2 # i.e. 2 blocks - _ = df.corr() + _ = df.corr(numeric_only=True) # Check that the corr didn't break link between ser and df ser.values[0] = 99 @@ -313,17 +310,15 @@ def test_corrwith_with_objects(self): df1["obj"] = "foo" df2["obj"] = "bar" - with tm.assert_produces_warning( - FutureWarning, match="The default value of numeric_only" - ): - result = df1.corrwith(df2) + with pytest.raises(TypeError, match="Could not convert"): + df1.corrwith(df2) + result = df1.corrwith(df2, numeric_only=True) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match="The default value of numeric_only" - ): - result = df1.corrwith(df2, axis=1) + with pytest.raises(TypeError, match="unsupported operand type"): + df1.corrwith(df2, axis=1) + result = df1.corrwith(df2, axis=1, numeric_only=True) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index b4661a92c8275..8096af757d3cf 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -25,37 +25,6 @@ def interp_method(request): class TestDataFrameQuantile: - @pytest.mark.parametrize( - "non_num_col", - [ - pd.date_range("2014-01-01", periods=3, freq="m"), - ["a", "b", "c"], - [DataFrame, Series, Timestamp], - ], - ) - def test_numeric_only_default_false_warning( - self, non_num_col, interp_method, request, using_array_manager - ): - # GH #7308 - interpolation, method = interp_method - df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}) - df["C"] = non_num_col - - expected = Series( - [2.0, 3.0], - index=["A", "B"], - name=0.5, - ) - if interpolation == "nearest": - expected = expected.astype(np.int64) - if method == "table" and using_array_manager: - request.node.add_marker( - pytest.mark.xfail(reason="Axis name incorrectly set.") - ) - with tm.assert_produces_warning(FutureWarning, match="numeric_only"): - result = 
df.quantile(0.5, interpolation=interpolation, method=method) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( "df,expected", [ diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 1f5cb95885004..5f648c76d0aa4 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -136,12 +136,9 @@ def test_rank_mixed_frame(self, float_string_frame): float_string_frame["datetime"] = datetime.now() float_string_frame["timedelta"] = timedelta(days=1, seconds=1) - with tm.assert_produces_warning(FutureWarning, match="numeric_only=None"): - float_string_frame.rank(numeric_only=None) - with tm.assert_produces_warning(FutureWarning, match="Dropping of nuisance"): - result = float_string_frame.rank(1) - expected = float_string_frame.rank(1, numeric_only=True) - tm.assert_frame_equal(result, expected) + float_string_frame.rank(numeric_only=False) + with pytest.raises(TypeError, match="not supported between instances of"): + float_string_frame.rank(axis=1) @td.skip_if_no_scipy def test_rank_na_option(self, float_frame): @@ -491,7 +488,7 @@ def test_rank_object_first(self, frame_or_series, na_option, ascending, expected ) def test_rank_mixed_axis_zero(self, data, expected): df = DataFrame(data) - msg = "Dropping of nuisance columns" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.rank() + with pytest.raises(TypeError, match="'<' not supported between instances of"): + df.rank() + result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index b7474060a7e8a..fb85978921393 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1,13 +1,11 @@ from datetime import timedelta from decimal import Decimal -import inspect import re from dateutil.tz import tzlocal import numpy as np import pytest -from pandas._libs import lib from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -1623,40 +1621,7 @@ def test_reduction_axis_none_deprecation(method): [ "corr", "corrwith", - "count", "cov", - "mode", - "quantile", - ], -) -def test_numeric_only_deprecation(kernel): - # GH#46852 - df = DataFrame({"a": [1, 2, 3], "b": object}) - args = (df,) if kernel == "corrwith" else () - signature = inspect.signature(getattr(DataFrame, kernel)) - default = signature.parameters["numeric_only"].default - assert default is not True - - if default is None or default is lib.no_default: - expected = getattr(df[["a"]], kernel)(*args) - warn = FutureWarning - else: - # default must be False and works on any nuisance columns - expected = getattr(df, kernel)(*args) - if kernel == "mode": - assert "b" in expected.columns - else: - assert "b" in expected.index - warn = None - msg = f"The default value of numeric_only in DataFrame.{kernel}" - with tm.assert_produces_warning(warn, match=msg): - result = getattr(df, kernel)(*args) - tm.assert_equal(result, expected) - - -@pytest.mark.parametrize( - "kernel", - [ "idxmax", "idxmin", "kurt", @@ -1665,6 +1630,7 @@ def test_numeric_only_deprecation(kernel): "median", "min", "prod", + "quantile", "sem", "skew", "std", @@ -1675,6 +1641,7 @@ def test_numeric_only_deprecation(kernel): def test_fails_on_non_numeric(kernel): # GH#46852 df = DataFrame({"a": [1, 2, 3], "b": object}) + args = (df,) if kernel == "corrwith" else () msg = "|".join( [ "not allowed for this dtype", @@ -1685,4 +1652,4 
@@ def test_fails_on_non_numeric(kernel): ] ) with pytest.raises(TypeError, match=msg): - getattr(df, kernel)() + getattr(df, kernel)(*args) From 0541b8931b25c741bc6284eb76faa0484187e9b7 Mon Sep 17 00:00:00 2001 From: Thierry Moisan Date: Fri, 11 Nov 2022 15:53:58 -0500 Subject: [PATCH 37/39] STYLE: fix pylint reimported warnings (#49645) * STYLE: fix pylint reimported warnings * fixup! STYLE: fix pylint reimported warnings --- pandas/_testing/__init__.py | 2 -- pandas/_testing/contexts.py | 1 - pandas/core/generic.py | 2 -- pandas/io/sql.py | 1 - pandas/plotting/_matplotlib/core.py | 2 -- pandas/tests/groupby/test_grouping.py | 2 -- pandas/tests/indexing/test_loc.py | 6 +----- pandas/tests/io/excel/test_readers.py | 3 --- pandas/tests/io/formats/test_format.py | 17 ++++------------- pandas/tests/io/json/test_normalize.py | 2 +- pandas/tests/reductions/test_reductions.py | 2 -- pandas/tests/reshape/concat/test_datetimes.py | 2 -- .../tests/series/accessors/test_dt_accessor.py | 6 ------ .../offsets/test_custom_business_month.py | 2 -- pyproject.toml | 1 - 15 files changed, 6 insertions(+), 45 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 6cce1137e707b..64f5d97f588c5 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -610,8 +610,6 @@ def makeCustomIndex( for i in range(nlevels): def keyfunc(x): - import re - numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") return [int(num) for num in numeric_tuple] diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index ee94c1d3aae0c..e5f716c62eca7 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -62,7 +62,6 @@ def set_timezone(tz: str) -> Generator[None, None, None]: ... 'EST' """ - import os import time def setTZ(tz) -> None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b55bee3175744..6288ebe77c8c0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9018,8 +9018,6 @@ def compare( keep_equal: bool_t = False, result_names: Suffixes = ("self", "other"), ): - from pandas.core.reshape.concat import concat - if type(self) is not type(other): cls_self, cls_other = type(self).__name__, type(other).__name__ raise TypeError( diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 591fa25bd36d1..a7d1da69e2729 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -741,7 +741,6 @@ def pandasSQL_builder(con, schema: str | None = None) -> SQLDatabase | SQLiteDat provided parameters. 
""" import sqlite3 - import warnings if isinstance(con, sqlite3.Connection) or con is None: return SQLiteDatabase(con) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index dea5dbd33bbdf..27603f7d987d2 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1655,8 +1655,6 @@ def _start_base(self): return self.bottom def _make_plot(self) -> None: - import matplotlib as mpl - colors = self._get_colors() ncolors = len(colors) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e3b7ad8f78750..9659b4aa5f45c 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -393,8 +393,6 @@ def test_groupby_grouper(self, df): def test_groupby_dict_mapping(self): # GH #679 - from pandas import Series - s = Series({"T1": 5}) result = s.groupby({"T1": "T2"}).agg(sum) expected = s.groupby(["T2"]).agg(sum) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 0068a0a0ded67..26eb7532adfa4 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -953,11 +953,7 @@ def test_loc_coercion(self): def test_loc_coercion2(self): # GH#12045 - import datetime - - df = DataFrame( - {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} - ) + df = DataFrame({"date": [datetime(2012, 1, 1), datetime(1012, 1, 2)]}) expected = df.dtypes result = df.iloc[[0]] diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index ee2a8f518cd56..bff4c98fe2842 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -882,10 +882,7 @@ def test_read_from_file_url(self, read_ext, datapath): tm.assert_frame_equal(url_table, local_table) def test_read_from_pathlib_path(self, read_ext): - # GH12655 - from pathlib import Path - str_path = "test1" + read_ext expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f870ef25991df..640c686bb56ca 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -5,6 +5,7 @@ from datetime import ( datetime, time, + timedelta, ) from io import StringIO import itertools @@ -994,12 +995,10 @@ def test_truncate_with_different_dtypes(self): # when truncated the dtypes of the splits can differ # 11594 - import datetime - s = Series( - [datetime.datetime(2012, 1, 1)] * 10 - + [datetime.datetime(1012, 1, 2)] - + [datetime.datetime(2012, 1, 3)] * 10 + [datetime(2012, 1, 1)] * 10 + + [datetime(1012, 1, 2)] + + [datetime(2012, 1, 3)] * 10 ) with option_context("display.max_rows", 8): @@ -1250,8 +1249,6 @@ def test_long_series(self): dtype="int64", ) - import re - str_rep = str(s) nmatches = len(re.findall("dtype", str_rep)) assert nmatches == 1 @@ -2445,12 +2442,6 @@ def test_datetimeindex_highprecision(self, start_date): assert start_date in result def test_timedelta64(self): - - from datetime import ( - datetime, - timedelta, - ) - Series(np.array([1100, 20], dtype="timedelta64[ns]")).to_string() s = Series(date_range("2012-1-1", periods=3, freq="D")) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 231228ef6c0af..986c0039715a6 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -866,7 +866,7 @@ def test_with_large_max_level(self): def 
test_deprecated_import(self): with tm.assert_produces_warning(FutureWarning): - from pandas.io.json import json_normalize + from pandas.io.json import json_normalize # pylint: disable=reimported recs = [{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}] json_normalize(recs) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index fb4cba5ea40b3..d8b9082ec318a 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -866,8 +866,6 @@ def test_idxmax(self): allna = string_series * np.nan assert isna(allna.idxmax()) - from pandas import date_range - s = Series(date_range("20130102", periods=6)) result = s.idxmax() assert result == 5 diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 93d212d0a581d..f16358813488e 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -22,8 +22,6 @@ class TestDatetimeConcat: def test_concat_datetime64_block(self): - from pandas.core.indexes.datetimes import date_range - rng = date_range("1/1/2000", periods=10) df = DataFrame({"time": rng}) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 1e929cd43842b..689c8ba845a6c 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -632,12 +632,6 @@ def test_strftime_all_nat(self, data): tm.assert_series_equal(result, expected) def test_valid_dt_with_missing_values(self): - - from datetime import ( - date, - time, - ) - # GH 8689 ser = Series(date_range("20130101", periods=5, freq="D")) ser.iloc[2] = pd.NaT diff --git a/pandas/tests/tseries/offsets/test_custom_business_month.py b/pandas/tests/tseries/offsets/test_custom_business_month.py index 36c690e89256d..bc9f7f3f511b8 100644 --- a/pandas/tests/tseries/offsets/test_custom_business_month.py +++ b/pandas/tests/tseries/offsets/test_custom_business_month.py @@ -400,8 +400,6 @@ def test_holidays(self): @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning") def test_datetimeindex(self): - from pandas.tseries.holiday import USFederalHolidayCalendar - hcal = USFederalHolidayCalendar() freq = CBMonthEnd(calendar=hcal) diff --git a/pyproject.toml b/pyproject.toml index 71b1f44dbff6f..b436b29c03c84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -153,7 +153,6 @@ disable = [ "raise-missing-from", "redefined-builtin", "redefined-outer-name", - "reimported", "self-assigning-variable", "self-cls-assignment", "signature-differs", From 133c5f043ac673fe09b8df18000e04f8702b696b Mon Sep 17 00:00:00 2001 From: codamuse Date: Fri, 11 Nov 2022 22:46:28 -0500 Subject: [PATCH 38/39] remove vestiges of MultiIndex grouper --- pandas/core/groupby/grouper.py | 20 ++------------------ pandas/core/indexes/base.py | 24 +++++------------------- 2 files changed, 7 insertions(+), 37 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 688dcb44c31f3..d363225fc036b 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -480,11 +480,7 @@ def __init__( index_level = index.get_level_values(ilevel) else: index_level = index - ( - self.grouping_vector, # Index - self._codes, - self._group_index, - ) = index_level._get_grouper_for_level(mapper, dropna=dropna) + self.grouping_index = index_level._get_grouper_for_level(mapper) # a passed Grouper like, directly get the 
grouper in the same way # as single grouper groupby, use the group_info to get codes @@ -600,10 +596,6 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @property def codes(self) -> npt.NDArray[np.signedinteger]: - if self._codes is not None: - # _codes is set in __init__ for MultiIndex cases - return self._codes - return self._codes_and_uniques[0] @cache_readonly @@ -612,11 +604,7 @@ def group_arraylike(self) -> ArrayLike: Analogous to result_index, but holding an ArrayLike to ensure we can retain ExtensionDtypes. """ - if self._group_index is not None: - # _group_index is set in __init__ for MultiIndex cases - return self._group_index._values - - elif self._all_grouper is not None: + if self._all_grouper is not None: # retain dtype for categories, including unobserved ones return self.result_index._values @@ -636,10 +624,6 @@ def result_index(self) -> Index: @cache_readonly def group_index(self) -> Index: - if self._group_index is not None: - # _group_index is set in __init__ for MultiIndex cases - return self._group_index - uniques = self._codes_and_uniques[1] return Index._with_infer(uniques, name=self.name) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e7b1ad1a48bd6..d07d451fe7383 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2107,42 +2107,28 @@ def _drop_level_numbers(self, levnums: list[int]): @final def _get_grouper_for_level( self, - mapper, - *, - level=None, - dropna: bool = True, - ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]: + mapper + ) -> Index: """ - Get index grouper corresponding to an index level + Get index grouper Parameters ---------- mapper: Group mapping function or None Function mapping index values to groups - level : int or None - Index level, positional - dropna : bool - dropna from groupby Returns ------- grouper : Index Index of values to group on. - labels : ndarray of int or None - Array of locations in level_index. - uniques : Index or None - Index of unique values for level. 
""" if self._is_multi: raise NotImplementedError("Index grouper isn't supported for MultiIndex") - assert level is None or level == 0 if mapper is None: - grouper = self + return self else: - grouper = self.map(mapper) - - return grouper, None, None + return self.map(mapper) # -------------------------------------------------------------------- # Introspection Methods From 338b8f67987e74bc15204247ceab6f6583093566 Mon Sep 17 00:00:00 2001 From: codamuse Date: Fri, 11 Nov 2022 23:41:46 -0500 Subject: [PATCH 39/39] revert broader changes on CI fails --- pandas/core/groupby/grouper.py | 22 +++++++++++++++++++--- pandas/core/indexes/base.py | 10 ++++++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d363225fc036b..d7fce42cff7b7 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -480,7 +480,11 @@ def __init__( index_level = index.get_level_values(ilevel) else: index_level = index - self.grouping_index = index_level._get_grouper_for_level(mapper) + ( + self.grouping_vector, # Index + self._codes, + self._group_index, + ) = index_level._get_grouper_for_level(mapper) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes @@ -596,6 +600,10 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: @property def codes(self) -> npt.NDArray[np.signedinteger]: + if self._codes is not None: + # _codes is set in __init__ for MultiIndex cases + return self._codes + return self._codes_and_uniques[0] @cache_readonly @@ -604,7 +612,11 @@ def group_arraylike(self) -> ArrayLike: Analogous to result_index, but holding an ArrayLike to ensure we can retain ExtensionDtypes. """ - if self._all_grouper is not None: + if self._group_index is not None: + # _group_index is set in __init__ for MultiIndex cases + return self._group_index._values + + elif self._all_grouper is not None: # retain dtype for categories, including unobserved ones return self.result_index._values @@ -624,6 +636,10 @@ def result_index(self) -> Index: @cache_readonly def group_index(self) -> Index: + if self._group_index is not None: + # _group_index is set in __init__ for MultiIndex cases + return self._group_index + uniques = self._codes_and_uniques[1] return Index._with_infer(uniques, name=self.name) @@ -639,7 +655,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if self._observed: ucodes = algorithms.unique1d(cat.codes) ucodes = ucodes[ucodes != -1] - if self._sort: + if self._sort or cat.ordered: ucodes = np.sort(ucodes) else: ucodes = np.arange(len(categories)) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d07d451fe7383..f6a505870ce17 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2108,9 +2108,9 @@ def _drop_level_numbers(self, levnums: list[int]): def _get_grouper_for_level( self, mapper - ) -> Index: + ) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]: """ - Get index grouper + Get index grouper corresponding to an index level Parameters ---------- @@ -2126,9 +2126,11 @@ def _get_grouper_for_level( raise NotImplementedError("Index grouper isn't supported for MultiIndex") if mapper is None: - return self + grouper = self else: - return self.map(mapper) + grouper = self.map(mapper) + + return grouper, None, None # -------------------------------------------------------------------- # Introspection Methods