From 869df9ed13afda995fa20e3bfb1294294658eb45 Mon Sep 17 00:00:00 2001 From: jmarin Date: Mon, 26 Feb 2024 23:15:18 +0100 Subject: [PATCH 1/8] Ensure dataframe preserves categorical index in constructor with categorical series --- pandas/core/frame.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e5d424b15e69e..9ebdc230bc9f8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -808,6 +808,21 @@ def __init__( if len(data) > 0: if is_dataclass(data[0]): data = dataclasses_to_dicts(data) + + # Check if all elements in data are Series with categorical indices + if all( + isinstance(item, Series) + and isinstance(item.index, pandas.CategoricalIndex) + for item in data + ): + all_categorical = True + # Combine all categories + categories = pandas.CategoricalIndex( + np.unique(np.concatenate([s.index.categories for s in data])) + ) + else: + all_categorical = False + if not isinstance(data, np.ndarray) and treat_as_nested(data): # exclude ndarray as we may have cast it a few lines above if columns is not None: @@ -820,6 +835,13 @@ def __init__( index, # type: ignore[arg-type] dtype, ) + + if all_categorical: + # Ensure columns are CategoricalIndex + columns = pandas.CategoricalIndex( + columns, categories=categories, ordered=True + ) + mgr = arrays_to_mgr( arrays, columns, From 2e4fb2f94677d6651cc10b11e34881198a5be9e0 Mon Sep 17 00:00:00 2001 From: jmarin Date: Sat, 2 Mar 2024 01:10:00 +0100 Subject: [PATCH 2/8] Modify union to properly handle categoricalIndex --- pandas/core/frame.py | 20 -------------------- pandas/core/indexes/base.py | 13 ++++++++++++- pandas/tests/frame/test_constructors.py | 2 +- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9ebdc230bc9f8..fa29af99c9cce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -809,20 +809,6 @@ def __init__( if is_dataclass(data[0]): data = dataclasses_to_dicts(data) - # Check if all elements in data are Series with categorical indices - if all( - isinstance(item, Series) - and isinstance(item.index, pandas.CategoricalIndex) - for item in data - ): - all_categorical = True - # Combine all categories - categories = pandas.CategoricalIndex( - np.unique(np.concatenate([s.index.categories for s in data])) - ) - else: - all_categorical = False - if not isinstance(data, np.ndarray) and treat_as_nested(data): # exclude ndarray as we may have cast it a few lines above if columns is not None: @@ -836,12 +822,6 @@ def __init__( dtype, ) - if all_categorical: - # Ensure columns are CategoricalIndex - columns = pandas.CategoricalIndex( - columns, categories=categories, ordered=True - ) - mgr = arrays_to_mgr( arrays, columns, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c17e01b85fa84..bd497203c9f02 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -212,6 +212,7 @@ PeriodArray, ) + __all__ = ["Index"] _unsortable_types = frozenset(("mixed", "mixed-integer")) @@ -2912,7 +2913,17 @@ def union(self, other, sort=None): self._assert_can_do_setop(other) other, result_name = self._convert_can_do_setop(other) - if self.dtype != other.dtype: + if isinstance(self.dtype, CategoricalDtype) and isinstance( + other.dtype, CategoricalDtype + ): + # Unite both categories + both_categories = np.union1d(self.categories, other.categories) + + # Convert both indexes to have the same categories + self = self.set_categories(both_categories) + other = other.set_categories(both_categories) + + elif self.dtype != other.dtype: if ( isinstance(self, ABCMultiIndex) and not is_object_dtype(_unpack_nested_dtype(other)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 7d1a5b4492740..20c940ad0c6dd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2349,7 +2349,7 @@ def test_construct_with_two_categoricalindex_series(self): result = DataFrame([s1, s2]) expected = DataFrame( np.array([[39, 6, 4, np.nan, np.nan], [152.0, 242.0, 150.0, 2.0, 2.0]]), - columns=["female", "male", "unknown", "f", "m"], + columns=CategoricalIndex(["female", "male", "unknown", "f", "m"]), ) tm.assert_frame_equal(result, expected) From 4ebc935dbe4f3a5cba4d6ecc8bfc43feec893654 Mon Sep 17 00:00:00 2001 From: jmarin Date: Sat, 2 Mar 2024 01:10:00 +0100 Subject: [PATCH 3/8] Modify union to properly handle categoricalIndex --- pandas/core/frame.py | 20 -------------------- pandas/core/indexes/base.py | 16 ++++++++++++++-- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/indexes/test_setops.py | 4 ++-- pandas/tests/reshape/concat/test_append.py | 2 ++ 6 files changed, 20 insertions(+), 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9ebdc230bc9f8..fa29af99c9cce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -809,20 +809,6 @@ def __init__( if is_dataclass(data[0]): data = dataclasses_to_dicts(data) - # Check if all elements in data are Series with categorical indices - if all( - isinstance(item, Series) - and isinstance(item.index, pandas.CategoricalIndex) - for item in data - ): - all_categorical = True - # Combine all categories - categories = pandas.CategoricalIndex( - np.unique(np.concatenate([s.index.categories for s in data])) - ) - else: - all_categorical = False - if not isinstance(data, np.ndarray) and treat_as_nested(data): # exclude ndarray as we may have cast it a few lines above if columns is not None: @@ -836,12 +822,6 @@ def __init__( dtype, ) - if all_categorical: - # Ensure columns are CategoricalIndex - columns = pandas.CategoricalIndex( - columns, categories=categories, ordered=True - ) - mgr = arrays_to_mgr( arrays, columns, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c17e01b85fa84..f446f3a55bd0f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -212,6 +212,7 @@ PeriodArray, ) + __all__ = ["Index"] _unsortable_types = frozenset(("mixed", "mixed-integer")) @@ -2912,7 +2913,18 @@ def union(self, other, sort=None): self._assert_can_do_setop(other) other, result_name = self._convert_can_do_setop(other) - if self.dtype != other.dtype: + if isinstance(self.dtype, CategoricalDtype) and isinstance( + other.dtype, CategoricalDtype + ): + # Unite both categories + both_categories = np.union1d(self.categories, other.categories) + # if ordered and unordered, we set categories to be unordered + ordered = False if self.ordered != other.ordered else None + # Convert both indexes to have the same categories + self = self.set_categories(both_categories, ordered=ordered) + other = other.set_categories(both_categories, ordered=ordered) + + elif self.dtype != other.dtype: if ( isinstance(self, ABCMultiIndex) and not is_object_dtype(_unpack_nested_dtype(other)) @@ -3006,7 +3018,7 @@ def _union(self, other: Index, sort: bool | None): else: missing = algos.unique1d(self.get_indexer_non_unique(other)[1]) - result: Index | MultiIndex | ArrayLike + result: Index | MultiIndex | CategoricalIndex | ArrayLike if self._is_multi: # Preserve MultiIndex to avoid losing dtypes result = self.append(other.take(missing)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 7d1a5b4492740..20c940ad0c6dd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2349,7 +2349,7 @@ def test_construct_with_two_categoricalindex_series(self): result = DataFrame([s1, s2]) expected = DataFrame( np.array([[39, 6, 4, np.nan, np.nan], [152.0, 242.0, 150.0, 2.0, 2.0]]), - columns=["female", "male", "unknown", "f", "m"], + columns=CategoricalIndex(["female", "male", "unknown", "f", "m"]), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 10eca5ea8427f..c36e100aa0a0f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -714,7 +714,7 @@ def test_describe_categorical_columns(): # GH 11558 cats = CategoricalIndex( ["qux", "foo", "baz", "bar"], - categories=["foo", "bar", "baz", "qux"], + categories=["bar", "baz", "foo", "qux"], ordered=True, ) df = DataFrame(np.random.default_rng(2).standard_normal((20, 4)), columns=cats) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 27b54ea66f0ac..6fae11781e623 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -573,7 +573,7 @@ def test_union_duplicate_index_subsets_of_each_other( expected = Index([1, 2, 2, 3, 3, 4], dtype=dtype) if isinstance(a, CategoricalIndex): - expected = Index([1, 2, 2, 3, 3, 4]) + expected = CategoricalIndex([1, 2, 2, 3, 3, 4]) result = a.union(b) tm.assert_index_equal(result, expected) result = a.union(b, sort=False) @@ -670,7 +670,7 @@ def test_union_with_duplicate_index_not_subset_and_non_monotonic( b = Index([0, 0, 1], dtype=dtype) expected = Index([0, 0, 1, 2], dtype=dtype) if isinstance(a, CategoricalIndex): - expected = Index([0, 0, 1, 2]) + expected = CategoricalIndex([0, 0, 1, 2]) result = a.union(b) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index 3fb6a3fb61396..96ca06e1d16a4 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -234,6 +234,8 @@ def test_append_different_columns_types(self, df_columns, series_index): result = df._append(ser) idx_diff = ser.index.difference(df_columns) combined_columns = Index(df_columns.tolist()).append(idx_diff) + if isinstance(result.columns, pd.CategoricalIndex): + combined_columns = pd.CategoricalIndex(combined_columns) expected = DataFrame( [ [1.0, 2.0, 3.0, np.nan, np.nan, np.nan], From 8afb17285bb36c1feef186a81044c7e1dd02918b Mon Sep 17 00:00:00 2001 From: jmarin Date: Sun, 3 Mar 2024 21:08:51 +0100 Subject: [PATCH 4/8] Handling properly all cases and adapt tests accordingly --- pandas/core/frame.py | 2 -- pandas/core/indexes/base.py | 8 ++++++-- pandas/tests/groupby/test_categorical.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fa29af99c9cce..e5d424b15e69e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -808,7 +808,6 @@ def __init__( if len(data) > 0: if is_dataclass(data[0]): data = dataclasses_to_dicts(data) - if not isinstance(data, np.ndarray) and treat_as_nested(data): # exclude ndarray as we may have cast it a few lines above if columns is not None: @@ -821,7 +820,6 @@ def __init__( index, # type: ignore[arg-type] dtype, ) - mgr = arrays_to_mgr( arrays, columns, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f446f3a55bd0f..ab2d3867992d4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2916,8 +2916,12 @@ def union(self, other, sort=None): if isinstance(self.dtype, CategoricalDtype) and isinstance( other.dtype, CategoricalDtype ): - # Unite both categories - both_categories = np.union1d(self.categories, other.categories) + both_categories = self.categories + if len(self.categories) != len(other.categories) or any( + self.categories != other.categories + ): + # Unite both categories + both_categories = np.union1d(self.categories, other.categories) # if ordered and unordered, we set categories to be unordered ordered = False if self.ordered != other.ordered else None # Convert both indexes to have the same categories diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c36e100aa0a0f..10eca5ea8427f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -714,7 +714,7 @@ def test_describe_categorical_columns(): # GH 11558 cats = CategoricalIndex( ["qux", "foo", "baz", "bar"], - categories=["bar", "baz", "foo", "qux"], + categories=["foo", "bar", "baz", "qux"], ordered=True, ) df = DataFrame(np.random.default_rng(2).standard_normal((20, 4)), columns=cats) From 6a0b1af4799977e5e37412f99c49af17428c253e Mon Sep 17 00:00:00 2001 From: jmarin Date: Sun, 3 Mar 2024 22:23:48 +0100 Subject: [PATCH 5/8] Type: ignore[attr-define] when self and other are CategoricalIndex --- pandas/core/indexes/base.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ab2d3867992d4..1045c41824e0e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2916,16 +2916,16 @@ def union(self, other, sort=None): if isinstance(self.dtype, CategoricalDtype) and isinstance( other.dtype, CategoricalDtype ): - both_categories = self.categories - if len(self.categories) != len(other.categories) or any( - self.categories != other.categories + both_categories = self.categories # type: ignore[attr-defined] + if len(self.categories) != len(other.categories) or any( # type: ignore[attr-defined] + self.categories != other.categories # type: ignore[attr-defined] ): # Unite both categories - both_categories = np.union1d(self.categories, other.categories) + both_categories = np.union1d(self.categories, other.categories) # type: ignore[attr-defined] # if ordered and unordered, we set categories to be unordered - ordered = False if self.ordered != other.ordered else None + ordered = False if self.ordered != other.ordered else None # type: ignore[attr-defined] # Convert both indexes to have the same categories - self = self.set_categories(both_categories, ordered=ordered) + self = self.set_categories(both_categories, ordered=ordered) # type: ignore[attr-defined] other = other.set_categories(both_categories, ordered=ordered) elif self.dtype != other.dtype: From f5e4148e53b9d5f01bc56f78a8091a6bc2627df5 Mon Sep 17 00:00:00 2001 From: jmarin Date: Wed, 6 Mar 2024 22:02:08 +0100 Subject: [PATCH 6/8] Use union_categoricals instead of union1d from numpy --- pandas/core/indexes/base.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1045c41824e0e..0b248107f0be2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -112,7 +112,10 @@ pandas_dtype, validate_all_hashable, ) -from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.concat import ( + concat_compat, + union_categoricals, +) from pandas.core.dtypes.dtypes import ( ArrowDtype, CategoricalDtype, @@ -2913,20 +2916,27 @@ def union(self, other, sort=None): self._assert_can_do_setop(other) other, result_name = self._convert_can_do_setop(other) - if isinstance(self.dtype, CategoricalDtype) and isinstance( - other.dtype, CategoricalDtype - ): - both_categories = self.categories # type: ignore[attr-defined] - if len(self.categories) != len(other.categories) or any( # type: ignore[attr-defined] - self.categories != other.categories # type: ignore[attr-defined] - ): - # Unite both categories - both_categories = np.union1d(self.categories, other.categories) # type: ignore[attr-defined] + from pandas import CategoricalIndex + + if isinstance(self, CategoricalIndex) and isinstance(other, CategoricalIndex): + both_categories = self.categories # if ordered and unordered, we set categories to be unordered - ordered = False if self.ordered != other.ordered else None # type: ignore[attr-defined] + ordered = False if self.ordered != other.ordered else None + if len(self.categories) != len(other.categories) or any( + self.categories != other.categories + ): + if ordered is False: + both_categories = union_categoricals( + [self.as_unordered(), other.as_unordered()], # type: ignore[attr-defined] + sort_categories=True, + ).categories + else: + both_categories = union_categoricals( + [self, other], sort_categories=True + ).categories # Convert both indexes to have the same categories self = self.set_categories(both_categories, ordered=ordered) # type: ignore[attr-defined] - other = other.set_categories(both_categories, ordered=ordered) + other = other.set_categories(both_categories, ordered=ordered) # type: ignore[attr-defined] elif self.dtype != other.dtype: if ( From cb3e6b697ec7078e3cbc8834d4a8999056827529 Mon Sep 17 00:00:00 2001 From: jmarin Date: Thu, 7 Mar 2024 23:30:06 +0100 Subject: [PATCH 7/8] Change from CategoricalIndex to CategoricalDtype check --- pandas/core/indexes/base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0b248107f0be2..d2783e2c7ef8c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2916,14 +2916,14 @@ def union(self, other, sort=None): self._assert_can_do_setop(other) other, result_name = self._convert_can_do_setop(other) - from pandas import CategoricalIndex - - if isinstance(self, CategoricalIndex) and isinstance(other, CategoricalIndex): - both_categories = self.categories + if isinstance(self.dtype, CategoricalDtype) and isinstance( + other.dtype, CategoricalDtype + ): + both_categories = self.categories # type: ignore[attr-defined] # if ordered and unordered, we set categories to be unordered - ordered = False if self.ordered != other.ordered else None - if len(self.categories) != len(other.categories) or any( - self.categories != other.categories + ordered = False if self.ordered != other.ordered else None # type: ignore[attr-defined] + if len(self.categories) != len(other.categories) or any( # type: ignore[attr-defined] + self.categories != other.categories # type: ignore[attr-defined] ): if ordered is False: both_categories = union_categoricals( @@ -2936,7 +2936,7 @@ def union(self, other, sort=None): ).categories # Convert both indexes to have the same categories self = self.set_categories(both_categories, ordered=ordered) # type: ignore[attr-defined] - other = other.set_categories(both_categories, ordered=ordered) # type: ignore[attr-defined] + other = other.set_categories(both_categories, ordered=ordered) elif self.dtype != other.dtype: if ( From b332805a2c25568580bb6c47c074aa34fc6e7cb0 Mon Sep 17 00:00:00 2001 From: jmarin Date: Wed, 27 Mar 2024 23:22:36 +0100 Subject: [PATCH 8/8] Improve code to handle it in the original conditional --- pandas/core/indexes/base.py | 41 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 33a04944655c6..5be2319088f9a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2916,15 +2916,23 @@ def union(self, other, sort=None): self._assert_can_do_setop(other) other, result_name = self._convert_can_do_setop(other) - if isinstance(self.dtype, CategoricalDtype) and isinstance( - other.dtype, CategoricalDtype - ): - both_categories = self.categories # type: ignore[attr-defined] - # if ordered and unordered, we set categories to be unordered - ordered = False if self.ordered != other.ordered else None # type: ignore[attr-defined] - if len(self.categories) != len(other.categories) or any( # type: ignore[attr-defined] - self.categories != other.categories # type: ignore[attr-defined] + if self.dtype != other.dtype: + if ( + isinstance(self, ABCMultiIndex) + and not is_object_dtype(_unpack_nested_dtype(other)) + and len(other) > 0 + ): + raise NotImplementedError( + "Can only union MultiIndex with MultiIndex or Index of tuples, " + "try mi.to_flat_index().union(other) instead." + ) + + if isinstance(self, ABCCategoricalIndex) and isinstance( + other, ABCCategoricalIndex ): + both_categories = self.categories + # if ordered and unordered, we set categories to be unordered + ordered = False if self.ordered != other.ordered else None if ordered is False: both_categories = union_categoricals( [self.as_unordered(), other.as_unordered()], # type: ignore[attr-defined] @@ -2934,20 +2942,11 @@ def union(self, other, sort=None): both_categories = union_categoricals( [self, other], sort_categories=True ).categories - # Convert both indexes to have the same categories - self = self.set_categories(both_categories, ordered=ordered) # type: ignore[attr-defined] - other = other.set_categories(both_categories, ordered=ordered) + # Convert both indexes to have the same categories + self = self.set_categories(both_categories, ordered=ordered) # type: ignore[attr-defined] + other = other.set_categories(both_categories, ordered=ordered) # type: ignore[attr-defined] + return self.union(other, sort=sort) - elif self.dtype != other.dtype: - if ( - isinstance(self, ABCMultiIndex) - and not is_object_dtype(_unpack_nested_dtype(other)) - and len(other) > 0 - ): - raise NotImplementedError( - "Can only union MultiIndex with MultiIndex or Index of tuples, " - "try mi.to_flat_index().union(other) instead." - ) self, other = self._dti_setop_align_tzs(other, "union") dtype = self._find_common_type_compat(other)