Skip to content

BUG: Ensure dataframe preserves categorical indices with categorical series #57635

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
20 changes: 18 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@
PeriodArray,
)


__all__ = ["Index"]

_unsortable_types = frozenset(("mixed", "mixed-integer"))
Expand Down Expand Up @@ -2912,7 +2913,22 @@ def union(self, other, sort=None):
self._assert_can_do_setop(other)
other, result_name = self._convert_can_do_setop(other)

if self.dtype != other.dtype:
if isinstance(self.dtype, CategoricalDtype) and isinstance(
other.dtype, CategoricalDtype
):
both_categories = self.categories # type: ignore[attr-defined]
if len(self.categories) != len(other.categories) or any( # type: ignore[attr-defined]
self.categories != other.categories # type: ignore[attr-defined]
):
# Unite both categories
both_categories = np.union1d(self.categories, other.categories) # type: ignore[attr-defined]
# if ordered and unordered, we set categories to be unordered
ordered = False if self.ordered != other.ordered else None # type: ignore[attr-defined]
# Convert both indexes to have the same categories
self = self.set_categories(both_categories, ordered=ordered) # type: ignore[attr-defined]
other = other.set_categories(both_categories, ordered=ordered)

elif self.dtype != other.dtype:
if (
isinstance(self, ABCMultiIndex)
and not is_object_dtype(_unpack_nested_dtype(other))
Expand Down Expand Up @@ -3006,7 +3022,7 @@ def _union(self, other: Index, sort: bool | None):
else:
missing = algos.unique1d(self.get_indexer_non_unique(other)[1])

result: Index | MultiIndex | ArrayLike
result: Index | MultiIndex | CategoricalIndex | ArrayLike
if self._is_multi:
# Preserve MultiIndex to avoid losing dtypes
result = self.append(other.take(missing))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2349,7 +2349,7 @@ def test_construct_with_two_categoricalindex_series(self):
result = DataFrame([s1, s2])
expected = DataFrame(
np.array([[39, 6, 4, np.nan, np.nan], [152.0, 242.0, 150.0, 2.0, 2.0]]),
columns=["female", "male", "unknown", "f", "m"],
columns=CategoricalIndex(["female", "male", "unknown", "f", "m"]),
)
tm.assert_frame_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ def test_union_duplicate_index_subsets_of_each_other(

expected = Index([1, 2, 2, 3, 3, 4], dtype=dtype)
if isinstance(a, CategoricalIndex):
expected = Index([1, 2, 2, 3, 3, 4])
expected = CategoricalIndex([1, 2, 2, 3, 3, 4])
result = a.union(b)
tm.assert_index_equal(result, expected)
result = a.union(b, sort=False)
Expand Down Expand Up @@ -670,7 +670,7 @@ def test_union_with_duplicate_index_not_subset_and_non_monotonic(
b = Index([0, 0, 1], dtype=dtype)
expected = Index([0, 0, 1, 2], dtype=dtype)
if isinstance(a, CategoricalIndex):
expected = Index([0, 0, 1, 2])
expected = CategoricalIndex([0, 0, 1, 2])

result = a.union(b)
tm.assert_index_equal(result, expected)
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/reshape/concat/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ def test_append_different_columns_types(self, df_columns, series_index):
result = df._append(ser)
idx_diff = ser.index.difference(df_columns)
combined_columns = Index(df_columns.tolist()).append(idx_diff)
if isinstance(result.columns, pd.CategoricalIndex):
combined_columns = pd.CategoricalIndex(combined_columns)
expected = DataFrame(
[
[1.0, 2.0, 3.0, np.nan, np.nan, np.nan],
Expand Down