diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 91992da594288..c589b72fa2895 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -91,6 +91,22 @@ def test_setitem_tuple(self): cat[1] = cat[0] assert cat[1] == (0, 1) + def test_setitem_listlike(self): + + # GH#9469 + # properly coerce the input indexers + np.random.seed(1) + c = Categorical( + np.random.randint(0, 5, size=150000).astype(np.int8) + ).add_categories([-1000]) + indexer = np.array([100000]).astype(np.int64) + c[indexer] = -1000 + + # we are asserting the code result here + # which maps to the -1000 category + result = c.codes[np.array([100000]).astype(np.int64)] + tm.assert_numpy_array_equal(result, np.array([5], dtype="int8")) + class TestCategoricalIndexing: def test_getitem_slice(self): diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 5a5aac87b057d..9e50d2889f9ad 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -5,8 +5,18 @@ import pytest import pandas as pd -from pandas import Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + date_range, + isna, +) import pandas._testing as tm +from pandas.api.types import CategoricalDtype as CDT import pandas.core.common as com @@ -745,3 +755,94 @@ def test_reindex_multi_categorical_time(self): result = df2.reindex(midx) expected = DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) tm.assert_frame_equal(result, expected) + + def test_reindex_with_categoricalindex(self): + df = DataFrame( + { + "A": np.arange(3, dtype="int64"), + }, + index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"), + ) + + # reindexing + # convert to a regular index + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["d"]) + expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + # since we are actually reindexing with a Categorical + # then return a Categorical + cats = list("cabe") + + result = df.reindex(Categorical(["a", "e"], categories=cats)) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(Categorical(["a"], categories=cats)) + expected = DataFrame( + {"A": [0], "B": Series(list("a")).astype(CDT(cats))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + # give back the type of categorical that we received + result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) + expected = DataFrame( + {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} + ).set_index("B") + tm.assert_frame_equal(result, expected, check_index_type=True) + + df2 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + }, + index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + ) + # passed duplicate indexers are not allowed + msg = "cannot reindex from a duplicate axis" + with pytest.raises(ValueError, match=msg): + df2.reindex(["a", "b"]) + + # args NotImplemented ATM + msg = r"argument {} is not implemented for CategoricalIndex\.reindex" + with pytest.raises(NotImplementedError, match=msg.format("method")): + df.reindex(["a"], method="ffill") + with pytest.raises(NotImplementedError, match=msg.format("level")): + df.reindex(["a"], level=1) + with pytest.raises(NotImplementedError, match=msg.format("limit")): + df.reindex(["a"], limit=2) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 50d643cf83d7f..16d451a12efc0 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -4,6 +4,7 @@ import pandas as pd from pandas import ( CategoricalDtype, + CategoricalIndex, DataFrame, Index, IntervalIndex, @@ -495,7 +496,7 @@ def test_sort_index_categorical_multiindex(self): columns=["a"], index=MultiIndex( levels=[ - pd.CategoricalIndex( + CategoricalIndex( ["c", "a", "b"], categories=["c", "a", "b"], ordered=True, @@ -736,6 +737,34 @@ def test_sort_index_multilevel_repr_8017(self, gen, extra): result = result.sort_index(axis=1) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "categories", + [ + pytest.param(["a", "b", "c"], id="str"), + pytest.param( + [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)], + id="pd.Interval", + ), + ], + ) + def test_sort_index_with_categories(self, categories): + # GH#23452 + df = DataFrame( + {"foo": range(len(categories))}, + index=CategoricalIndex( + data=categories, categories=categories, ordered=True + ), + ) + df.index = df.index.reorder_categories(df.index.categories[::-1]) + result = df.sort_index() + expected = DataFrame( + {"foo": reversed(range(len(categories)))}, + index=CategoricalIndex( + data=categories[::-1], categories=categories[::-1], ordered=True + ), + ) + tm.assert_frame_equal(result, expected) + class TestDataFrameSortIndexKey: def test_sort_multi_index_key(self): diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index d59dc08b94563..7a1f0a35e1486 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import Categorical, DataFrame, NaT, Timestamp, date_range import pandas._testing as tm @@ -711,3 +713,90 @@ def sorter(key): ) tm.assert_frame_equal(result, expected) + + +@pytest.fixture +def df_none(): + return DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 2, 2, 1, 1], + "A": np.arange(6, 0, -1), + ("B", 5): ["one", "one", "two", "two", "one", "one"], + } + ) + + +@pytest.fixture(params=[["outer"], ["outer", "inner"]]) +def df_idx(request, df_none): + levels = request.param + return df_none.set_index(levels) + + +@pytest.fixture( + params=[ + "inner", # index level + ["outer"], # list of index level + "A", # column + [("B", 5)], # list of column + ["inner", "outer"], # two index levels + [("B", 5), "outer"], # index level and column + ["A", ("B", 5)], # Two columns + ["inner", "outer"], # two index levels and column + ] +) +def sort_names(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def ascending(request): + return request.param + + +class TestSortValuesLevelAsStr: + def test_sort_index_level_and_column_label( + self, df_none, df_idx, sort_names, ascending + ): + # GH#14353 + + # Get index levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on columns and the setting index + expected = df_none.sort_values( + by=sort_names, ascending=ascending, axis=0 + ).set_index(levels) + + # Compute result sorting on mix on columns and index levels + result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0) + + tm.assert_frame_equal(result, expected) + + def test_sort_column_level_and_index_label( + self, df_none, df_idx, sort_names, ascending + ): + # GH#14353 + + # Get levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on axis=0, setting index levels, and then + # transposing. For some cases this will result in a frame with + # multiple column levels + expected = ( + df_none.sort_values(by=sort_names, ascending=ascending, axis=0) + .set_index(levels) + .T + ) + + # Compute result by transposing and sorting on axis=1. + result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) + + if len(levels) > 1: + # Accessing multi-level columns that are not lexsorted raises a + # performance warning + with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False): + tm.assert_frame_equal(result, expected) + else: + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 69f36f039e6db..1521f66a6bc61 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2699,6 +2699,13 @@ def test_frame_ctor_datetime64_column(self): class TestDataFrameConstructorWithDatetimeTZ: + def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): + # GH#25843 + tz = tz_aware_fixture + result = DataFrame({"d": [Timestamp("2019", tz=tz)]}, dtype="datetime64[ns]") + expected = DataFrame({"d": [Timestamp("2019")]}) + tm.assert_frame_equal(result, expected) + def test_from_dict(self): # 8260 diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index a2e1f2398e711..eba92cc71a6d0 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, period_range +from pandas import DataFrame, Index, MultiIndex, date_range, period_range import pandas._testing as tm @@ -341,3 +341,24 @@ def test_merge_join_different_levels(self): with tm.assert_produces_warning(UserWarning): result = df1.join(df2, on="a") tm.assert_frame_equal(result, expected) + + def test_frame_join_tzaware(self): + test1 = DataFrame( + np.zeros((6, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central" + ), + ) + test2 = DataFrame( + np.zeros((3, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central" + ), + columns=range(3, 6), + ) + + result = test1.join(test2, how="outer") + expected = test1.index.union(test2.index) + + tm.assert_index_equal(result.index, expected) + assert result.index.tz.zone == "US/Central" diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py deleted file mode 100644 index 40526ab27ac9a..0000000000000 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ /dev/null @@ -1,92 +0,0 @@ -import numpy as np -import pytest - -from pandas.errors import PerformanceWarning - -from pandas import DataFrame -import pandas._testing as tm - - -@pytest.fixture -def df_none(): - return DataFrame( - { - "outer": ["a", "a", "a", "b", "b", "b"], - "inner": [1, 2, 2, 2, 1, 1], - "A": np.arange(6, 0, -1), - ("B", 5): ["one", "one", "two", "two", "one", "one"], - } - ) - - -@pytest.fixture(params=[["outer"], ["outer", "inner"]]) -def df_idx(request, df_none): - levels = request.param - return df_none.set_index(levels) - - -@pytest.fixture( - params=[ - "inner", # index level - ["outer"], # list of index level - "A", # column - [("B", 5)], # list of column - ["inner", "outer"], # two index levels - [("B", 5), "outer"], # index level and column - ["A", ("B", 5)], # Two columns - ["inner", "outer"], # two index levels and column - ] -) -def sort_names(request): - return request.param - - -@pytest.fixture(params=[True, False]) -def ascending(request): - return request.param - - -def test_sort_index_level_and_column_label(df_none, df_idx, sort_names, ascending): - - # GH 14353 - - # Get index levels from df_idx - levels = df_idx.index.names - - # Compute expected by sorting on columns and the setting index - expected = df_none.sort_values( - by=sort_names, ascending=ascending, axis=0 - ).set_index(levels) - - # Compute result sorting on mix on columns and index levels - result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0) - - tm.assert_frame_equal(result, expected) - - -def test_sort_column_level_and_index_label(df_none, df_idx, sort_names, ascending): - - # GH 14353 - - # Get levels from df_idx - levels = df_idx.index.names - - # Compute expected by sorting on axis=0, setting index levels, and then - # transposing. For some cases this will result in a frame with - # multiple column levels - expected = ( - df_none.sort_values(by=sort_names, ascending=ascending, axis=0) - .set_index(levels) - .T - ) - - # Compute result by transposing and sorting on axis=1. - result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) - - if len(levels) > 1: - # Accessing multi-level columns that are not lexsorted raises a - # performance warning - with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False): - tm.assert_frame_equal(result, expected) - else: - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index f3667c4dd9d9d..22ffb30324366 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas as pd from pandas import DataFrame, to_datetime @@ -6,39 +7,41 @@ class TestDataFrameTimeSeriesMethods: - def test_frame_append_datetime64_col_other_units(self): + @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"]) + def test_frame_append_datetime64_col_other_units(self, unit): n = 100 - units = ["h", "m", "s", "ms", "D", "M", "Y"] - ns_dtype = np.dtype("M8[ns]") - for unit in units: - dtype = np.dtype(f"M8[{unit}]") - vals = np.arange(n, dtype=np.int64).view(dtype) + dtype = np.dtype(f"M8[{unit}]") + vals = np.arange(n, dtype=np.int64).view(dtype) - df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) - df[unit] = vals + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) + df[unit] = vals - ex_vals = to_datetime(vals.astype("O")).values + ex_vals = to_datetime(vals.astype("O")).values - assert df[unit].dtype == ns_dtype - assert (df[unit].values == ex_vals).all() + assert df[unit].dtype == ns_dtype + assert (df[unit].values == ex_vals).all() + @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"]) + def test_frame_setitem_existing_datetime64_col_other_units(self, unit): # Test insertion into existing datetime64 column + n = 100 + ns_dtype = np.dtype("M8[ns]") + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) df["dates"] = np.arange(n, dtype=np.int64).view(ns_dtype) - for unit in units: - dtype = np.dtype(f"M8[{unit}]") - vals = np.arange(n, dtype=np.int64).view(dtype) + dtype = np.dtype(f"M8[{unit}]") + vals = np.arange(n, dtype=np.int64).view(dtype) - tmp = df.copy() + tmp = df.copy() - tmp["dates"] = vals - ex_vals = to_datetime(vals.astype("O")).values + tmp["dates"] = vals + ex_vals = to_datetime(vals.astype("O")).values - assert (tmp["dates"].values == ex_vals).all() + assert (tmp["dates"].values == ex_vals).all() def test_datetime_assignment_with_NaT_and_diff_time_units(self): # GH 7492 diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index bb4e7a157f53e..3d814f22ce262 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -6,34 +6,12 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype -import pandas as pd from pandas import DataFrame, Series import pandas._testing as tm from pandas.core.indexes.datetimes import date_range class TestDataFrameTimezones: - def test_frame_join_tzaware(self): - test1 = DataFrame( - np.zeros((6, 3)), - index=date_range( - "2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central" - ), - ) - test2 = DataFrame( - np.zeros((3, 3)), - index=date_range( - "2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central" - ), - columns=range(3, 6), - ) - - result = test1.join(test2, how="outer") - ex_index = test1.index.union(test2.index) - - tm.assert_index_equal(result.index, ex_index) - assert result.index.tz.zone == "US/Central" - @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_frame_no_datetime64_dtype(self, tz): # after GH#7822 @@ -95,10 +73,3 @@ def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz): np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) ) tm.assert_frame_equal(result, expected) - - def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): - # GH 25843 - tz = tz_aware_fixture - result = DataFrame({"d": [pd.Timestamp("2019", tz=tz)]}, dtype="datetime64[ns]") - expected = DataFrame({"d": [pd.Timestamp("2019")]}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_map.py b/pandas/tests/indexes/categorical/test_map.py index 55e050ebdb2d8..1a326c1acea46 100644 --- a/pandas/tests/indexes/categorical/test_map.py +++ b/pandas/tests/indexes/categorical/test_map.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import CategoricalIndex, Index +from pandas import CategoricalIndex, Index, Series import pandas._testing as tm @@ -56,7 +56,7 @@ def f(x): ) tm.assert_index_equal(result, exp) - result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"])) + result = ci.map(Series([10, 20, 30], index=["A", "B", "C"])) tm.assert_index_equal(result, exp) result = ci.map({"A": 10, "B": 20, "C": 30}) @@ -65,8 +65,8 @@ def f(x): def test_map_with_categorical_series(self): # GH 12756 a = Index([1, 2, 3, 4]) - b = pd.Series(["even", "odd", "even", "odd"], dtype="category") - c = pd.Series(["even", "odd", "even", "odd"]) + b = Series(["even", "odd", "even", "odd"], dtype="category") + c = Series(["even", "odd", "even", "odd"]) exp = CategoricalIndex(["odd", "even", "odd", np.nan]) tm.assert_index_equal(a.map(b), exp) @@ -80,8 +80,8 @@ def test_map_with_categorical_series(self): ([1, 2, np.nan], pd.isna), ([1, 1, np.nan], {1: False}), ([1, 2, np.nan], {1: False, 2: False}), - ([1, 1, np.nan], pd.Series([False, False])), - ([1, 2, np.nan], pd.Series([False, False, False])), + ([1, 1, np.nan], Series([False, False])), + ([1, 2, np.nan], Series([False, False, False])), ), ) def test_map_with_nan(self, data, f): # GH 24241 @@ -93,3 +93,19 @@ def test_map_with_nan(self, data, f): # GH 24241 else: expected = Index([False, False, np.nan]) tm.assert_index_equal(result, expected) + + def test_map_with_dict_or_series(self): + orig_values = ["a", "B", 1, "a"] + new_values = ["one", 2, 3.0, "one"] + cur_index = CategoricalIndex(orig_values, name="XXX") + expected = CategoricalIndex(new_values, name="XXX", categories=[3.0, 2, "one"]) + + mapper = Series(new_values[:-1], index=orig_values[:-1]) + result = cur_index.map(mapper) + # Order of categories in result can be different + tm.assert_index_equal(result, expected) + + mapper = {o: n for o, n in zip(orig_values[:-1], new_values[:-1])} + result = cur_index.map(mapper) + # Order of categories in result can be different + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 9b52c297ec688..854ca176fd2f4 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -25,27 +25,15 @@ def setup_method(self, method): self.df = DataFrame( { "A": np.arange(6, dtype="int64"), - "B": Series(list("aabbca")).astype(CDT(list("cab"))), - } - ).set_index("B") + }, + index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B"), + ) self.df2 = DataFrame( { "A": np.arange(6, dtype="int64"), - "B": Series(list("aabbca")).astype(CDT(list("cabe"))), - } - ).set_index("B") - self.df3 = DataFrame( - { - "A": np.arange(6, dtype="int64"), - "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=True))), - } - ).set_index("B") - self.df4 = DataFrame( - { - "A": np.arange(6, dtype="int64"), - "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))), - } - ).set_index("B") + }, + index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), + ) def test_loc_scalar(self): result = self.df.loc["a"] @@ -446,22 +434,6 @@ def test_getitem_with_listlike(self): result = dummies[list(dummies.columns)] tm.assert_frame_equal(result, expected) - def test_setitem_listlike(self): - - # GH 9469 - # properly coerce the input indexers - np.random.seed(1) - c = Categorical( - np.random.randint(0, 5, size=150000).astype(np.int8) - ).add_categories([-1000]) - indexer = np.array([100000]).astype(np.int64) - c[indexer] = -1000 - - # we are asserting the code result here - # which maps to the -1000 category - result = c.codes[np.array([100000]).astype(np.int64)] - tm.assert_numpy_array_equal(result, np.array([5], dtype="int8")) - def test_ix_categorical_index(self): # GH 12531 df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ")) @@ -530,91 +502,6 @@ def test_read_only_source(self): tm.assert_series_equal(rw_df.loc[1], ro_df.loc[1]) tm.assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3]) - def test_reindexing(self): - df = DataFrame( - { - "A": np.arange(3, dtype="int64"), - "B": Series(list("abc")).astype(CDT(list("cabe"))), - } - ).set_index("B") - - # reindexing - # convert to a regular index - result = df.reindex(["a", "b", "e"]) - expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( - "B" - ) - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["a", "b"]) - expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["e"]) - expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["d"]) - expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - # since we are actually reindexing with a Categorical - # then return a Categorical - cats = list("cabe") - - result = df.reindex(Categorical(["a", "e"], categories=cats)) - expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} - ).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(Categorical(["a"], categories=cats)) - expected = DataFrame( - {"A": [0], "B": Series(list("a")).astype(CDT(cats))} - ).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["a", "b", "e"]) - expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( - "B" - ) - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["a", "b"]) - expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(["e"]) - expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - # give back the type of categorical that we received - result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) - expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} - ).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) - expected = DataFrame( - {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} - ).set_index("B") - tm.assert_frame_equal(result, expected, check_index_type=True) - - # passed duplicate indexers are not allowed - msg = "cannot reindex from a duplicate axis" - with pytest.raises(ValueError, match=msg): - self.df2.reindex(["a", "b"]) - - # args NotImplemented ATM - msg = r"argument {} is not implemented for CategoricalIndex\.reindex" - with pytest.raises(NotImplementedError, match=msg.format("method")): - df.reindex(["a"], method="ffill") - with pytest.raises(NotImplementedError, match=msg.format("level")): - df.reindex(["a"], level=1) - with pytest.raises(NotImplementedError, match=msg.format("limit")): - df.reindex(["a"], limit=2) - def test_loc_slice(self): # GH9748 with pytest.raises(KeyError, match="1"): @@ -635,10 +522,24 @@ def test_loc_and_at_with_categorical_index(self): assert df.loc["B", 1] == 4 assert df.at["B", 1] == 4 - def test_boolean_selection(self): + def test_getitem_bool_mask_categorical_index(self): - df3 = self.df3 - df4 = self.df4 + df3 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + }, + index=CategoricalIndex( + [1, 1, 2, 1, 3, 2], dtype=CDT([3, 2, 1], ordered=True), name="B" + ), + ) + df4 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + }, + index=CategoricalIndex( + [1, 1, 2, 1, 3, 2], dtype=CDT([3, 2, 1], ordered=False), name="B" + ), + ) result = df3[df3.index == "a"] expected = df3.iloc[[]] @@ -699,24 +600,6 @@ def test_indexing_with_category(self): res = cat[["A"]] == "foo" tm.assert_frame_equal(res, exp) - def test_map_with_dict_or_series(self): - orig_values = ["a", "B", 1, "a"] - new_values = ["one", 2, 3.0, "one"] - cur_index = pd.CategoricalIndex(orig_values, name="XXX") - expected = pd.CategoricalIndex( - new_values, name="XXX", categories=[3.0, 2, "one"] - ) - - mapper = Series(new_values[:-1], index=orig_values[:-1]) - output = cur_index.map(mapper) - # Order of categories in output can be different - tm.assert_index_equal(expected, output) - - mapper = {o: n for o, n in zip(orig_values[:-1], new_values[:-1])} - output = cur_index.map(mapper) - # Order of categories in output can be different - tm.assert_index_equal(expected, output) - @pytest.mark.parametrize( "idx_values", [ @@ -781,31 +664,3 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "categories", - [ - pytest.param(["a", "b", "c"], id="str"), - pytest.param( - [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)], - id="pd.Interval", - ), - ], - ) - def test_reorder_index_with_categories(self, categories): - # GH23452 - df = DataFrame( - {"foo": range(len(categories))}, - index=CategoricalIndex( - data=categories, categories=categories, ordered=True - ), - ) - df.index = df.index.reorder_categories(df.index.categories[::-1]) - result = df.sort_index() - expected = DataFrame( - {"foo": reversed(range(len(categories)))}, - index=CategoricalIndex( - data=categories[::-1], categories=categories[::-1], ordered=True - ), - ) - tm.assert_frame_equal(result, expected)