From a2680b97a1b2c83ab97fb8ce29c3cd50a31678af Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Mon, 23 Oct 2017 10:02:37 +0200 Subject: [PATCH] BUG: fix dtype of all-NaN categories and MultiIndex levels --- asv_bench/benchmarks/categoricals.py | 9 +++++++++ doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/categorical.py | 19 +++++++++++++++---- pandas/tests/indexes/test_multi.py | 13 +++++++------ pandas/tests/reshape/test_concat.py | 2 +- .../tests/reshape/test_union_categoricals.py | 7 ++++--- 6 files changed, 37 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index d90c994b3d194..a5bb5e790dec1 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -26,6 +26,9 @@ def setup(self): self.datetimes = pd.Series(pd.date_range( '1995-01-01 00:00:00', periods=10000, freq='s')) + self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) + self.values_all_nan = [np.nan] * len(self.values) + def time_concat(self): concat([self.s, self.s]) @@ -46,6 +49,12 @@ def time_constructor_datetimes_with_nat(self): t.iloc[-1] = pd.NaT Categorical(t) + def time_constructor_with_nan(self): + Categorical(self.values_some_nan) + + def time_constructor_all_nan(self): + Categorical(self.values_all_nan) + class Categoricals2(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index cbd094ec4ef49..a51ffe7b344d1 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -41,6 +41,7 @@ Other API Changes ^^^^^^^^^^^^^^^^^ - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) +- All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, consistent with flat indexes (:issue:`17929`). 
- :class:`Timestamp` will no longer silently ignore unused or invalid `tz` or `tzinfo` arguments (:issue:`17690`) - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the `tseries.offsets` module (:issue:`17830`) - diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 64d7a04d5c3ce..1e3c8f89c0e05 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -288,6 +288,10 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, self._dtype = dtype return + # null_mask indicates missing values we want to exclude from inference. + # This means: only missing values in list-likes (not arrays/ndframes). + null_mask = np.array(False) + # sanitize input if is_categorical_dtype(values): @@ -316,13 +320,14 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, if not isinstance(values, np.ndarray): values = _convert_to_list_like(values) from pandas.core.series import _sanitize_array - # On list with NaNs, int values will be converted to float. Use - # "object" dtype to prevent this. In the end objects will be - # casted to int/... in the category assignment step. 
- if len(values) == 0 or isna(values).any(): + # By convention, empty lists result in object dtype: + if len(values) == 0: sanitize_dtype = 'object' else: sanitize_dtype = None + null_mask = isna(values) + if null_mask.any(): + values = [values[idx] for idx in np.where(~null_mask)[0]] values = _sanitize_array(values, None, dtype=sanitize_dtype) if dtype.categories is None: @@ -370,6 +375,12 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) + if null_mask.any(): + # Reinsert -1 placeholders for previously removed missing values + full_codes = - np.ones(null_mask.shape, dtype=codes.dtype) + full_codes[~null_mask] = codes + codes = full_codes + self._dtype = dtype self._codes = coerce_indexer_dtype(codes, dtype.categories) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 18bfc3d0efbee..ded5de9253eaf 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -970,12 +970,13 @@ def test_get_level_values_na(self): arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]] index = pd.MultiIndex.from_arrays(arrays) - values = index.get_level_values(0) - expected = np.array([np.nan, np.nan, np.nan]) - tm.assert_numpy_array_equal(values.values.astype(float), expected) - values = index.get_level_values(1) - expected = np.array(['a', np.nan, 1], dtype=object) - tm.assert_numpy_array_equal(values.values, expected) + result = index.get_level_values(0) + expected = pd.Index([np.nan, np.nan, np.nan]) + tm.assert_index_equal(result, expected) + + result = index.get_level_values(1) + expected = pd.Index(['a', np.nan, 1]) + tm.assert_index_equal(result, expected) arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])] index = pd.MultiIndex.from_arrays(arrays) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 84a15cab34cd0..75fcfaad75cef 100644 --- 
a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -648,7 +648,7 @@ def test_concat_categorical_coercion_nan(self): s1 = pd.Series([np.nan, np.nan], dtype='category') s2 = pd.Series([np.nan, np.nan]) - exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=object) + exp = pd.Series([np.nan, np.nan, np.nan, np.nan]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index eb80fb54b4016..3211574f834f5 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -90,7 +90,8 @@ def test_union_categoricals_nan(self): tm.assert_categorical_equal(res, exp) # all NaN - res = union_categoricals([pd.Categorical([np.nan, np.nan]), + res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan], + dtype=object)), pd.Categorical(['X'])]) exp = Categorical([np.nan, np.nan, 'X']) tm.assert_categorical_equal(res, exp) @@ -250,7 +251,7 @@ def test_union_categoricals_sort(self): c1 = Categorical([np.nan]) c2 = Categorical([np.nan]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical([np.nan, np.nan], categories=[]) + expected = Categorical([np.nan, np.nan]) tm.assert_categorical_equal(result, expected) c1 = Categorical([]) @@ -299,7 +300,7 @@ def test_union_categoricals_sort_false(self): c1 = Categorical([np.nan]) c2 = Categorical([np.nan]) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical([np.nan, np.nan], categories=[]) + expected = Categorical([np.nan, np.nan]) tm.assert_categorical_equal(result, expected) c1 = Categorical([])