From ced299f9a753c52fda67c665116569f7973270b7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 12 Oct 2018 13:22:38 -0500 Subject: [PATCH 01/23] ENH: Support EAs in Series.unstack --- pandas/core/reshape/reshape.py | 22 ++++++++++++++ pandas/tests/extension/base/reshaping.py | 38 ++++++++++++++++++++++++ pandas/tests/extension/decimal/array.py | 5 +++- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 03b77f0e787f0..e9fe6ee731984 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -344,6 +344,7 @@ def _unstack_multiple(data, clocs, fill_value=None): if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index + unstacked = dummy.unstack('__placeholder__', fill_value=fill_value) new_levels = clevels new_names = cnames @@ -399,6 +400,8 @@ def unstack(obj, level, fill_value=None): else: return obj.T.stack(dropna=False) else: + if is_extension_array_dtype(obj.dtype): + return unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker(obj.values, obj.index, level=level, fill_value=fill_value, constructor=obj._constructor_expanddim) @@ -947,3 +950,22 @@ def make_axis_dummies(frame, axis='minor', transform=None): values = values.take(labels, axis=0) return DataFrame(values, columns=items, index=frame.index) + + +def unstack_extension_series(series, level, fill_value): + from pandas.core.reshape.concat import concat + + dummy_arr = np.arange(len(series)) + # fill_value=-1, since we will do a series.values.take later + result = _Unstacker(dummy_arr, series.index, + level=level, fill_value=-1).get_result() + + out = [] + values = series.values + + for col, indicies in result.iteritems(): + out.append(Series(values.take(indicies.values, + allow_fill=True, + fill_value=fill_value), + name=col, index=result.index)) + return concat(out, axis='columns') diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 7f13c2cd67373..fa9b168a2b522 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -1,3 +1,4 @@ +import itertools import pytest import numpy as np @@ -170,3 +171,40 @@ def test_merge(self, data, na_value): [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype)}) self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + + @pytest.mark.parametrize("index", [ + pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']])), + pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b'], ['x', 'y', 'z']])), + + # non-uniform + pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('B', 'b')]), + + # three levels, non-uniform + pd.MultiIndex.from_product([('A', 'B'), ('a', 'b', 'c'), (0, 1, 2)]), + pd.MultiIndex.from_tuples([ + ('A', 'a', 1), + ('A', 'b', 0), + ('A', 'a', 0), + ('B', 'a', 0), + ('B', 'c', 1), + ]), + ]) + def test_unstack(self, data, index): + data = data[:len(index)] + ser = pd.Series(data, index=index) + + n = index.nlevels + levels = list(range(n)) + # [0, 1, 2] + # -> [(0,), (1,), (2,) (0, 1), (1, 0)] + combinations = itertools.chain.from_iterable( + itertools.permutations(levels, i) for i in range(1, n) + ) + + for level in combinations: + result = ser.unstack(level=level) + assert all(isinstance(result[col].values, type(data)) for col in result.columns) + expected = ser.astype(object).unstack(level=level) + result = result.astype(object) + + self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index fe07aae61c5e2..8c6333c7ce8ee 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -102,7 +102,10 @@ def copy(self, deep=False): def astype(self, dtype, copy=True): if isinstance(dtype, type(self.dtype)): return type(self)(self._data, context=dtype.context) - return super(DecimalArray, self).astype(dtype, copy) + # need to replace decimal NA + result = np.asarray(self, dtype=dtype) + result[self.isna()] = np.nan + return result def __setitem__(self, key, value): if pd.api.types.is_list_like(value): From 3b63fcbe82fb1a44498cc7e5d5ad2ba19428ab7c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Oct 2018 14:58:20 -0500 Subject: [PATCH 02/23] release note --- doc/source/whatsnew/v0.24.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index d0aa156cf5059..6d194acd8940b 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -724,6 +724,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) - :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). - Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) +- :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`). .. _whatsnew_0240.api.incompatibilities: From 756dde9273e59a92ac3ba3c27ef5e33bcfd3d96f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Oct 2018 15:05:49 -0500 Subject: [PATCH 03/23] xfail --- pandas/tests/extension/json/test_json.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 15d99f6c5d2fc..0eafc9558956e 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -138,7 +138,11 @@ def test_from_dtype(self, data): class TestReshaping(BaseJSON, base.BaseReshapingTests): - pass + @pytest.mark.xfail(reason="dict for NA", strict=True) + def test_unstack(self, data, index): + # The base test has NaN for the expected NA value. + # this matches otherwise + return super().test_unstack(data, index) class TestGetitem(BaseJSON, base.BaseGetitemTests): From 90f84ef6f8f0d6c6ebc3336f97c2f77f1cfe75c4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Oct 2018 15:08:00 -0500 Subject: [PATCH 04/23] spelling --- pandas/core/reshape/reshape.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e9fe6ee731984..fc85bfafd4ac7 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -963,8 +963,8 @@ def unstack_extension_series(series, level, fill_value): out = [] values = series.values - for col, indicies in result.iteritems(): - out.append(Series(values.take(indicies.values, + for col, indices in result.iteritems(): + out.append(Series(values.take(indices.values, allow_fill=True, fill_value=fill_value), name=col, index=result.index)) From 942db1b918f818281c5dcfd10a951e519a04dc42 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Oct 2018 16:12:23 -0500 Subject: [PATCH 05/23] lint --- pandas/tests/extension/base/reshaping.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index fa9b168a2b522..8367d02cc1af9 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -174,7 +174,8 @@ def test_merge(self, data, na_value): @pytest.mark.parametrize("index", [ pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']])), - pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b'], ['x', 'y', 'z']])), + pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b'], + ['x', 'y', 'z']])), # non-uniform pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('B', 'b')]), @@ -203,7 +204,8 @@ def test_unstack(self, data, index): for level in combinations: result = ser.unstack(level=level) - assert all(isinstance(result[col].values, type(data)) for col in result.columns) + assert all(isinstance(result[col].values, type(data)) + for col in result.columns) expected = ser.astype(object).unstack(level=level) result = result.astype(object) From 36a4450c01cf34ccf166d8e35371f404b3b901ae Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 23 Oct 2018 06:08:45 -0500 Subject: [PATCH 06/23] no copy --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index fc85bfafd4ac7..4c433ac1548ed 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -968,4 +968,4 @@ def unstack_extension_series(series, level, fill_value): allow_fill=True, fill_value=fill_value), name=col, index=result.index)) - return concat(out, axis='columns') + return concat(out, axis='columns', copy=False) From ee330d610584da54b338fd03180801c487357402 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 23 Oct 2018 07:19:28 -0500 Subject: [PATCH 07/23] Fixup decimal tests --- pandas/tests/extension/decimal/array.py | 5 +--- .../tests/extension/decimal/test_decimal.py | 23 +++++++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 8c6333c7ce8ee..958fa44a07761 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -102,10 +102,7 @@ def copy(self, deep=False): def astype(self, dtype, copy=True): if isinstance(dtype, type(self.dtype)): return type(self)(self._data, context=dtype.context) - # need to replace decimal NA - result = np.asarray(self, dtype=dtype) - result[self.isna()] = np.nan - return result + return np.asarray(self, dtype=dtype) def __setitem__(self, key, value): if pd.api.types.is_list_like(value): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index be1c61166e4b1..73fac6eb39ee9 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -1,5 +1,6 @@ -import operator import decimal +import math +import operator import numpy as np import pandas as pd @@ -63,9 +64,23 @@ def data_for_grouping(): class BaseDecimal(object): def assert_series_equal(self, left, right, *args, **kwargs): - - left_na = left.isna() - right_na = right.isna() + def convert(x): + # need to convert array([Decimal(NaN)], dtype='object') to np.NaN + # because Series[object].isnan doesn't recognize decimal(NaN) as + # NA. + try: + return math.isnan(x) + except TypeError: + return False + + if left.dtype == 'object': + left_na = left.apply(convert) + else: + left_na = left.isna() + if right.dtype == 'object': + right_na = right.apply(convert) + else: + right_na = right.isna() tm.assert_series_equal(left_na, right_na) return tm.assert_series_equal(left[~left_na], From e9498a1470a7953108bc0021c91bd8d3571555c7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 23 Oct 2018 16:28:38 -0500 Subject: [PATCH 08/23] update --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/reshape/reshape.py | 67 +++++++++++++++++------- pandas/tests/extension/base/reshaping.py | 2 +- pandas/tests/frame/test_reshape.py | 7 +-- 4 files changed, 53 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index a0ba4ea578387..74bee471444dc 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -975,6 +975,7 @@ Categorical - Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) - Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). - Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`). +- In meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 4c433ac1548ed..640063224628a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -401,7 +401,7 @@ def unstack(obj, level, fill_value=None): return obj.T.stack(dropna=False) else: if is_extension_array_dtype(obj.dtype): - return unstack_extension_series(obj, level, fill_value) + return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker(obj.values, obj.index, level=level, fill_value=fill_value, constructor=obj._constructor_expanddim) @@ -422,6 +422,52 @@ def _unstack_frame(obj, level, fill_value=None): return unstacker.get_result() +def _unstack_extension_series(series, level, fill_value): + """ + Unstack an ExtensionArray-backed Series. + + The ExtensionDtype is preserved. + + Parameters + ---------- + series : Series + A Series with an ExtensionArray for values + level : Any + The level name or number. + fill_value : Any + The user-level (not physical storage) fill value to use for + missing values introduced by the reshape. Passed to + ``series.values.take``. + + Returns + ------- + DataFrame + Each column of the DataFrame will have the same dtype as + the input Series. + """ + # Implementation note: the basic idea is to + # 1. Do a regular unstack on a dummy array of integers + # 2. Followup with a columnwise take. + # We use the dummy take to discover newly-created missing values + # introduced by the reshape. + from pandas.core.reshape.concat import concat + + dummy_arr = np.arange(len(series)) + # fill_value=-1, since we will do a series.values.take later + result = _Unstacker(dummy_arr, series.index, + level=level, fill_value=-1).get_result() + + out = [] + values = series.values + + for col, indices in result.iteritems(): + out.append(Series(values.take(indices.values, + allow_fill=True, + fill_value=fill_value), + name=col, index=result.index)) + return concat(out, axis='columns', copy=False) + + def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. Columns become the @@ -950,22 +996,3 @@ def make_axis_dummies(frame, axis='minor', transform=None): values = values.take(labels, axis=0) return DataFrame(values, columns=items, index=frame.index) - - -def unstack_extension_series(series, level, fill_value): - from pandas.core.reshape.concat import concat - - dummy_arr = np.arange(len(series)) - # fill_value=-1, since we will do a series.values.take later - result = _Unstacker(dummy_arr, series.index, - level=level, fill_value=-1).get_result() - - out = [] - values = series.values - - for col, indices in result.iteritems(): - out.append(Series(values.take(indices.values, - allow_fill=True, - fill_value=fill_value), - name=col, index=result.index)) - return concat(out, axis='columns', copy=False) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 8367d02cc1af9..b47eb0c98f00a 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -197,7 +197,7 @@ def test_unstack(self, data, index): n = index.nlevels levels = list(range(n)) # [0, 1, 2] - # -> [(0,), (1,), (2,) (0, 1), (1, 0)] + # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)] combinations = itertools.chain.from_iterable( itertools.permutations(levels, i) for i in range(1, n) ) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 9f6735c7ba2bf..24b6aaca960a4 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -314,9 +314,10 @@ def test_unstack_fill_frame_categorical(self): index=list('xyz')) assert_frame_equal(result, expected) - # Fill with non-category results in NaN entries similar to above - result = data.unstack(fill_value='d') - assert_frame_equal(result, expected) + # Fill with non-category results in a TypeError + msg = r"'fill_value' \('d'\) is not in" + with tm.assert_raises_regex(TypeError, msg): + data.unstack(fill_value='d') # Fill with category value replaces missing values as expected result = data.unstack(fill_value='c') From 72b5a0dcee2f16e414c5aab1c77348704bf04152 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Oct 2018 06:08:46 -0500 Subject: [PATCH 09/23] handle names --- pandas/core/reshape/reshape.py | 2 +- pandas/tests/extension/base/reshaping.py | 2 +- pandas/tests/frame/test_reshape.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 640063224628a..2f5e98bbfda36 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -465,7 +465,7 @@ def _unstack_extension_series(series, level, fill_value): allow_fill=True, fill_value=fill_value), name=col, index=result.index)) - return concat(out, axis='columns', copy=False) + return concat(out, axis='columns', copy=False, keys=result.columns) def stack(frame, level=-1, dropna=True): diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index b47eb0c98f00a..066fb182b50a4 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -173,7 +173,7 @@ def test_merge(self, data, na_value): self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) @pytest.mark.parametrize("index", [ - pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']])), + pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]), names=['a', 'b']), pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b'], ['x', 'y', 'z']])), diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 24b6aaca960a4..cfd6399110c75 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -303,7 +303,8 @@ def test_unstack_fill_frame_categorical(self): # Test unstacking with categorical data = pd.Series(['a', 'b', 'c', 'a'], dtype='category') data.index = pd.MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')], + ) # By default missing values will be NaN result = data.unstack() From 4d679cbc9d8551be9e856c2cdabbd0afd3abc16b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Oct 2018 06:09:16 -0500 Subject: [PATCH 10/23] lint --- pandas/tests/extension/base/reshaping.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 066fb182b50a4..5316d88ba64a1 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -173,7 +173,8 @@ def test_merge(self, data, na_value): self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) @pytest.mark.parametrize("index", [ - pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]), names=['a', 'b']), + pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]), + names=['a', 'b']), pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b'], ['x', 'y', 'z']])), From ff7aba750f89cbd9c3ce3d80f22acf5744d0b401 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Oct 2018 08:05:26 -0500 Subject: [PATCH 11/23] handle DataFrame.unstack --- pandas/core/reshape/reshape.py | 7 +++++++ pandas/tests/extension/base/reshaping.py | 8 ++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2f5e98bbfda36..fbe5c3d04d888 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -409,6 +409,13 @@ def unstack(obj, level, fill_value=None): def _unstack_frame(obj, level, fill_value=None): + from pandas.tools.merge import concat + + if (obj._is_homogeneous_type and + is_extension_array_dtype(obj.dtypes.iloc[0])): + frames = [ser.unstack(level=level, fill_value=fill_value) + for name, ser in obj.iteritems()] + return concat(frames, axis=1, keys=obj.columns) if obj._is_mixed_type: unstacker = partial(_Unstacker, index=obj.index, level=level, fill_value=fill_value) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 5316d88ba64a1..5572f717a561c 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -191,9 +191,13 @@ def test_merge(self, data, na_value): ('B', 'c', 1), ]), ]) - def test_unstack(self, data, index): + @pytest.mark.parametrize("obj", ["series", "frame"]) + def test_unstack(self, data, index, obj): data = data[:len(index)] - ser = pd.Series(data, index=index) + if obj == "series": + ser = pd.Series(data, index=index) + else: + ser = pd.DataFrame({"A": data, "B": data}, index=index) n = index.nlevels levels = list(range(n)) From 49bdb50db44b8a6e0d67bfb59e0300bce6948718 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Oct 2018 08:17:06 -0500 Subject: [PATCH 12/23] handle DataFrame.unstack --- pandas/core/internals/blocks.py | 6 +++++- pandas/tests/extension/base/reshaping.py | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5ce8a9103f008..de981672ed034 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -34,6 +34,7 @@ is_numeric_v_string_like, is_extension_type, is_extension_array_dtype, is_list_like, + is_sparse, is_re, is_re_compilable, pandas_dtype) @@ -632,7 +633,10 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, return self if klass is None: - if dtype == np.object_: + if is_sparse(self.values): + # Series[Sparse].astype(object) is sparse. + klass = ExtensionBlock + elif is_object_dtype(dtype): klass = ObjectBlock elif is_extension_array_dtype(dtype): klass = ExtensionBlock diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 5572f717a561c..563f247ba052b 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -173,10 +173,9 @@ def test_merge(self, data, na_value): self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) @pytest.mark.parametrize("index", [ + # Two levels, uniform. pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]), names=['a', 'b']), - pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b'], - ['x', 'y', 'z']])), # non-uniform pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('B', 'b')]), From cf8ed731b8fe8eb6832a6c0e8b886863362bf95d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Oct 2018 09:23:50 -0500 Subject: [PATCH 13/23] handle DataFrame.unstack --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/internals/blocks.py | 30 ++++++++++++++++++++++++++++++ pandas/core/reshape/reshape.py | 2 +- pandas/tests/frame/test_reshape.py | 15 +++++++++++++++ 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index e72b0b820ee5d..e75a0b5fc1aff 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -808,7 +808,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) - :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). - Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) -- :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`). +- :meth:`Series.unstack` and :meth:`DataFrame.unstack` no longer convert extension arrays to object-dtype ndarrays. Each column in the output ``DataFrame`` will now have the same dtype as the input (:issue:`23077`). .. _whatsnew_0240.api.incompatibilities: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index de981672ed034..b92d62e7e0ca4 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import copy import warnings import inspect import re @@ -1954,6 +1955,35 @@ def shift(self, periods, axis=0): def _ftype(self): return getattr(self.values, '_pandas_ftype', Block._ftype) + def _unstack(self, unstacker_func, new_columns): + # I wonder if this is supported + fill_value = unstacker_func.keywords['fill_value'] + unstacker_func = copy.deepcopy(unstacker_func) + unstacker_func.keywords['fill_value'] = -1 + + # just get the index. Can maybe avoid this? + dummy_unstacker = unstacker_func(np.empty((0, 0))) + + dummy_arr = np.arange(len(dummy_unstacker.index)) + + unstacker = unstacker_func(dummy_arr) + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + mask = mask.any(0) + + new_values = [ + self.values.take(indices, allow_fill=True, + fill_value=fill_value) + for indices in new_values.T + ] + + blocks = [ + self.make_block_same_class(vals, [place]) + for vals, place in zip(new_values, new_placement) + ] + return blocks, mask + class NumericBlock(Block): __slots__ = () diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index fbe5c3d04d888..aa85be4bdbc02 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -409,7 +409,7 @@ def unstack(obj, level, fill_value=None): def _unstack_frame(obj, level, fill_value=None): - from pandas.tools.merge import concat + from pandas.core.reshape.concat import concat if (obj._is_homogeneous_type and is_extension_array_dtype(obj.dtypes.iloc[0])): diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index cfd6399110c75..54511df4effad 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -874,6 +874,21 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('level', [0, 1]) + def test_unstack_mixed_extension_types(self, level): + index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)], + names=['a', 'b']) + df = pd.DataFrame({"A": pd.core.arrays.integer_array([0, 1, None]), + "B": pd.Categorical(['a', 'a', 'b'])}, index=index) + + result = df.unstack(level=level) + expected = df.astype(object).unstack(level=level) + + expected_dtypes = pd.Series([df.A.dtype] * 2 + [df.B.dtype] * 2, + index=result.columns) + tm.assert_series_equal(result.dtypes, expected_dtypes) + tm.assert_frame_equal(result.astype(object), expected) + @pytest.mark.parametrize("level", [0, 'baz']) def test_unstack_swaplevel_sortlevel(self, level): # GH 20994 From 5902b5ba1be9e10f482d21b0e2b037b7228264f6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Oct 2018 10:08:48 -0500 Subject: [PATCH 14/23] Slightly de-hackify --- pandas/core/internals/blocks.py | 67 +++++++++++++++++-------------- pandas/core/internals/managers.py | 10 ++++- pandas/core/reshape/reshape.py | 10 +---- 3 files changed, 47 insertions(+), 40 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b92d62e7e0ca4..19e832ef63c99 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -import copy +import functools import warnings import inspect import re @@ -1434,7 +1434,7 @@ def equals(self, other): return False return array_equivalent(self.values, other.values) - def _unstack(self, unstacker_func, new_columns): + def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): """Return a list of unstacked blocks of self Parameters @@ -1443,6 +1443,10 @@ def _unstack(self, unstacker_func, new_columns): Partially applied unstacker. new_columns : Index All columns of the unstacked BlockManager. + n_rows : int + Only used in ExtensionBlock.unstack + fill_value : int + Only used in ExtensionBlock.unstack Returns ------- @@ -1736,7 +1740,7 @@ def _slice(self, slicer): def _try_cast_result(self, result, dtype=None): return result - def _unstack(self, unstacker_func, new_columns): + def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): """Return a list of unstacked blocks of self Parameters @@ -1745,6 +1749,10 @@ def _unstack(self, unstacker_func, new_columns): Partially applied unstacker. new_columns : Index All columns of the unstacked BlockManager. + n_rows : int + Only used in ExtensionBlock.unstack + fill_value : int + Only used in ExtensionBlock.unstack Returns ------- @@ -1756,11 +1764,11 @@ def _unstack(self, unstacker_func, new_columns): # NonConsolidatable blocks can have a single item only, so we return # one block per item unstacker = unstacker_func(self.values.T) - new_items = unstacker.get_new_columns() - new_placement = new_columns.get_indexer(new_items) - new_values, mask = unstacker.get_new_values() - mask = mask.any(0) + new_placement, new_values, mask = self._get_unstack_items( + unstacker, new_columns + ) + new_values = new_values.T[mask] new_placement = new_placement[mask] @@ -1768,6 +1776,16 @@ def _unstack(self, unstacker_func, new_columns): for vals, place in zip(new_values, new_placement)] return blocks, mask + @staticmethod + def _get_unstack_items(unstacker, new_columns): + # shared with ExtensionBlock + new_items = unstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) + new_values, mask = unstacker.get_new_values() + + mask = mask.any(0) + return new_placement, new_values, mask + class ExtensionBlock(NonConsolidatableMixIn, Block): """Block for holding extension types. @@ -1955,32 +1973,21 @@ def shift(self, periods, axis=0): def _ftype(self): return getattr(self.values, '_pandas_ftype', Block._ftype) - def _unstack(self, unstacker_func, new_columns): - # I wonder if this is supported - fill_value = unstacker_func.keywords['fill_value'] - unstacker_func = copy.deepcopy(unstacker_func) - unstacker_func.keywords['fill_value'] = -1 - - # just get the index. Can maybe avoid this? - dummy_unstacker = unstacker_func(np.empty((0, 0))) - - dummy_arr = np.arange(len(dummy_unstacker.index)) + def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): + dummy_arr = np.arange(n_rows) + dummy_unstacker = functools.partial(unstacker_func, fill_value=-1) + unstacker = dummy_unstacker(dummy_arr) - unstacker = unstacker_func(dummy_arr) - new_items = unstacker.get_new_columns() - new_placement = new_columns.get_indexer(new_items) - new_values, mask = unstacker.get_new_values() - mask = mask.any(0) - - new_values = [ - self.values.take(indices, allow_fill=True, - fill_value=fill_value) - for indices in new_values.T - ] + new_placement, new_values, mask = self._get_unstack_items( + unstacker, new_columns + ) blocks = [ - self.make_block_same_class(vals, [place]) - for vals, place in zip(new_values, new_placement) + self.make_block_same_class( + self.values.take(indices, allow_fill=True, + fill_value=fill_value), + [place]) + for indices, place in zip(new_values.T, new_placement) ] return blocks, mask diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index fc3a12a9da82a..0519c5e5abe33 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1405,18 +1405,21 @@ def canonicalize(block): return all(block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks)) - def unstack(self, unstacker_func): + def unstack(self, unstacker_func, fill_value): """Return a blockmanager with all blocks unstacked. Parameters ---------- unstacker_func : callable A (partially-applied) ``pd.core.reshape._Unstacker`` class. + fill_value : Any + fill_value for newly introduced missing values. Returns ------- unstacked : BlockManager """ + n_rows = self.shape[-1] dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items) new_columns = dummy.get_new_columns() new_index = dummy.get_new_index() @@ -1427,7 +1430,10 @@ def unstack(self, unstacker_func): blocks, mask = blk._unstack( partial(unstacker_func, value_columns=self.items[blk.mgr_locs.indexer]), - new_columns) + new_columns, + n_rows, + fill_value + ) new_blocks.extend(blocks) columns_mask.extend(mask) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index aa85be4bdbc02..9f2e0e783d7d6 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -409,17 +409,11 @@ def unstack(obj, level, fill_value=None): def _unstack_frame(obj, level, fill_value=None): - from pandas.core.reshape.concat import concat - - if (obj._is_homogeneous_type and - is_extension_array_dtype(obj.dtypes.iloc[0])): - frames = [ser.unstack(level=level, fill_value=fill_value) - for name, ser in obj.iteritems()] - return concat(frames, axis=1, keys=obj.columns) if obj._is_mixed_type: unstacker = partial(_Unstacker, index=obj.index, level=level, fill_value=fill_value) - blocks = obj._data.unstack(unstacker) + blocks = obj._data.unstack(unstacker, + fill_value=fill_value) return obj._constructor(blocks) else: unstacker = _Unstacker(obj.values, obj.index, level=level, From a75806ade3dccc29139b9d35ecd026da061a0746 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 26 Oct 2018 07:39:19 -0500 Subject: [PATCH 15/23] docs, comments --- pandas/core/internals/blocks.py | 34 ++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 19e832ef63c99..ba8ad8b8a817d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -635,7 +635,7 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, if klass is None: if is_sparse(self.values): - # Series[Sparse].astype(object) is sparse. + # special case sparse, Series[Sparse].astype(object) is sparse klass = ExtensionBlock elif is_object_dtype(dtype): klass = ObjectBlock @@ -1776,8 +1776,30 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): for vals, place in zip(new_values, new_placement)] return blocks, mask - @staticmethod - def _get_unstack_items(unstacker, new_columns): + def _get_unstack_items(self, unstacker, new_columns): + """ + Get the placement, values, and mask for a Block unstack. + + This is shared between ObjectBlock and ExtensionBlock. They + differ in that ObjectBlock passes the values, while ExtensionBlock + passes the dummy ndarray of positions to be used by a take + later. + + Parameters + ---------- + unstacker : pandas.core.reshape.reshape._Unstacker + new_columns : Index + All columns of the unstacked BlockManager. + + Returns + ------- + new_placement : ndarray[int] + The placement of the new columns in `new_columns`. + new_values : Union[ndarray, ExtensionArray] + The first return value from _Unstacker.get_new_values. + mask : ndarray[bool] + The second return value from _Unstacker.get_new_values. + """ # shared with ExtensionBlock new_items = unstacker.get_new_columns() new_placement = new_columns.get_indexer(new_items) @@ -1974,6 +1996,12 @@ def _ftype(self): return getattr(self.values, '_pandas_ftype', Block._ftype) def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): + # ExtensionArray-safe unstack. + # We override ObjectBlock._unstack, which unstacks directly on the + # values of the array. For EA-backed blocks, this would require + # converting to a 2-D ndarray of objects. + # Instead, we unstack an ndarray of integer positions, followed by + # a `take` on the actual values. dummy_arr = np.arange(n_rows) dummy_unstacker = functools.partial(unstacker_func, fill_value=-1) unstacker = dummy_unstacker(dummy_arr) From 8ed7c73d27707f97ecfc44bba154f2059027f9d8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 26 Oct 2018 07:42:15 -0500 Subject: [PATCH 16/23] unxfail test --- pandas/tests/frame/test_reshape.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 23489098614cd..54511df4effad 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -277,8 +277,6 @@ def test_unstack_fill_frame_timedelta(self): index=['x', 'y', 'z']) assert_frame_equal(result, expected) - @pytest.mark.xfail(reason="GH-23077", - strict=True) def test_unstack_fill_frame_period(self): # Test unstacking with period From b23234c35af4bb41a74e24f9e867a9cac856e41f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 26 Oct 2018 07:54:50 -0500 Subject: [PATCH 17/23] added benchmark --- asv_bench/benchmarks/reshape.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index bda486dba3b0f..1d7dc58aca5ed 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -49,21 +49,28 @@ def time_unstack(self): class Unstack(object): - def setup(self): + params = ['int', 'category'] + + def setup(self, dtype): m = 100 n = 1000 levels = np.arange(m) index = MultiIndex.from_product([levels] * 2) columns = np.arange(n) - values = np.arange(m * m * n).reshape(m * m, n) + if dtype == 'int': + values = np.arange(m * m * n).reshape(m * m, n) + else: + indices = np.random.randint(0, 52, size=(m * m, n)) + values = np.take(list(string.ascii_letters), indices) + self.df = DataFrame(values, index, columns) self.df2 = self.df.iloc[:-1] - def time_full_product(self): + def time_full_product(self, dtype): self.df.unstack() - def time_without_last_row(self): + def time_without_last_row(self, dtype): self.df2.unstack() From 19b7cfa90c61352968ffea6fe58e4ba27168f5ed Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Oct 2018 15:40:43 -0500 Subject: [PATCH 18/23] fix asv --- asv_bench/benchmarks/reshape.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 1d7dc58aca5ed..a337968419afb 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -63,6 +63,7 @@ def setup(self, dtype): else: indices = np.random.randint(0, 52, size=(m * m, n)) values = np.take(list(string.ascii_letters), indices) + values = [pd.Categorical(v) for v in values.T] self.df = DataFrame(values, index, columns) self.df2 = self.df.iloc[:-1] From 2d78d42c7ab7ce8c5ef02b6e4cff2388b8f159a8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Nov 2018 11:52:13 -0600 Subject: [PATCH 19/23] CLN: remove dead code --- pandas/core/reshape/reshape.py | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 933621b58be44..2dca7cf0e6aa3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -12,12 +12,12 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( ensure_platform_int, is_bool_dtype, is_extension_array_dtype, is_list_like, - is_object_dtype, is_sparse, needs_i8_conversion) + is_object_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import notna from pandas import compat import pandas.core.algorithms as algos -from pandas.core.arrays import Categorical, SparseArray +from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import _factorize_from_iterable from pandas.core.frame import DataFrame from pandas.core.index import Index, MultiIndex @@ -82,28 +82,15 @@ class _Unstacker(object): def __init__(self, values, index, level=-1, value_columns=None, fill_value=None, constructor=None): - self.is_categorical = None - self.is_sparse = is_sparse(values) if values.ndim == 1: - if isinstance(values, Categorical): - self.is_categorical = values - values = np.array(values) - elif self.is_sparse: - # XXX: Makes SparseArray *dense*, but it's supposedly - # a single column at a time, so it's "doable" - values = values.values values = values[:, np.newaxis] self.values = values self.value_columns = value_columns self.fill_value = fill_value if constructor is None: - if self.is_sparse: - self.constructor = SparseDataFrame - else: - self.constructor = DataFrame - else: - self.constructor = constructor + constructor = DataFrame + self.constructor = constructor if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError('must pass column labels for multi-column data') @@ -174,14 +161,6 @@ def get_result(self): columns = self.get_new_columns() index = self.get_new_index() - # may need to coerce categoricals here - if self.is_categorical is not None: - categories = self.is_categorical.categories - ordered = self.is_categorical.ordered - values = [Categorical(values[:, i], categories=categories, - ordered=ordered) - for i in range(values.shape[-1])] - return self.constructor(values, index=index, columns=columns) def get_new_values(self): From a9e6263ecedf572eff7e3db90abd387e69b9fa67 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Nov 2018 12:50:23 -0600 Subject: [PATCH 20/23] faster asv --- asv_bench/benchmarks/reshape.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index a337968419afb..67fdfb82e72c0 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -61,6 +61,10 @@ def setup(self, dtype): if dtype == 'int': values = np.arange(m * m * n).reshape(m * m, n) else: + # the category branch is ~20x slower than int. So we + # cut down the size a bit. Now it's only ~3x slower. + n = 50 + columns = columns[:n] indices = np.random.randint(0, 52, size=(m * m, n)) values = np.take(list(string.ascii_letters), indices) values = [pd.Categorical(v) for v in values.T] From 967c674a3ca65cf901a77fd910f33e9a1737850c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Nov 2018 07:53:05 -0600 Subject: [PATCH 21/23] API: decimal nan is na --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/_libs/missing.pyx | 5 ++++ pandas/tests/dtypes/test_missing.py | 38 +++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index f449ca532ae74..c8c5db61160dd 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1227,6 +1227,7 @@ Missing - Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) - :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) - :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) +- :meth:`isna` now considers ``decimal.Decimal('NaN')`` a missing value (:issue:`23284`). MultiIndex diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index b8791359241ad..4fa96f652adaf 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import cython +import decimal from cython import Py_ssize_t import numpy as np @@ -33,6 +34,8 @@ cdef inline bint _check_all_nulls(object val): res = get_datetime64_value(val) == NPY_NAT elif util.is_timedelta64_object(val): res = get_timedelta64_value(val) == NPY_NAT + elif isinstance(val, decimal.Decimal): + return val.is_nan() else: res = 0 return res @@ -71,6 +74,8 @@ cpdef bint checknull(object val): return get_timedelta64_value(val) == NPY_NAT elif util.is_array(val): return False + elif isinstance(val, decimal.Decimal): + return val.is_nan() else: return val is None or util.is_nan(val) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 8f82db69a9213..0fa7388931ec2 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import decimal import pytest from warnings import catch_warnings, simplefilter import numpy as np @@ -248,6 +249,43 @@ def test_period(self): tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp) + def test_decimal(self): + # scalars + a = decimal.Decimal(1.0) + assert pd.isna(a) is False + assert pd.notna(a) is True + + b = decimal.Decimal('NaN') + assert pd.isna(b) is True + assert pd.notna(b) is False + + # array + arr = np.array([a, b]) + expected = np.array([False, True]) + result = pd.isna(arr) + tm.assert_numpy_array_equal(result, expected) + + result = pd.notna(arr) + tm.assert_numpy_array_equal(result, ~expected) + + # series + ser = pd.Series(arr) + expected = pd.Series(expected) + result = pd.isna(ser) + tm.assert_series_equal(result, expected) + + result = pd.notna(ser) + tm.assert_series_equal(result, ~expected) + + # index + idx = pd.Index(arr) + expected = np.array([False, True]) + result = pd.isna(idx) + tm.assert_numpy_array_equal(result, expected) + + result = pd.notna(idx) + tm.assert_numpy_array_equal(result, ~expected) + def test_array_equivalent(): assert array_equivalent(np.array([np.nan, np.nan]), From 32bc3deac394a2c6fb9d5d792980eb73550ee51d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Nov 2018 09:31:23 -0600 Subject: [PATCH 22/23] Revert "API: decimal nan is na" This reverts commit 967c674a3ca65cf901a77fd910f33e9a1737850c. --- doc/source/whatsnew/v0.24.0.txt | 1 - pandas/_libs/missing.pyx | 5 ---- pandas/tests/dtypes/test_missing.py | 38 ----------------------------- 3 files changed, 44 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c40403509dd69..f6b619defc435 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1229,7 +1229,6 @@ Missing - Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) - :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) - :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) -- :meth:`isna` now considers ``decimal.Decimal('NaN')`` a missing value (:issue:`23284`). MultiIndex diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 4fa96f652adaf..b8791359241ad 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- import cython -import decimal from cython import Py_ssize_t import numpy as np @@ -34,8 +33,6 @@ cdef inline bint _check_all_nulls(object val): res = get_datetime64_value(val) == NPY_NAT elif util.is_timedelta64_object(val): res = get_timedelta64_value(val) == NPY_NAT - elif isinstance(val, decimal.Decimal): - return val.is_nan() else: res = 0 return res @@ -74,8 +71,6 @@ cpdef bint checknull(object val): return get_timedelta64_value(val) == NPY_NAT elif util.is_array(val): return False - elif isinstance(val, decimal.Decimal): - return val.is_nan() else: return val is None or util.is_nan(val) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 0fa7388931ec2..8f82db69a9213 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -import decimal import pytest from warnings import catch_warnings, simplefilter import numpy as np @@ -249,43 +248,6 @@ def test_period(self): tm.assert_series_equal(isna(s), exp) tm.assert_series_equal(notna(s), ~exp) - def test_decimal(self): - # scalars - a = decimal.Decimal(1.0) - assert pd.isna(a) is False - assert pd.notna(a) is True - - b = decimal.Decimal('NaN') - assert pd.isna(b) is True - assert pd.notna(b) is False - - # array - arr = np.array([a, b]) - expected = np.array([False, True]) - result = pd.isna(arr) - tm.assert_numpy_array_equal(result, expected) - - result = pd.notna(arr) - tm.assert_numpy_array_equal(result, ~expected) - - # series - ser = pd.Series(arr) - expected = pd.Series(expected) - result = pd.isna(ser) - tm.assert_series_equal(result, expected) - - result = pd.notna(ser) - tm.assert_series_equal(result, ~expected) - - # index - idx = pd.Index(arr) - expected = np.array([False, True]) - result = pd.isna(idx) - tm.assert_numpy_array_equal(result, expected) - - result = pd.notna(idx) - tm.assert_numpy_array_equal(result, ~expected) - def test_array_equivalent(): assert array_equivalent(np.array([np.nan, np.nan]), From 56e5f2fc31669e6708ffb15340da91d6e9e696e3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Nov 2018 11:10:28 -0600 Subject: [PATCH 23/23] Fixed sparse test --- pandas/tests/sparse/test_pivot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index e7eba63e4e0b3..0e71048f51177 100644 --- a/pandas/tests/sparse/test_pivot.py +++ b/pandas/tests/sparse/test_pivot.py @@ -47,4 +47,5 @@ def test_pivot_table_multi(self): values=['D', 'E']) res_dense = pd.pivot_table(self.dense, index='A', columns='B', values=['D', 'E']) + res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]")) tm.assert_frame_equal(res_sparse, res_dense)