diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 73ec9c47bc473..8a2f647792f47 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -596,6 +596,31 @@ You can also operate on the DataFrame in place df.replace(1.5, nan, inplace=True) +.. warning:: + + When replacing multiple ``bool`` or ``datetime64`` objects, the first + argument to ``replace`` (``to_replace``) must match the type of the value + being replaced. For example, + + .. code-block:: python + + s = Series([True, False, True]) + s.replace({'a string': 'new value', True: False}) + + will raise a ``TypeError`` because one of the ``dict`` keys is not of the + correct type for replacement. + + However, when replacing a *single* object such as, + + .. code-block:: python + + s = Series([True, False, True]) + s.replace('a string', 'another string') + + the original ``NDFrame`` object will be returned untouched. We're working on + unifying this API, but for backwards compatibility reasons we cannot break + the latter behavior. See :issue:`6354` for more details. 
+ Missing data casting rules and indexing --------------------------------------- diff --git a/pandas/core/common.py b/pandas/core/common.py index e895c8ed0cf2d..d97ccda4011d0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -359,20 +359,18 @@ def mask_missing(arr, values_to_mask): if mask is None: mask = arr == x - # if x is a string and mask is not, then we get a scalar - # return value, which is not good - if not isinstance(mask, np.ndarray): - m = mask - mask = np.empty(arr.shape, dtype=np.bool) - mask.fill(m) + # if x is a string and arr is not, then we get False and we must + # expand the mask to size arr.shape + if np.isscalar(mask): + mask = np.zeros(arr.shape, dtype=bool) else: - mask = mask | (arr == x) + mask |= arr == x if na_mask.any(): if mask is None: mask = isnull(arr) else: - mask = mask | isnull(arr) + mask |= isnull(arr) return mask diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 2fd579e5d2059..b83e7df746c5a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1,5 +1,6 @@ import itertools import re +import operator from datetime import datetime, timedelta import copy from collections import defaultdict @@ -2453,7 +2454,8 @@ def replace_list(self, src_list, dest_list, inplace=False, regex=False): def comp(s): if isnull(s): return isnull(values) - return values == getattr(s, 'asm8', s) + return _possibly_compare(values, getattr(s, 'asm8', s), + operator.eq) masks = [comp(s) for i, s in enumerate(src_list)] result_blocks = [] @@ -4153,3 +4155,20 @@ def _possibly_convert_to_indexer(loc): elif isinstance(loc, slice): loc = lrange(loc.start, loc.stop) return loc + + +def _possibly_compare(a, b, op): + res = op(a, b) + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + if np.isscalar(res) and (is_a_array or is_b_array): + type_names = [type(a).__name__, type(b).__name__] + + if is_a_array: + type_names[0] = 'ndarray(dtype=%s)' % a.dtype + + if is_b_array: + 
type_names[1] = 'ndarray(dtype=%s)' % b.dtype + + raise TypeError("Cannot compare types %r and %r" % tuple(type_names)) + return res diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 9322e1ca86c6c..f21bb9da4c1f3 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -8005,9 +8005,8 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): df = DataFrame({0: [True, False], 1: [False, True]}) - result = df.replace({'asdf': 'asdb', True: 'yes'}) - expected = DataFrame({0: ['yes', False], 1: [False, 'yes']}) - tm.assert_frame_equal(expected, result) + with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'): + df.replace({'asdf': 'asdb', True: 'yes'}) def test_combine_multiple_frames_dtypes(self): from pandas import concat diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 38aae8ad2b905..06c8fdfef992c 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5264,7 +5264,11 @@ def test_replace(self): # malformed self.assertRaises(ValueError, ser.replace, [1, 2, 3], [np.nan, 0]) - self.assertRaises(TypeError, ser.replace, range(1, 3), [np.nan, 0]) + + # make sure that we aren't just masking a TypeError because bools don't + # implement indexing + with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'): + ser.replace([1, 2], [np.nan, 0]) ser = Series([0, 1, 2, 3, 4]) result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) @@ -5369,9 +5373,8 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): s = Series([True, False, True]) - result = s.replace({'asdf': 'asdb', True: 'yes'}) - expected = Series(['yes', False, 'yes']) - tm.assert_series_equal(expected, result) + with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'): + s.replace({'asdf': 'asdb', True: 'yes'}) def test_asfreq(self): ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30),