Skip to content

BLD/TST: fix bool block failures when strings are passed to replace list #6354

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 15, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions doc/source/missing_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,31 @@ You can also operate on the DataFrame in place

df.replace(1.5, nan, inplace=True)

.. warning::

When replacing multiple ``bool`` or ``datetime64`` objects, the first
argument to ``replace`` (``to_replace``) must match the type of the value
being replaced. For example,

.. code-block:: python

s = Series([True, False, True])
s.replace({'a string': 'new value', True: False})

will raise a ``TypeError`` because one of the ``dict`` keys is not of the
correct type for replacement.

However, when replacing a *single* object such as,

.. code-block:: python

s = Series([True, False, True])
s.replace('a string', 'another string')

the original ``NDFrame`` object will be returned untouched. We're working on
unifying this API, but for backwards compatibility reasons we cannot break
the latter behavior. See :issue:`6354` for more details.

Missing data casting rules and indexing
---------------------------------------

Expand Down
14 changes: 6 additions & 8 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,20 +359,18 @@ def mask_missing(arr, values_to_mask):
if mask is None:
mask = arr == x

# if x is a string and mask is not, then we get a scalar
# return value, which is not good
if not isinstance(mask, np.ndarray):
m = mask
mask = np.empty(arr.shape, dtype=np.bool)
mask.fill(m)
# if x is a string and arr is not, then we get False and we must
# expand the mask to size arr.shape
if np.isscalar(mask):
mask = np.zeros(arr.shape, dtype=bool)
else:
mask = mask | (arr == x)
mask |= arr == x

if na_mask.any():
if mask is None:
mask = isnull(arr)
else:
mask = mask | isnull(arr)
mask |= isnull(arr)

return mask

Expand Down
21 changes: 20 additions & 1 deletion pandas/core/internals.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import itertools
import re
import operator
from datetime import datetime, timedelta
import copy
from collections import defaultdict
Expand Down Expand Up @@ -2453,7 +2454,8 @@ def replace_list(self, src_list, dest_list, inplace=False, regex=False):
def comp(s):
if isnull(s):
return isnull(values)
return values == getattr(s, 'asm8', s)
return _possibly_compare(values, getattr(s, 'asm8', s),
operator.eq)
masks = [comp(s) for i, s in enumerate(src_list)]

result_blocks = []
Expand Down Expand Up @@ -4153,3 +4155,20 @@ def _possibly_convert_to_indexer(loc):
elif isinstance(loc, slice):
loc = lrange(loc.start, loc.stop)
return loc


def _possibly_compare(a, b, op):
res = op(a, b)
is_a_array = isinstance(a, np.ndarray)
is_b_array = isinstance(b, np.ndarray)
if np.isscalar(res) and (is_a_array or is_b_array):
type_names = [type(a).__name__, type(b).__name__]

if is_a_array:
type_names[0] = 'ndarray(dtype=%s)' % a.dtype

if is_b_array:
type_names[1] = 'ndarray(dtype=%s)' % b.dtype

raise TypeError("Cannot compare types %r and %r" % tuple(type_names))
return res
5 changes: 2 additions & 3 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8005,9 +8005,8 @@ def test_replace_bool_with_bool(self):

def test_replace_with_dict_with_bool_keys(self):
df = DataFrame({0: [True, False], 1: [False, True]})
result = df.replace({'asdf': 'asdb', True: 'yes'})
expected = DataFrame({0: ['yes', False], 1: [False, 'yes']})
tm.assert_frame_equal(expected, result)
with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'):
df.replace({'asdf': 'asdb', True: 'yes'})

def test_combine_multiple_frames_dtypes(self):
from pandas import concat
Expand Down
11 changes: 7 additions & 4 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5264,7 +5264,11 @@ def test_replace(self):

# malformed
self.assertRaises(ValueError, ser.replace, [1, 2, 3], [np.nan, 0])
self.assertRaises(TypeError, ser.replace, range(1, 3), [np.nan, 0])

# make sure that we aren't just masking a TypeError because bools don't
# implement indexing
with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'):
ser.replace([1, 2], [np.nan, 0])

ser = Series([0, 1, 2, 3, 4])
result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])
Expand Down Expand Up @@ -5369,9 +5373,8 @@ def test_replace_bool_with_bool(self):

def test_replace_with_dict_with_bool_keys(self):
s = Series([True, False, True])
result = s.replace({'asdf': 'asdb', True: 'yes'})
expected = Series(['yes', False, 'yes'])
tm.assert_series_equal(expected, result)
with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'):
s.replace({'asdf': 'asdb', True: 'yes'})

def test_asfreq(self):
ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30),
Expand Down