diff --git a/doc/source/release.rst b/doc/source/release.rst index 848bd1035fadc..85d9be1295e29 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -573,6 +573,8 @@ Bug Fixes - Fix bound checking for Timestamp() with datetime64 input (:issue:`4065`) - Fix a bug where ``TestReadHtml`` wasn't calling the correct ``read_html()`` function (:issue:`5150`). + - Fix a bug with ``NDFrame.replace()`` which made replacement appear as + though it was (incorrectly) using regular expressions (:issue:`5143`). pandas 0.12.0 ------------- diff --git a/pandas/core/common.py b/pandas/core/common.py index 108b82eaf9056..33df305a721a6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -7,13 +7,8 @@ import numbers import codecs import csv -import sys import types -from datetime import timedelta - -from distutils.version import LooseVersion - from numpy.lib.format import read_array, write_array import numpy as np @@ -21,9 +16,7 @@ import pandas.lib as lib import pandas.tslib as tslib from pandas import compat -from pandas.compat import (StringIO, BytesIO, range, long, u, zip, map, - string_types) -from datetime import timedelta +from pandas.compat import StringIO, BytesIO, range, long, u, zip, map from pandas.core.config import get_option from pandas.core import array as pa @@ -36,6 +29,7 @@ class PandasError(Exception): class AmbiguousIndexError(PandasError, KeyError): pass + _POSSIBLY_CAST_DTYPES = set([np.dtype(t) for t in ['M8[ns]', 'm8[ns]', 'O', 'int8', 'uint8', 'int16', 'uint16', 'int32', @@ -101,6 +95,7 @@ class to receive bound method else: setattr(cls, name, func) + def isnull(obj): """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) @@ -772,6 +767,7 @@ def diff(arr, n, axis=0): return out_arr + def _coerce_to_dtypes(result, dtypes): """ given a dtypes and a result set, coerce the result elements to the dtypes """ if len(result) != len(dtypes): @@ -800,6 +796,7 @@ def conv(r,dtype): return np.array([ conv(r,dtype) for r, dtype in zip(result,dtypes) ]) + def _infer_dtype_from_scalar(val): """ interpret the dtype from a scalar, upcast floats and ints return the new value and the dtype """ @@ -986,6 +983,7 @@ def changeit(): return result, False + def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): """ provide explicty type promotion and coercion @@ -1166,6 +1164,7 @@ def pad_1d(values, limit=None, mask=None): _method(values, mask, limit=limit) return values + def backfill_1d(values, limit=None, mask=None): dtype = values.dtype.name @@ -1190,6 +1189,7 @@ def backfill_1d(values, limit=None, mask=None): _method(values, mask, limit=limit) return values + def pad_2d(values, limit=None, mask=None): dtype = values.dtype.name @@ -1218,6 +1218,7 @@ def pad_2d(values, limit=None, mask=None): pass return values + def backfill_2d(values, limit=None, mask=None): dtype = values.dtype.name @@ -1246,6 +1247,7 @@ def backfill_2d(values, limit=None, mask=None): pass return values + def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None): """ perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result """ @@ -1371,6 +1373,7 @@ def _possibly_convert_platform(values): return values + def _possibly_cast_to_datetime(value, dtype, coerce=False): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ @@ -1787,6 +1790,7 @@ def is_datetime64_dtype(arr_or_dtype): tipo = arr_or_dtype.dtype.type return issubclass(tipo, np.datetime64) + def is_datetime64_ns_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): tipo = arr_or_dtype @@ -1796,6 +1800,7 @@ def is_datetime64_ns_dtype(arr_or_dtype): tipo = arr_or_dtype.dtype return tipo == _NS_DTYPE + def is_timedelta64_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): tipo = arr_or_dtype.type @@ -1851,6 +1856,7 @@ def _is_sequence(x): except (TypeError, AttributeError): return False + _ensure_float64 = algos.ensure_float64 _ensure_float32 = algos.ensure_float32 _ensure_int64 = algos.ensure_int64 @@ -1987,6 +1993,7 @@ def _get_handle(path, mode, encoding=None, compression=None): return f + if compat.PY3: # pragma: no cover def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): # ignore encoding diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1bbaeffff77bc..daaf9d9966635 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12,7 +12,6 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0212,W0231,W0703,W0622 -import operator import sys import collections import warnings @@ -25,7 +24,7 @@ from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, _is_sequence, _infer_dtype_from_scalar, _values_from_object, - _coerce_to_dtypes, _DATELIKE_DTYPES, is_list_like) + _DATELIKE_DTYPES, is_list_like) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_maybe_droplevels, @@ -48,7 +47,6 @@ from pandas.tseries.index import DatetimeIndex import pandas.core.algorithms as algos -import pandas.core.datetools as datetools import pandas.core.common as com import pandas.core.format as fmt import pandas.core.nanops as nanops @@ -4292,6 +4290,7 @@ def combineMult(self, other): """ return self.mul(other, fill_value=1.) + DataFrame._setup_axes( ['index', 'columns'], info_axis=1, stat_axis=0, axes_are_reversed=True) DataFrame._add_numeric_operations() @@ -4552,6 +4551,7 @@ def _masked_rec_array_to_mgr(data, index, columns, dtype, copy): mgr = mgr.copy() return mgr + def _reorder_arrays(arrays, arr_columns, columns): # reorder according to the columns if columns is not None and len(columns) and arr_columns is not None and len(arr_columns): @@ -4562,6 +4562,7 @@ def _reorder_arrays(arrays, arr_columns, columns): arrays = [arrays[i] for i in indexer] return arrays, arr_columns + def _list_to_arrays(data, columns, coerce_float=False, dtype=None): if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d8a03cef16c9e..bb47709532523 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -18,9 +18,7 @@ from pandas import compat, _np_version_under1p7 from pandas.compat import map, zip, lrange, string_types, isidentifier from pandas.core.common import (isnull, notnull, is_list_like, - _values_from_object, - _infer_dtype_from_scalar, _maybe_promote, - ABCSeries) + _values_from_object, _maybe_promote, ABCSeries) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution @@ -36,6 +34,7 @@ def is_dictlike(x): return isinstance(x, (dict, com.ABCSeries)) + def _single_replace(self, to_replace, method, inplace, limit): orig_dtype = self.dtype result = self if inplace else self.copy() @@ -1844,7 +1843,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, self._consolidate_inplace() if value is None: - if isinstance(to_replace, list): + if isinstance(to_replace, (tuple, list)): return _single_replace(self, to_replace, method, inplace, limit) @@ -1856,7 +1855,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, to_replace = regex regex = True - items = to_replace.items() + items = list(compat.iteritems(to_replace)) keys, values = zip(*items) are_mappings = [is_dictlike(v) for v in values] @@ -1899,7 +1898,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, regex=regex) # {'A': NA} -> 0 - elif not isinstance(value, (list, np.ndarray)): + elif not com.is_list_like(value): new_data = self._data for k, src in compat.iteritems(to_replace): if k in self: @@ -1911,9 +1910,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, raise TypeError('Fill value must be scalar, dict, or ' 'Series') - elif isinstance(to_replace, (list, np.ndarray)): - # [NA, ''] -> [0, 'missing'] - if isinstance(value, (list, np.ndarray)): + elif com.is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] + if com.is_list_like(value): if len(to_replace) != len(value): raise ValueError('Replacement lists must match ' 'in length. Expecting %d got %d ' % @@ -1928,11 +1926,13 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, inplace=inplace, regex=regex) elif to_replace is None: if not (com.is_re_compilable(regex) or - isinstance(regex, (list, np.ndarray)) or is_dictlike(regex)): + com.is_list_like(regex) or + is_dictlike(regex)): raise TypeError("'regex' must be a string or a compiled " "regular expression or a list or dict of " "strings or regular expressions, you " - "passed a {0}".format(type(regex))) + "passed a" + " {0!r}".format(type(regex).__name__)) return self.replace(regex, value, inplace=inplace, limit=limit, regex=True) else: @@ -1948,12 +1948,13 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, inplace=inplace, regex=regex) - elif not isinstance(value, (list, np.ndarray)): # NA -> 0 + elif not com.is_list_like(value): # NA -> 0 new_data = self._data.replace(to_replace, value, inplace=inplace, regex=regex) else: - raise TypeError('Invalid "to_replace" type: ' - '{0}'.format(type(to_replace))) # pragma: no cover + msg = ('Invalid "to_replace" type: ' + '{0!r}').format(type(to_replace).__name__) + raise TypeError(msg) # pragma: no cover new_data = new_data.convert(copy=not inplace, convert_numeric=False) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9abcdd8ea4780..070745d73b307 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -992,6 +992,7 @@ class NumericBlock(Block): is_numeric = True _can_hold_na = True + class FloatBlock(NumericBlock): is_float = True _downcast_dtype = 'int64' @@ -1064,6 +1065,7 @@ def _try_cast(self, element): def should_store(self, value): return com.is_integer_dtype(value) and value.dtype == self.dtype + class TimeDeltaBlock(IntBlock): is_timedelta = True _can_hold_na = True @@ -1130,6 +1132,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs): for val in values.ravel()[imask]], dtype=object) return rvalues.tolist() + class BoolBlock(NumericBlock): is_bool = True _can_hold_na = False @@ -1677,6 +1680,7 @@ def split_block_at(self, item): def _try_cast_result(self, result, dtype=None): return result + def make_block(values, items, ref_items, klass=None, ndim=None, dtype=None, fastpath=False, placement=None): if klass is None: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 6f8031538e520..2e386a7e2816a 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -17,10 +17,10 @@ map, zip, range, long, lrange, lmap, lzip, OrderedDict, cPickle as pickle, u, StringIO ) -from pandas import compat, _np_version_under1p7 +from pandas import compat from numpy import random, nan -from numpy.random import randn, rand +from numpy.random import randn import numpy as np import numpy.ma as ma from numpy.testing import assert_array_equal @@ -47,9 +47,6 @@ ensure_clean) from pandas.core.indexing import IndexingError from pandas.core.common import PandasError -from pandas.compat import OrderedDict -from pandas.computation.expr import Expr -import pandas.computation as comp import pandas.util.testing as tm import pandas.lib as lib @@ -2367,7 +2364,6 @@ def test_insert_error_msmgs(self): with assertRaisesRegexp(TypeError, msg): df['gr'] = df.groupby(['b', 'c']).count() - def test_constructor_subclass_dict(self): # Test for passing dict subclass to constructor data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)), @@ -2498,7 +2494,6 @@ def test_constructor_ndarray(self): frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A']) self.assertEqual(len(frame), 2) - def test_constructor_maskedarray(self): self._check_basic_constructor(ma.masked_all) @@ -3052,7 +3047,6 @@ def test_constructor_column_duplicates(self): [('a', [8]), ('a', [5]), ('b', [6])], columns=['b', 'a', 'a']) - def test_column_dups_operations(self): def check(result, expected=None): @@ -6845,7 +6839,7 @@ def test_replace_inplace(self): self.tsframe['A'][-5:] = nan tsframe = self.tsframe.copy() - res = tsframe.replace(nan, 0, inplace=True) + tsframe.replace(nan, 0, inplace=True) assert_frame_equal(tsframe, self.tsframe.fillna(0)) self.assertRaises(TypeError, self.tsframe.replace, nan, inplace=True) @@ -7618,6 +7612,46 @@ def test_replace_input_formats(self): def test_replace_limit(self): pass + def test_replace_dict_no_regex(self): + answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3: + 'Disagree', 4: 'Strongly Disagree'}) + weights = {'Agree': 4, 'Disagree': 2, 'Neutral': 3, 'Strongly Agree': + 5, 'Strongly Disagree': 1} + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + def test_replace_series_no_regex(self): + answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3: + 'Disagree', 4: 'Strongly Disagree'}) + weights = Series({'Agree': 4, 'Disagree': 2, 'Neutral': 3, + 'Strongly Agree': 5, 'Strongly Disagree': 1}) + expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) + result = answer.replace(weights) + tm.assert_series_equal(result, expected) + + def test_replace_dict_tuple_list_ordering_remains_the_same(self): + df = DataFrame(dict(A=[nan, 1])) + res1 = df.replace(to_replace={nan: 0, 1: -1e8}) + res2 = df.replace(to_replace=(1, nan), value=[-1e8, 0]) + res3 = df.replace(to_replace=[1, nan], value=[-1e8, 0]) + + expected = DataFrame({'A': [0, -1e8]}) + tm.assert_frame_equal(res1, res2) + tm.assert_frame_equal(res2, res3) + tm.assert_frame_equal(res3, expected) + + def test_replace_doesnt_replace_with_no_regex(self): + from pandas.compat import StringIO + raw = """fol T_opp T_Dir T_Enh + 0 1 0 0 vo + 1 2 vr 0 0 + 2 2 0 0 0 + 3 3 0 bt 0""" + df = read_csv(StringIO(raw), sep=r'\s+') + res = df.replace({'\D': 1}) + tm.assert_frame_equal(df, res) + def test_combine_multiple_frames_dtypes(self): from pandas import concat @@ -8713,7 +8747,6 @@ def test_apply_ignore_failures(self): expected = self.mixed_frame._get_numeric_data().apply(np.mean) assert_series_equal(result, expected) - def test_apply_mixed_dtype_corner(self): df = DataFrame({'A': ['foo'], 'B': [1.]})