From ffc59b04d2328e3bf7059bec615f99e8dcfbf8a4 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 11 Apr 2016 05:03:59 +0900 Subject: [PATCH] PERF: Improve replace perf --- asv_bench/benchmarks/replace.py | 24 ++++++++++++++++++++++++ doc/source/whatsnew/v0.19.2.txt | 1 + pandas/core/generic.py | 23 +++++++++++++++-------- pandas/core/internals.py | 17 ++++++++++++----- 4 files changed, 52 insertions(+), 13 deletions(-) diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 869ddd8d6fa49..66b8af53801ac 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -32,6 +32,30 @@ def time_replace_large_dict(self): self.s.replace(self.to_rep, inplace=True) +class replace_convert(object): + goal_time = 0.5 + + def setup(self): + self.n = (10 ** 3) + self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n))) + self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n))) + self.s = Series(np.random.randint(self.n, size=(10 ** 3))) + self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)), + 'B': np.random.randint(self.n, size=(10 ** 3))}) + + def time_replace_series_timestamp(self): + self.s.replace(self.to_ts) + + def time_replace_series_timedelta(self): + self.s.replace(self.to_td) + + def time_replace_frame_timestamp(self): + self.df.replace(self.to_ts) + + def time_replace_frame_timedelta(self): + self.df.replace(self.to_td) + + class replace_replacena(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 6ee6271929008..cafbdb731f494 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -21,6 +21,7 @@ Highlights include: Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Improved performance of ``.replace()`` (:issue:`12745`) .. 
_whatsnew_0192.bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fbc6333dd6fdd..27ca817c19a63 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3477,20 +3477,27 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, res = self if inplace else self.copy() for c, src in compat.iteritems(to_replace): if c in value and c in self: + # object conversion is handled in + # series.replace which is called recursively res[c] = res[c].replace(to_replace=src, value=value[c], - inplace=False, regex=regex) + inplace=False, + regex=regex) return None if inplace else res # {'A': NA} -> 0 elif not is_list_like(value): - for k, src in compat.iteritems(to_replace): - if k in self: - new_data = new_data.replace(to_replace=src, - value=value, - filter=[k], - inplace=inplace, - regex=regex) + keys = [(k, src) for k, src in compat.iteritems(to_replace) + if k in self] + keys_len = len(keys) - 1 + for i, (k, src) in enumerate(keys): + convert = i == keys_len + new_data = new_data.replace(to_replace=src, + value=value, + filter=[k], + inplace=inplace, + regex=regex, + convert=convert) else: raise TypeError('value argument must be scalar, dict, or ' 'Series') diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 43beefffd448e..120a9cbcd1a75 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -622,7 +622,6 @@ def replace(self, to_replace, value, inplace=False, filter=None, original_to_replace = to_replace mask = isnull(self.values) - # try to replace, if we raise an error, convert to ObjectBlock and # retry try: @@ -1795,13 +1794,14 @@ def should_store(self, value): return issubclass(value.dtype.type, np.bool_) def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, mgr=None): + regex=False, convert=True, mgr=None): to_replace_values = np.atleast_1d(to_replace) if not np.can_cast(to_replace_values, bool): return self return super(BoolBlock, 
self).replace(to_replace, value, inplace=inplace, filter=filter, - regex=regex, mgr=mgr) + regex=regex, convert=convert, + mgr=mgr) class ObjectBlock(Block): @@ -3214,6 +3214,7 @@ def comp(s): masks = [comp(s) for i, s in enumerate(src_list)] result_blocks = [] + src_len = len(src_list) - 1 for blk in self.blocks: # its possible to get multiple result blocks here @@ -3223,8 +3224,9 @@ def comp(s): new_rb = [] for b in rb: if b.dtype == np.object_: + convert = i == src_len result = b.replace(s, d, inplace=inplace, regex=regex, - mgr=mgr) + mgr=mgr, convert=convert) new_rb = _extend_blocks(result, new_rb) else: # get our mask for this element, sized to this @@ -4788,7 +4790,12 @@ def _putmask_smart(v, m, n): # change the dtype dtype, _ = _maybe_promote(n.dtype) - nv = v.astype(dtype) + + if is_extension_type(v.dtype) and is_object_dtype(dtype): + nv = v.get_values(dtype) + else: + nv = v.astype(dtype) + try: nv[m] = n[m] except ValueError: