From 676a4e5cef1cf37704ef702699db1fd6c89028ea Mon Sep 17 00:00:00 2001 From: Carlos Souza Date: Mon, 20 Mar 2017 19:32:02 -0300 Subject: [PATCH 1/8] Test --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index a181412be2719..efd075dabcba9 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,6 @@ Release Notes ============= -The list of changes to pandas between each release can be found +The list of changes to Pandas between each release can be found [here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full details, see the commit logs at http://github.com/pandas-dev/pandas. From 080c71ef23407b073d95661196dc87e0f58b4a9d Mon Sep 17 00:00:00 2001 From: Carlos Souza Date: Sun, 26 Mar 2017 23:29:15 -0300 Subject: [PATCH 2/8] BUG: replace of numeric by string fixed --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/core/missing.py | 15 ++++++++------- pandas/tests/series/test_replace.py | 10 ++++++++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 38109d5442751..25804bae98278 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -985,3 +985,5 @@ Bug Fixes - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) + +- Bug in ``Series.replace`` which replaced a numeric by string (:issue:`15743`) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 3b9bfe1de48e7..53e0916bbad5c 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -21,11 +21,16 @@ def mask_missing(arr, values_to_mask): Return a masking array of same size/shape as arr with entries equaling any member of values_to_mask set to True """ - if not isinstance(values_to_mask, (list, np.ndarray)): + if isinstance(values_to_mask, np.ndarray): + mask_type = values_to_mask.dtype.type + elif isinstance(values_to_mask, list): + mask_type = type(values_to_mask[0]) + else: + mask_type = type(values_to_mask) values_to_mask = [values_to_mask] try: - values_to_mask = np.array(values_to_mask, dtype=arr.dtype) + values_to_mask = np.array(values_to_mask, dtype=mask_type) except Exception: values_to_mask = np.array(values_to_mask, dtype=object) @@ -409,7 +414,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, if axis != 0: # pragma: no cover raise AssertionError("cannot interpolate on a ndim == 1 with " "axis != 0") - values = values.reshape(tuple((1, ) + values.shape)) + values = values.reshape(tuple((1,) + values.shape)) if fill_value is None: mask = None @@ -447,7 +452,6 @@ def wrapper(arr, mask, limit=None): def pad_1d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None @@ -472,7 +476,6 @@ def pad_1d(values, limit=None, mask=None, dtype=None): def backfill_1d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None @@ -498,7 +501,6 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): def pad_2d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None @@ -528,7 +530,6 @@ def pad_2d(values, limit=None, mask=None, dtype=None): def backfill_2d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 0a53581e24ba5..08d3f64c8a5a6 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -10,7 +10,6 @@ class TestSeriesReplace(TestData, tm.TestCase): - def test_replace(self): N = 100 ser = pd.Series(np.random.randn(N)) @@ -101,7 +100,7 @@ def test_replace_gh5319(self): expected = ser.copy() expected.loc[2] = pd.Timestamp('20120101') result = ser.replace({pd.Timestamp('20130103'): - pd.Timestamp('20120101')}) + pd.Timestamp('20120101')}) tm.assert_series_equal(result, expected) result = ser.replace(pd.Timestamp('20130103'), pd.Timestamp('20120101')) @@ -227,3 +226,10 @@ def test_replace_with_empty_dictlike(self): s = pd.Series(list('abcd')) tm.assert_series_equal(s, s.replace(dict())) tm.assert_series_equal(s, s.replace(pd.Series([]))) + + def test_replace_string_with_nan(self): + # GH 15743 + s = pd.Series([1, 2, 3]) + result = s.replace('2', np.nan) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(expected, result) From e62763c154f336022eee08dbd0656e21fc7cd4e4 Mon Sep 17 00:00:00 2001 From: Carlos Souza Date: Sun, 26 Mar 2017 23:56:14 -0300 Subject: [PATCH 3/8] Fixing PEP8 line indent --- pandas/tests/series/test_replace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 08d3f64c8a5a6..07dff4f719531 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -100,7 +100,7 @@ def test_replace_gh5319(self): expected = ser.copy() expected.loc[2] = pd.Timestamp('20120101') result = ser.replace({pd.Timestamp('20130103'): - pd.Timestamp('20120101')}) + pd.Timestamp('20120101')}) tm.assert_series_equal(result, expected) result = ser.replace(pd.Timestamp('20130103'), pd.Timestamp('20120101')) From 97e1f1830d99c411e6a9da5394410dcd8b98490c Mon Sep 17 00:00:00 2001 From: Carlos Souza Date: Mon, 20 Mar 2017 19:32:02 -0300 Subject: [PATCH 4/8] Test --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index a181412be2719..efd075dabcba9 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,6 @@ Release Notes ============= -The list of changes to pandas between each release can be found +The list of changes to Pandas between each release can be found [here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full details, see the commit logs at http://github.com/pandas-dev/pandas. From 0a985575b42eb64d5dc104cb1cdfa6e21e57e9cf Mon Sep 17 00:00:00 2001 From: Carlos Souza Date: Sun, 26 Mar 2017 23:29:15 -0300 Subject: [PATCH 5/8] BUG: replace of numeric by string fixed --- doc/source/whatsnew/v0.20.0.txt | 2 ++ pandas/core/missing.py | 15 ++++++++------- pandas/tests/series/test_replace.py | 10 ++++++++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index fdf34e0d11572..8d9ac9d7672de 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -989,3 +989,5 @@ Bug Fixes - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) + +- Bug in ``Series.replace`` which replaced a numeric by string (:issue:`15743`) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 3b9bfe1de48e7..53e0916bbad5c 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -21,11 +21,16 @@ def mask_missing(arr, values_to_mask): Return a masking array of same size/shape as arr with entries equaling any member of values_to_mask set to True """ - if not isinstance(values_to_mask, (list, np.ndarray)): + if isinstance(values_to_mask, np.ndarray): + mask_type = values_to_mask.dtype.type + elif isinstance(values_to_mask, list): + mask_type = type(values_to_mask[0]) + else: + mask_type = type(values_to_mask) values_to_mask = [values_to_mask] try: - values_to_mask = np.array(values_to_mask, dtype=arr.dtype) + values_to_mask = np.array(values_to_mask, dtype=mask_type) except Exception: values_to_mask = np.array(values_to_mask, dtype=object) @@ -409,7 +414,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, if axis != 0: # pragma: no cover raise AssertionError("cannot interpolate on a ndim == 1 with " "axis != 0") - values = values.reshape(tuple((1, ) + values.shape)) + values = values.reshape(tuple((1,) + values.shape)) if fill_value is None: mask = None @@ -447,7 +452,6 @@ def wrapper(arr, mask, limit=None): def pad_1d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None @@ -472,7 +476,6 @@ def pad_1d(values, limit=None, mask=None, dtype=None): def backfill_1d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None @@ -498,7 +501,6 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): def pad_2d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None @@ -528,7 +530,6 @@ def pad_2d(values, limit=None, mask=None, dtype=None): def backfill_2d(values, limit=None, mask=None, dtype=None): - if dtype is None: dtype = values.dtype _method = None diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 0a53581e24ba5..08d3f64c8a5a6 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -10,7 +10,6 @@ class TestSeriesReplace(TestData, tm.TestCase): - def test_replace(self): N = 100 ser = pd.Series(np.random.randn(N)) @@ -101,7 +100,7 @@ def test_replace_gh5319(self): expected = ser.copy() expected.loc[2] = pd.Timestamp('20120101') result = ser.replace({pd.Timestamp('20130103'): - pd.Timestamp('20120101')}) + pd.Timestamp('20120101')}) tm.assert_series_equal(result, expected) result = ser.replace(pd.Timestamp('20130103'), pd.Timestamp('20120101')) @@ -227,3 +226,10 @@ def test_replace_with_empty_dictlike(self): s = pd.Series(list('abcd')) tm.assert_series_equal(s, s.replace(dict())) tm.assert_series_equal(s, s.replace(pd.Series([]))) + + def test_replace_string_with_nan(self): + # GH 15743 + s = pd.Series([1, 2, 3]) + result = s.replace('2', np.nan) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(expected, result) From 45e67e43d4683775627f11c84785d2af1aa2e081 Mon Sep 17 00:00:00 2001 From: Carlos Souza Date: Sun, 26 Mar 2017 23:56:14 -0300 Subject: [PATCH 6/8] Fixing PEP8 line indent --- pandas/tests/series/test_replace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 08d3f64c8a5a6..07dff4f719531 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -100,7 +100,7 @@ def test_replace_gh5319(self): expected = ser.copy() expected.loc[2] = pd.Timestamp('20120101') result = ser.replace({pd.Timestamp('20130103'): - pd.Timestamp('20120101')}) + pd.Timestamp('20120101')}) tm.assert_series_equal(result, expected) result = ser.replace(pd.Timestamp('20130103'), pd.Timestamp('20120101')) From 73805cef9e248ddc9ea758ade91bad1e7af430e2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 28 Mar 2017 10:01:30 -0400 Subject: [PATCH 7/8] CLN: add infer_dtype_from_array --- doc/source/whatsnew/v0.20.0.txt | 5 ++- pandas/core/missing.py | 25 ++++++++------- pandas/tests/frame/test_replace.py | 25 +++++++++------ pandas/tests/types/test_cast.py | 50 +++++++++++++++++++++--------- pandas/types/cast.py | 44 ++++++++++++++++++++++++++ 5 files changed, 109 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8d9ac9d7672de..7106a5cc81b2b 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -883,6 +883,8 @@ Bug Fixes - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`, :issue:`15765`) +- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) +- Bug in ``Series.replace`` which replaced a numeric by string (:issue:`15743`) - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`) @@ -985,9 +987,6 @@ Bug Fixes - Bug in ``DataFrame.hist`` where ``plt.tight_layout`` caused an ``AttributeError`` (use ``matplotlib >= 2.0.1``) (:issue:`9351`) - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) -- Bug in ``Series.replace`` and ``DataFrame.replace`` which failed on empty replacement dicts (:issue:`15289`) - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`) - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`) - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`) - -- Bug in ``Series.replace`` which replaced a numeric by string (:issue:`15743`) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 53e0916bbad5c..b40a843675253 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -9,10 +9,16 @@ from pandas.compat import range, string_types from pandas.types.common import (is_numeric_v_string_like, - is_float_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_integer_dtype, - _ensure_float64, is_scalar, - needs_i8_conversion, is_integer) + is_float_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_integer_dtype, + is_scalar, + is_integer, + needs_i8_conversion, + _ensure_float64) + +from pandas.types.cast import infer_dtype_from_array from pandas.types.missing import isnull @@ -21,16 +27,11 @@ def mask_missing(arr, values_to_mask): Return a masking array of same size/shape as arr with entries equaling any member of values_to_mask set to True """ - if isinstance(values_to_mask, np.ndarray): - mask_type = values_to_mask.dtype.type - elif isinstance(values_to_mask, list): - mask_type = type(values_to_mask[0]) - else: - mask_type = type(values_to_mask) - values_to_mask = [values_to_mask] + + dtype, values_to_mask = infer_dtype_from_array(values_to_mask) try: - values_to_mask = np.array(values_to_mask, dtype=mask_type) + values_to_mask = np.array(values_to_mask, dtype=dtype) except Exception: values_to_mask = np.array(values_to_mask, dtype=object) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 8b50036cd50f8..fce59e10bf4bd 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -795,7 +795,7 @@ def test_replace_dtypes(self): expected = DataFrame({'datetime64': Index([now] * 3)}) assert_frame_equal(result, expected) - def test_replace_input_formats(self): + def test_replace_input_formats_listlike(self): # both dicts to_rep = {'A': np.nan, 'B': 0, 'C': ''} values = {'A': 0, 'B': -1, 'C': 'missing'} @@ -812,15 +812,6 @@ def test_replace_input_formats(self): 'C': ['', 'asdf', 'fd']}) assert_frame_equal(result, expected) - # dict to scalar - filled = df.replace(to_rep, 0) - expected = {} - for k, v in compat.iteritems(df): - expected[k] = v.replace(to_rep[k], 0) - assert_frame_equal(filled, DataFrame(expected)) - - self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, '']) - # scalar to dict values = {'A': 0, 'B': -1, 'C': 'missing'} df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5], @@ -842,6 +833,20 @@ def test_replace_input_formats(self): self.assertRaises(ValueError, df.replace, to_rep, values[1:]) + def test_replace_input_formats_scalar(self): + df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5], + 'C': ['', 'asdf', 'fd']}) + + # dict to scalar + to_rep = {'A': np.nan, 'B': 0, 'C': ''} + filled = df.replace(to_rep, 0) + expected = {} + for k, v in compat.iteritems(df): + expected[k] = v.replace(to_rep[k], 0) + assert_frame_equal(filled, DataFrame(expected)) + + self.assertRaises(TypeError, df.replace, to_rep, [np.nan, 0, '']) + # list to scalar to_rep = [np.nan, 0, ''] result = df.replace(to_rep, -1) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py index dd4ea3bb02be9..de6ef7af9d7f9 100644 --- a/pandas/tests/types/test_cast.py +++ b/pandas/tests/types/test_cast.py @@ -5,13 +5,15 @@ """ -from datetime import datetime +import pytest +from datetime import datetime, timedelta, date import numpy as np from pandas import Timedelta, Timestamp, DatetimeIndex from pandas.types.cast import (maybe_downcast_to_dtype, maybe_convert_objects, infer_dtype_from_scalar, + infer_dtype_from_array, maybe_convert_string_to_object, maybe_convert_scalar, find_common_type) @@ -82,7 +84,7 @@ def test_datetime_with_timezone(self): tm.assert_index_equal(res, exp) -class TestInferDtype(tm.TestCase): +class TestInferDtype(object): def test_infer_dtype_from_scalar(self): # Test that _infer_dtype_from_scalar is returning correct dtype for int @@ -92,44 +94,62 @@ def test_infer_dtype_from_scalar(self): np.int32, np.uint64, np.int64]: data = dtypec(12) dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, type(data)) + assert dtype == type(data) data = 12 dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.int64) + assert dtype == np.int64 for dtypec in [np.float16, np.float32, np.float64]: data = dtypec(12) dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, dtypec) + assert dtype == dtypec data = np.float(12) dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.float64) + assert dtype == np.float64 for data in [True, False]: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.bool_) + assert dtype == np.bool_ for data in [np.complex64(1), np.complex128(1)]: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.complex_) + assert dtype == np.complex_ - import datetime for data in [np.datetime64(1, 'ns'), Timestamp(1), - datetime.datetime(2000, 1, 1, 0, 0)]: + datetime(2000, 1, 1, 0, 0)]: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'M8[ns]') + assert dtype == 'M8[ns]' for data in [np.timedelta64(1, 'ns'), Timedelta(1), - datetime.timedelta(1)]: + timedelta(1)]: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'm8[ns]') + assert dtype == 'm8[ns]' - for data in [datetime.date(2000, 1, 1), + for data in [date(2000, 1, 1), Timestamp(1, tz='US/Eastern'), 'foo']: dtype, val = infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.object_) + assert dtype == np.object_ + + @pytest.mark.parametrize( + "arr, expected", + [('foo', np.object_), + (b'foo', np.object_), + (1, np.int_), + (1.5, np.float_), + ([1], np.int_), + (np.array([1]), np.int_), + ([np.nan, 1, ''], np.object_), + (np.array([[1.0, 2.0]]), np.float_), + (Timestamp('20160101'), np.object_), + (np.datetime64('2016-01-01'), np.dtype('>> np.asarray([1, '1']) + array(['1', '1'], dtype='>> infer_dtype_from_array([1, '1']) + (numpy.object_, [1, '1']) + + """ + + if isinstance(arr, np.ndarray): + return arr.dtype, arr + + if not is_list_like(arr): + arr = [arr] + + # don't force numpy coerce with nan's + inferred = lib.infer_dtype(arr) + if inferred in ['string', 'bytes', 'unicode', + 'mixed', 'mixed-integer']: + return (np.object_, arr) + + arr = np.asarray(arr) + return arr.dtype, arr + + def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): """ provide explict type promotion and coercion From e6e4971dab3695506b5773b7c2855a224e1009b0 Mon Sep 17 00:00:00 2001 From: Carlos Souza Date: Tue, 28 Mar 2017 13:48:35 -0300 Subject: [PATCH 8/8] Adding replace unicode with number and replace mixed types with string tests --- pandas/tests/series/test_replace.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 07dff4f719531..5190eb110f4cf 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -227,9 +227,23 @@ def test_replace_with_empty_dictlike(self): tm.assert_series_equal(s, s.replace(dict())) tm.assert_series_equal(s, s.replace(pd.Series([]))) - def test_replace_string_with_nan(self): + def test_replace_string_with_number(self): # GH 15743 s = pd.Series([1, 2, 3]) result = s.replace('2', np.nan) expected = pd.Series([1, 2, 3]) tm.assert_series_equal(expected, result) + + def test_replace_unicode_with_number(self): + # GH 15743 + s = pd.Series([1, 2, 3]) + result = s.replace(u'2', np.nan) + expected = pd.Series([1, 2, 3]) + tm.assert_series_equal(expected, result) + + def test_replace_mixed_types_with_string(self): + # Testing mixed + s = pd.Series([1, 2, 3, '4', 4, 5]) + result = s.replace([2, '4'], np.nan) + expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) + tm.assert_series_equal(expected, result)