From d07b23834381f4d23ff317c39ee83852c1eefbec Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Jun 2018 13:43:19 -0500 Subject: [PATCH 1/5] REGR: NA-values in ctors with string dtype ```python In [1]: import pandas as pd In [2]: pd.Series([1, 2, None], dtype='str')[2] # None ``` Closes #21083 --- doc/source/whatsnew/v0.23.1.txt | 8 ++++++ pandas/core/series.py | 15 ++++++++++- pandas/tests/frame/test_constructors.py | 8 ++++++ pandas/tests/series/test_constructors.py | 33 +++++++++++++++++++----- 4 files changed, 57 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index e29cb0a5a2626..6c5238a56be08 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -10,6 +10,14 @@ and bug fixes. We recommend that all users upgrade to this version. :local: :backlinks: none +.. _whatsnew_0231.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`) + + .. _whatsnew_0231.enhancements: New features diff --git a/pandas/core/series.py b/pandas/core/series.py index 8bd48c629ffef..02a4cbfce9929 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4054,7 +4054,20 @@ def _try_cast(arr, take_fast_path): isinstance(subarr, np.ndarray))): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_type(subarr): - subarr = np.array(subarr, dtype=dtype, copy=copy) + subarr2 = np.array(subarr, dtype=dtype, copy=copy) + + if dtype and dtype.kind in ("U", "S"): + # GH-21083 + # We can't just return np.array(subarr, dtype='str') since + # NumPy will convert the non-string objects into strings + # Including NA values. Se we have to go + # string -> object -> update NA, which requires an + # additional pass over the data. 
+ na_values = isna(subarr) + subarr2 = subarr2.astype(object) + subarr2[na_values] = np.asarray(subarr)[na_values] + + subarr = subarr2 except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 300e1acdea911..3a813ec6032fc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -151,6 +151,14 @@ def test_constructor_complex_dtypes(self): assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype + def test_constructor_dtype_str_na_values(self): + # https://github.com/pandas-dev/pandas/issues/21083 + df = DataFrame({'A': ['x', None]}, dtype=str) + result = df.isna() + expected = DataFrame({"A": [False, True]}) + tm.assert_frame_equal(result, expected) + assert df.iloc[1, 0] is None + def test_constructor_rec(self): rec = self.frame.to_records(index=False) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 7e59325c32ddc..7a09e2abecb77 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -29,6 +29,17 @@ from .common import TestData +@pytest.fixture(params=[str, 'str', 'U']) +def string_dtype(request): + """Parametrized fixture for string dtypes. 
+ + * str + * 'str' + * 'U' + """ + return request.param + + class TestSeriesConstructors(TestData): def test_invalid_dtype(self): @@ -137,6 +148,14 @@ def test_constructor_no_data_index_order(self): result = pd.Series(index=['b', 'a', 'c']) assert result.index.tolist() == ['b', 'a', 'c'] + def test_constructor_dtype_str_na_values(self): + # https://github.com/pandas-dev/pandas/issues/21083 + ser = Series(['x', None], dtype=str) + result = ser.isna() + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + assert ser.iloc[1] is None + def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c'] index2 = sorted(index1) @@ -164,22 +183,24 @@ def test_constructor_list_like(self): @pytest.mark.parametrize('input_vals', [ ([1, 2]), - ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) - def test_constructor_list_str(self, input_vals): + def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements from a list are converted to strings # when dtype is str, 'str', or 'U' + result = Series(input_vals, dtype=string_dtype) + expected = Series(input_vals).astype(string_dtype) + assert_series_equal(result, expected) - for dtype in ['str', str, 'U']: - result = Series(input_vals, dtype=dtype) - expected = Series(input_vals).astype(dtype) - assert_series_equal(result, expected) + def test_constructor_list_str_na(self, string_dtype): + result = Series([1.0, 2.0, np.nan], dtype=string_dtype) + expected = Series(['1.0', '2.0', None], dtype=object) + assert_series_equal(result, expected) def test_constructor_generator(self): gen = (i for i in range(10)) From d2585e321f515205faf4a55f6080f4b12c2da0f0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Jun 2018 15:49:09 -0500 Subject: [PATCH 2/5] Compat for old numpy when bool(dtype) was False --- 
pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 02a4cbfce9929..1506ae4db9d21 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4056,7 +4056,7 @@ def _try_cast(arr, take_fast_path): elif not is_extension_type(subarr): subarr2 = np.array(subarr, dtype=dtype, copy=copy) - if dtype and dtype.kind in ("U", "S"): + if dtype is not None and dtype.kind in ("U", "S"): # GH-21083 # We can't just return np.array(subarr, dtype='str') since # NumPy will convert the non-string objects into strings From bcc993c2b87aeb63bc79ba72c0eb67bff139b0f8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Jun 2018 15:49:26 -0500 Subject: [PATCH 3/5] Additional tests --- pandas/conftest.py | 11 +++++++++++ pandas/tests/frame/test_dtypes.py | 16 ++++++++++------ pandas/tests/series/test_constructors.py | 11 ----------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index a463f573c82e0..d5f399c7cd63d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -159,3 +159,14 @@ def tz_aware_fixture(request): Fixture for trying explicit timezones: {0} """ return request.param + + +@pytest.fixture(params=[str, 'str', 'U']) +def string_dtype(request): + """Parametrized fixture for string dtypes. 
+ + * str + * 'str' + * 'U' + """ + return request.param diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 4c9f8c2ea0980..1eeeec0be3b8b 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self): @pytest.mark.parametrize('input_vals', [ ([1, 2]), - ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) - def test_constructor_list_str(self, input_vals): + def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements are converted to strings when # dtype is str, 'str', or 'U' - for dtype in ['str', str, 'U']: - result = DataFrame({'A': input_vals}, dtype=dtype) - expected = DataFrame({'A': input_vals}).astype({'A': dtype}) - assert_frame_equal(result, expected) + result = DataFrame({'A': input_vals}, dtype=string_dtype) + expected = DataFrame({'A': input_vals}).astype({'A': string_dtype}) + assert_frame_equal(result, expected) + + def test_constructor_list_str_na(self, string_dtype): + + result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) + expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object) + assert_frame_equal(result, expected) class TestDataFrameDatetimeWithTZ(TestData): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 7a09e2abecb77..1b77c6c5b2b4d 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -29,17 +29,6 @@ from .common import TestData -@pytest.fixture(params=[str, 'str', 'U']) -def string_dtype(request): - """Parametrized fixture for string dtypes. 
- - * str - * 'str' - * 'U' - """ - return request.param - - class TestSeriesConstructors(TestData): def test_invalid_dtype(self): From 0d7c85331029b61784a38f1b8deb926dffcf0833 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Jun 2018 16:39:06 -0500 Subject: [PATCH 4/5] Additional test fixups --- pandas/core/series.py | 3 ++- pandas/tests/frame/test_constructors.py | 7 +++++-- pandas/tests/series/test_analytics.py | 2 +- pandas/tests/series/test_constructors.py | 10 +++++++--- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1506ae4db9d21..d0be0d28560b8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4065,7 +4065,8 @@ def _try_cast(arr, take_fast_path): # additional pass over the data. na_values = isna(subarr) subarr2 = subarr2.astype(object) - subarr2[na_values] = np.asarray(subarr)[na_values] + subarr2[na_values] = np.asarray(subarr, + dtype=object)[na_values] subarr = subarr2 except (ValueError, TypeError): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3a813ec6032fc..a369c2567f621 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -151,14 +151,17 @@ def test_constructor_complex_dtypes(self): assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype - def test_constructor_dtype_str_na_values(self): + def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 - df = DataFrame({'A': ['x', None]}, dtype=str) + df = DataFrame({'A': ['x', None]}, dtype=, string_dtype) result = df.isna() expected = DataFrame({"A": [False, True]}) tm.assert_frame_equal(result, expected) assert df.iloc[1, 0] is None + df = DataFrame({'A': ['x', np.nan]}, dtype=, string_dtype) + assert np.isnan(df.iloc[1, 0]) + def test_constructor_rec(self): rec = self.frame.to_records(index=False) diff --git a/pandas/tests/series/test_analytics.py 
b/pandas/tests/series/test_analytics.py index 14ae1ef42865a..aba472f2ce8f9 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1829,7 +1829,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - s = Series(data, dtype=str) + s = Series(data, dtype=object).astype(str) result = s.mode(dropna) expected3 = Series(expected3, dtype=str) tm.assert_series_equal(result, expected3) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1b77c6c5b2b4d..906d2aacd5586 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -137,14 +137,17 @@ def test_constructor_no_data_index_order(self): result = pd.Series(index=['b', 'a', 'c']) assert result.index.tolist() == ['b', 'a', 'c'] - def test_constructor_dtype_str_na_values(self): + def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 - ser = Series(['x', None], dtype=str) + ser = Series(['x', None], dtype=string_dtype) result = ser.isna() expected = Series([False, True]) tm.assert_series_equal(result, expected) assert ser.iloc[1] is None + ser = Series(['x', np.nan], dtype=string_dtype) + assert np.isnan(ser.iloc[1]) + def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c'] index2 = sorted(index1) @@ -188,8 +191,9 @@ def test_constructor_list_str(self, input_vals, string_dtype): def test_constructor_list_str_na(self, string_dtype): result = Series([1.0, 2.0, np.nan], dtype=string_dtype) - expected = Series(['1.0', '2.0', None], dtype=object) + expected = Series(['1.0', '2.0', np.nan], dtype=object) assert_series_equal(result, expected) + assert np.isnan(result[2]) def test_constructor_generator(self): gen = (i for i in range(10)) From 3d81d5d094b638ae07aaf76264a84f0996b5bde9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 8 Jun 2018 08:06:17 
-0500 Subject: [PATCH 5/5] Refactored --- pandas/core/dtypes/cast.py | 42 +++++++++++++++++++++++++ pandas/core/series.py | 18 ++--------- pandas/tests/dtypes/test_cast.py | 13 ++++++++ pandas/tests/frame/test_constructors.py | 4 +-- 4 files changed, 60 insertions(+), 17 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e4ed6d544d42e..ebc7a13234a98 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1227,3 +1227,45 @@ def construct_1d_object_array_from_listlike(values): result = np.empty(len(values), dtype='object') result[:] = values return result + + +def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False): + """ + Construct a new ndarray, coercing `values` to `dtype`, preserving NA. + + Parameters + ---------- + values : Sequence + dtype : numpy.dtype, optional + copy : bool, default False + Note that copies may still be made with ``copy=False`` if casting + is required. + + Returns + ------- + arr : ndarray[dtype] + + Examples + -------- + >>> np.array([1.0, 2.0, None], dtype='str') + array(['1.0', '2.0', 'None'], dtype='<U4') + + >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype='str') + + + """ + subarr = np.array(values, dtype=dtype, copy=copy) + + if dtype is not None and dtype.kind in ("U", "S"): + # GH-21083 + # We can't just return np.array(subarr, dtype='str') since + # NumPy will convert the non-string objects into strings + # Including NA values. So we have to go + # string -> object -> update NA, which requires an + # additional pass over the data. 
+ na_values = isna(values) + subarr2 = subarr.astype(object) + subarr2[na_values] = np.asarray(values, dtype=object)[na_values] + subarr = subarr2 + + return subarr diff --git a/pandas/core/series.py b/pandas/core/series.py index d0be0d28560b8..3aa86a1a96f57 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -40,6 +40,7 @@ maybe_convert_platform, maybe_cast_to_datetime, maybe_castable, construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, construct_1d_object_array_from_listlike) from pandas.core.dtypes.missing import ( isna, @@ -4054,21 +4055,8 @@ def _try_cast(arr, take_fast_path): isinstance(subarr, np.ndarray))): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_type(subarr): - subarr2 = np.array(subarr, dtype=dtype, copy=copy) - - if dtype is not None and dtype.kind in ("U", "S"): - # GH-21083 - # We can't just return np.array(subarr, dtype='str') since - # NumPy will convert the non-string objects into strings - # Including NA values. Se we have to go - # string -> object -> update NA, which requires an - # additional pass over the data. 
- na_values = isna(subarr) - subarr2 = subarr2.astype(object) - subarr2[na_values] = np.asarray(subarr, - dtype=object)[na_values] - - subarr = subarr2 + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, + copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 20cd8b43478d2..4a19682e2c558 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -23,6 +23,7 @@ maybe_convert_scalar, find_common_type, construct_1d_object_array_from_listlike, + construct_1d_ndarray_preserving_na, construct_1d_arraylike_from_scalar) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -440,3 +441,15 @@ def test_cast_1d_arraylike_from_scalar_categorical(self): tm.assert_categorical_equal(result, expected, check_category_order=True, check_dtype=True) + + +@pytest.mark.parametrize('values, dtype, expected', [ + ([1, 2, 3], None, np.array([1, 2, 3])), + (np.array([1, 2, 3]), None, np.array([1, 2, 3])), + (['1', '2', None], None, np.array(['1', '2', None])), + (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])), + ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])), +]) +def test_construct_1d_ndarray_preserving_na(values, dtype, expected): + result = construct_1d_ndarray_preserving_na(values, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a369c2567f621..e7fb765128738 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -153,13 +153,13 @@ def test_constructor_complex_dtypes(self): def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 - df = DataFrame({'A': ['x', None]}, dtype=, string_dtype) + df = DataFrame({'A': ['x', None]}, dtype=string_dtype) 
result = df.isna() expected = DataFrame({"A": [False, True]}) tm.assert_frame_equal(result, expected) assert df.iloc[1, 0] is None - df = DataFrame({'A': ['x', np.nan]}, dtype=, string_dtype) + df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype) assert np.isnan(df.iloc[1, 0]) def test_constructor_rec(self):