From d07b23834381f4d23ff317c39ee83852c1eefbec Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 7 Jun 2018 13:43:19 -0500
Subject: [PATCH 1/5] REGR: NA-values in ctors with string dtype

```python
In [1]: import pandas as pd
In [2]: pd.Series([1, 2, None], dtype='str')[2]  # None

```

Closes #21083
---
 doc/source/whatsnew/v0.23.1.txt          |  8 ++++++
 pandas/core/series.py                    | 15 ++++++++++-
 pandas/tests/frame/test_constructors.py  |  8 ++++++
 pandas/tests/series/test_constructors.py | 33 +++++++++++++++++++-----
 4 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index e29cb0a5a2626..6c5238a56be08 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -10,6 +10,14 @@ and bug fixes. We recommend that all users upgrade to this version.
     :local:
     :backlinks: none
 
+.. _whatsnew_0231.fixed_regressions:
+
+Fixed Regressions
+~~~~~~~~~~~~~~~~~
+
+- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`)
+
+
 .. _whatsnew_0231.enhancements:
 
 New features
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 8bd48c629ffef..02a4cbfce9929 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -4054,7 +4054,20 @@ def _try_cast(arr, take_fast_path):
                                            isinstance(subarr, np.ndarray))):
                 subarr = construct_1d_object_array_from_listlike(subarr)
             elif not is_extension_type(subarr):
-                subarr = np.array(subarr, dtype=dtype, copy=copy)
+                subarr2 = np.array(subarr, dtype=dtype, copy=copy)
+
+                if dtype and dtype.kind in ("U", "S"):
+                    # GH-21083
+                    # We can't just return np.array(subarr, dtype='str') since
+                    # NumPy will convert the non-string objects into strings
+                    # Including NA values. Se we have to go
+                    # string -> object -> update NA, which requires an
+                    # additional pass over the data.
+                    na_values = isna(subarr)
+                    subarr2 = subarr2.astype(object)
+                    subarr2[na_values] = np.asarray(subarr)[na_values]
+
+                subarr = subarr2
         except (ValueError, TypeError):
             if is_categorical_dtype(dtype):
                 # We *do* allow casting to categorical, since we know
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 300e1acdea911..3a813ec6032fc 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -151,6 +151,14 @@ def test_constructor_complex_dtypes(self):
         assert a.dtype == df.a.dtype
         assert b.dtype == df.b.dtype
 
+    def test_constructor_dtype_str_na_values(self):
+        # https://github.com/pandas-dev/pandas/issues/21083
+        df = DataFrame({'A': ['x', None]}, dtype=str)
+        result = df.isna()
+        expected = DataFrame({"A": [False, True]})
+        tm.assert_frame_equal(result, expected)
+        assert df.iloc[1, 0] is None
+
     def test_constructor_rec(self):
         rec = self.frame.to_records(index=False)
 
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 7e59325c32ddc..7a09e2abecb77 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -29,6 +29,17 @@
 from .common import TestData
 
 
+@pytest.fixture(params=[str, 'str', 'U'])
+def string_dtype(request):
+    """Parametrized fixture for string dtypes.
+
+    * str
+    * 'str'
+    * 'U'
+    """
+    return request.param
+
+
 class TestSeriesConstructors(TestData):
 
     def test_invalid_dtype(self):
@@ -137,6 +148,14 @@ def test_constructor_no_data_index_order(self):
         result = pd.Series(index=['b', 'a', 'c'])
         assert result.index.tolist() == ['b', 'a', 'c']
 
+    def test_constructor_dtype_str_na_values(self):
+        # https://github.com/pandas-dev/pandas/issues/21083
+        ser = Series(['x', None], dtype=str)
+        result = ser.isna()
+        expected = Series([False, True])
+        tm.assert_series_equal(result, expected)
+        assert ser.iloc[1] is None
+
     def test_constructor_series(self):
         index1 = ['d', 'b', 'a', 'c']
         index2 = sorted(index1)
@@ -164,22 +183,24 @@ def test_constructor_list_like(self):
 
     @pytest.mark.parametrize('input_vals', [
         ([1, 2]),
-        ([1.0, 2.0, np.nan]),
         (['1', '2']),
         (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
         (list(pd.date_range('1/1/2011', periods=2, freq='H',
                             tz='US/Eastern'))),
         ([pd.Interval(left=0, right=5)]),
     ])
-    def test_constructor_list_str(self, input_vals):
+    def test_constructor_list_str(self, input_vals, string_dtype):
         # GH 16605
         # Ensure that data elements from a list are converted to strings
         # when dtype is str, 'str', or 'U'
+        result = Series(input_vals, dtype=string_dtype)
+        expected = Series(input_vals).astype(string_dtype)
+        assert_series_equal(result, expected)
 
-        for dtype in ['str', str, 'U']:
-            result = Series(input_vals, dtype=dtype)
-            expected = Series(input_vals).astype(dtype)
-            assert_series_equal(result, expected)
+    def test_constructor_list_str_na(self, string_dtype):
+        result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
+        expected = Series(['1.0', '2.0', None], dtype=object)
+        assert_series_equal(result, expected)
 
     def test_constructor_generator(self):
         gen = (i for i in range(10))

From d2585e321f515205faf4a55f6080f4b12c2da0f0 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 7 Jun 2018 15:49:09 -0500
Subject: [PATCH 2/5] Compat for old numpy

when bool(dtype) was False
---
 pandas/core/series.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index 02a4cbfce9929..1506ae4db9d21 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -4056,7 +4056,7 @@ def _try_cast(arr, take_fast_path):
             elif not is_extension_type(subarr):
                 subarr2 = np.array(subarr, dtype=dtype, copy=copy)
 
-                if dtype and dtype.kind in ("U", "S"):
+                if dtype is not None and dtype.kind in ("U", "S"):
                     # GH-21083
                     # We can't just return np.array(subarr, dtype='str') since
                     # NumPy will convert the non-string objects into strings

From bcc993c2b87aeb63bc79ba72c0eb67bff139b0f8 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 7 Jun 2018 15:49:26 -0500
Subject: [PATCH 3/5] Additional tests

---
 pandas/conftest.py                       | 11 +++++++++++
 pandas/tests/frame/test_dtypes.py        | 16 ++++++++++------
 pandas/tests/series/test_constructors.py | 11 -----------
 3 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/pandas/conftest.py b/pandas/conftest.py
index a463f573c82e0..d5f399c7cd63d 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -159,3 +159,14 @@ def tz_aware_fixture(request):
     Fixture for trying explicit timezones: {0}
     """
     return request.param
+
+
+@pytest.fixture(params=[str, 'str', 'U'])
+def string_dtype(request):
+    """Parametrized fixture for string dtypes.
+
+    * str
+    * 'str'
+    * 'U'
+    """
+    return request.param
diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
index 4c9f8c2ea0980..1eeeec0be3b8b 100644
--- a/pandas/tests/frame/test_dtypes.py
+++ b/pandas/tests/frame/test_dtypes.py
@@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self):
 
     @pytest.mark.parametrize('input_vals', [
         ([1, 2]),
-        ([1.0, 2.0, np.nan]),
         (['1', '2']),
         (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
         (list(pd.date_range('1/1/2011', periods=2, freq='H',
                             tz='US/Eastern'))),
         ([pd.Interval(left=0, right=5)]),
     ])
-    def test_constructor_list_str(self, input_vals):
+    def test_constructor_list_str(self, input_vals, string_dtype):
         # GH 16605
         # Ensure that data elements are converted to strings when
         # dtype is str, 'str', or 'U'
 
-        for dtype in ['str', str, 'U']:
-            result = DataFrame({'A': input_vals}, dtype=dtype)
-            expected = DataFrame({'A': input_vals}).astype({'A': dtype})
-            assert_frame_equal(result, expected)
+        result = DataFrame({'A': input_vals}, dtype=string_dtype)
+        expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
+        assert_frame_equal(result, expected)
+
+    def test_constructor_list_str_na(self, string_dtype):
+
+        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
+        expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
+        assert_frame_equal(result, expected)
 
 
 class TestDataFrameDatetimeWithTZ(TestData):
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 7a09e2abecb77..1b77c6c5b2b4d 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -29,17 +29,6 @@
 from .common import TestData
 
 
-@pytest.fixture(params=[str, 'str', 'U'])
-def string_dtype(request):
-    """Parametrized fixture for string dtypes.
-
-    * str
-    * 'str'
-    * 'U'
-    """
-    return request.param
-
-
 class TestSeriesConstructors(TestData):
 
     def test_invalid_dtype(self):

From 0d7c85331029b61784a38f1b8deb926dffcf0833 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 7 Jun 2018 16:39:06 -0500
Subject: [PATCH 4/5] Additional test fixups

---
 pandas/core/series.py                    |  3 ++-
 pandas/tests/frame/test_constructors.py  |  7 +++++--
 pandas/tests/series/test_analytics.py    |  2 +-
 pandas/tests/series/test_constructors.py | 10 +++++++---
 4 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index 1506ae4db9d21..d0be0d28560b8 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -4065,7 +4065,8 @@ def _try_cast(arr, take_fast_path):
                     # additional pass over the data.
                     na_values = isna(subarr)
                     subarr2 = subarr2.astype(object)
-                    subarr2[na_values] = np.asarray(subarr)[na_values]
+                    subarr2[na_values] = np.asarray(subarr,
+                                                    dtype=object)[na_values]
 
                 subarr = subarr2
         except (ValueError, TypeError):
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 3a813ec6032fc..a369c2567f621 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -151,14 +151,17 @@ def test_constructor_complex_dtypes(self):
         assert a.dtype == df.a.dtype
         assert b.dtype == df.b.dtype
 
-    def test_constructor_dtype_str_na_values(self):
+    def test_constructor_dtype_str_na_values(self, string_dtype):
         # https://github.com/pandas-dev/pandas/issues/21083
-        df = DataFrame({'A': ['x', None]}, dtype=str)
+        df = DataFrame({'A': ['x', None]}, dtype=, string_dtype)
         result = df.isna()
         expected = DataFrame({"A": [False, True]})
         tm.assert_frame_equal(result, expected)
         assert df.iloc[1, 0] is None
 
+        df = DataFrame({'A': ['x', np.nan]}, dtype=, string_dtype)
+        assert np.isnan(df.iloc[1, 0])
+
     def test_constructor_rec(self):
         rec = self.frame.to_records(index=False)
 
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 14ae1ef42865a..aba472f2ce8f9 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1829,7 +1829,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
 
         data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]
 
-        s = Series(data, dtype=str)
+        s = Series(data, dtype=object).astype(str)
         result = s.mode(dropna)
         expected3 = Series(expected3, dtype=str)
         tm.assert_series_equal(result, expected3)
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 1b77c6c5b2b4d..906d2aacd5586 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -137,14 +137,17 @@ def test_constructor_no_data_index_order(self):
         result = pd.Series(index=['b', 'a', 'c'])
         assert result.index.tolist() == ['b', 'a', 'c']
 
-    def test_constructor_dtype_str_na_values(self):
+    def test_constructor_dtype_str_na_values(self, string_dtype):
         # https://github.com/pandas-dev/pandas/issues/21083
-        ser = Series(['x', None], dtype=str)
+        ser = Series(['x', None], dtype=string_dtype)
         result = ser.isna()
         expected = Series([False, True])
         tm.assert_series_equal(result, expected)
         assert ser.iloc[1] is None
 
+        ser = Series(['x', np.nan], dtype=string_dtype)
+        assert np.isnan(ser.iloc[1])
+
     def test_constructor_series(self):
         index1 = ['d', 'b', 'a', 'c']
         index2 = sorted(index1)
@@ -188,8 +191,9 @@ def test_constructor_list_str(self, input_vals, string_dtype):
 
     def test_constructor_list_str_na(self, string_dtype):
         result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
-        expected = Series(['1.0', '2.0', None], dtype=object)
+        expected = Series(['1.0', '2.0', np.nan], dtype=object)
         assert_series_equal(result, expected)
+        assert np.isnan(result[2])
 
     def test_constructor_generator(self):
         gen = (i for i in range(10))

From 3d81d5d094b638ae07aaf76264a84f0996b5bde9 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Fri, 8 Jun 2018 08:06:17 -0500
Subject: [PATCH 5/5] Refactored

---
 pandas/core/dtypes/cast.py              | 42 +++++++++++++++++++++++++
 pandas/core/series.py                   | 18 ++---------
 pandas/tests/dtypes/test_cast.py        | 13 ++++++++
 pandas/tests/frame/test_constructors.py |  4 +--
 4 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index e4ed6d544d42e..ebc7a13234a98 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1227,3 +1227,45 @@ def construct_1d_object_array_from_listlike(values):
     result = np.empty(len(values), dtype='object')
     result[:] = values
     return result
+
+
+def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
+    """
+    Construct a new ndarray, coercing `values` to `dtype`, preserving NA.
+
+    Parameters
+    ----------
+    values : Sequence
+    dtype : numpy.dtype, optional
+    copy : bool, default False
+        Note that copies may still be made with ``copy=False`` if casting
+        is required.
+
+    Returns
+    -------
+    arr : ndarray[dtype] (ndarray[object] when dtype is a string kind)
+
+    Examples
+    --------
+    >>> np.array([1.0, 2.0, None], dtype='str')
+    array(['1.0', '2.0', 'None'], dtype='<U4')
+
+    >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None],
+    ...                                    dtype=np.dtype('str'))
+    array(['1.0', '2.0', None], dtype=object)
+    """
+    subarr = np.array(values, dtype=dtype, copy=copy)
+
+    if dtype is not None and dtype.kind in ("U", "S"):
+        # GH-21083
+        # We can't just return np.array(subarr, dtype='str') since
+        # NumPy will convert the non-string objects into strings,
+        # including NA values. So we have to go
+        # string -> object -> update NA, which requires an
+        # additional pass over the data.
+        na_values = isna(values)
+        subarr2 = subarr.astype(object)
+        subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
+        subarr = subarr2
+
+    return subarr
diff --git a/pandas/core/series.py b/pandas/core/series.py
index d0be0d28560b8..3aa86a1a96f57 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -40,6 +40,7 @@
     maybe_convert_platform,
     maybe_cast_to_datetime, maybe_castable,
     construct_1d_arraylike_from_scalar,
+    construct_1d_ndarray_preserving_na,
     construct_1d_object_array_from_listlike)
 from pandas.core.dtypes.missing import (
     isna,
@@ -4054,21 +4055,8 @@ def _try_cast(arr, take_fast_path):
                                            isinstance(subarr, np.ndarray))):
                 subarr = construct_1d_object_array_from_listlike(subarr)
             elif not is_extension_type(subarr):
-                subarr2 = np.array(subarr, dtype=dtype, copy=copy)
-
-                if dtype is not None and dtype.kind in ("U", "S"):
-                    # GH-21083
-                    # We can't just return np.array(subarr, dtype='str') since
-                    # NumPy will convert the non-string objects into strings
-                    # Including NA values. Se we have to go
-                    # string -> object -> update NA, which requires an
-                    # additional pass over the data.
-                    na_values = isna(subarr)
-                    subarr2 = subarr2.astype(object)
-                    subarr2[na_values] = np.asarray(subarr,
-                                                    dtype=object)[na_values]
-
-                subarr = subarr2
+                subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
+                                                            copy=copy)
         except (ValueError, TypeError):
             if is_categorical_dtype(dtype):
                 # We *do* allow casting to categorical, since we know
diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
index 20cd8b43478d2..4a19682e2c558 100644
--- a/pandas/tests/dtypes/test_cast.py
+++ b/pandas/tests/dtypes/test_cast.py
@@ -23,6 +23,7 @@
     maybe_convert_scalar,
     find_common_type,
     construct_1d_object_array_from_listlike,
+    construct_1d_ndarray_preserving_na,
     construct_1d_arraylike_from_scalar)
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
@@ -440,3 +441,15 @@ def test_cast_1d_arraylike_from_scalar_categorical(self):
         tm.assert_categorical_equal(result, expected,
                                     check_category_order=True,
                                     check_dtype=True)
+
+
+@pytest.mark.parametrize('values, dtype, expected', [
+    ([1, 2, 3], None, np.array([1, 2, 3])),
+    (np.array([1, 2, 3]), None, np.array([1, 2, 3])),
+    (['1', '2', None], None, np.array(['1', '2', None])),
+    (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])),
+    ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])),
+])
+def test_construct_1d_ndarray_preserving_na(values, dtype, expected):
+    result = construct_1d_ndarray_preserving_na(values, dtype=dtype)
+    tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index a369c2567f621..e7fb765128738 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -153,13 +153,13 @@ def test_constructor_complex_dtypes(self):
 
     def test_constructor_dtype_str_na_values(self, string_dtype):
         # https://github.com/pandas-dev/pandas/issues/21083
-        df = DataFrame({'A': ['x', None]}, dtype=, string_dtype)
+        df = DataFrame({'A': ['x', None]}, dtype=string_dtype)
         result = df.isna()
         expected = DataFrame({"A": [False, True]})
         tm.assert_frame_equal(result, expected)
         assert df.iloc[1, 0] is None
 
-        df = DataFrame({'A': ['x', np.nan]}, dtype=, string_dtype)
+        df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype)
         assert np.isnan(df.iloc[1, 0])
 
     def test_constructor_rec(self):