ENH: Make maybe_convert_object respect dtype itemsize #40908


Merged · 14 commits · Apr 21, 2021
Changes from 3 commits
67 changes: 40 additions & 27 deletions pandas/_libs/lib.pyx
@@ -77,6 +77,7 @@ from pandas._libs.util cimport (
INT64_MAX,
INT64_MIN,
UINT64_MAX,
get_itemsize,
is_nan,
)

@@ -2187,7 +2188,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,

Parameters
----------
values : ndarray[object]
objects : ndarray[object]
Array of object elements to convert.
try_float : bool, default False
If an array-like object contains only float or NaN values is
@@ -2211,7 +2212,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
Array of converted object values to more specific dtypes if applicable.
"""
cdef:
Py_ssize_t i, n
Py_ssize_t i, n, itemsize_max = 0
ndarray[float64_t] floats
ndarray[complex128_t] complexes
ndarray[int64_t] ints
@@ -2244,6 +2245,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,

for i in range(n):
val = objects[i]
if itemsize_max != -1:
itemsize = get_itemsize(val)
if itemsize > itemsize_max or itemsize == -1:
itemsize_max = itemsize

if val is None:
seen.null_ = True
@@ -2345,92 +2350,100 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
seen.object_ = True

if not seen.object_:
result = None
if not safe:
if seen.null_ or seen.nan_:
if seen.is_float_or_complex:
if seen.complex_:
return complexes
result = complexes
elif seen.float_:
return floats
result = floats
elif seen.int_:
if convert_to_nullable_integer:
from pandas.core.arrays import IntegerArray
return IntegerArray(ints, mask)
result = IntegerArray(ints, mask)
itemsize_max = -1
else:
return floats
result = floats
elif seen.nan_:
return floats
result = floats
else:
if not seen.bool_:
if seen.datetime_:
if not seen.numeric_ and not seen.timedelta_:
return datetimes
result = datetimes
elif seen.timedelta_:
if not seen.numeric_:
return timedeltas
result = timedeltas
elif seen.nat_:
if not seen.numeric_:
if convert_datetime and convert_timedelta:
# TODO: array full of NaT ambiguity resolve here needed
pass
elif convert_datetime:
return datetimes
result = datetimes
elif convert_timedelta:
return timedeltas
result = timedeltas
else:
if seen.complex_:
return complexes
result = complexes
elif seen.float_:
return floats
result = floats
elif seen.int_:
if seen.uint_:
return uints
result = uints
else:
return ints
result = ints
elif seen.is_bool:
return bools.view(np.bool_)
result = bools.view(np.bool_)

else:
# don't cast int to float, etc.
if seen.null_:
if seen.is_float_or_complex:
if seen.complex_:
if not seen.int_:
return complexes
result = complexes
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
result = floats
else:
if not seen.bool_:
if seen.datetime_:
if not seen.numeric_ and not seen.timedelta_:
return datetimes
result = datetimes
elif seen.timedelta_:
if not seen.numeric_:
return timedeltas
result = timedeltas
elif seen.nat_:
if not seen.numeric_:
if convert_datetime and convert_timedelta:
# TODO: array full of NaT ambiguity resolve here needed
pass
elif convert_datetime:
return datetimes
result = datetimes
elif convert_timedelta:
return timedeltas
result = timedeltas
else:
if seen.complex_:
if not seen.int_:
return complexes
result = complexes
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
result = floats
elif seen.int_:
if seen.uint_:
return uints
result = uints
else:
return ints
result = ints
elif seen.is_bool and not seen.nan_:
return bools.view(np.bool_)
result = bools.view(np.bool_)
if result is not None:
Review comment (Member): should this be tightened to something like result is floats or result is uints or result is ints? i.e. exclude datetimes/timedeltas/bools

if itemsize_max > 0:
curr_itemsize = cnp.PyArray_ITEMSIZE(result)
Review comment (Member): I'd just use result.dtype.itemsize and not bother with the C API.

if itemsize_max != curr_itemsize:
result = result.astype(result.dtype.kind + str(itemsize_max))
return result

return objects
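
A rough pure-Python sketch of the idea above (an illustration, not code from the PR), with NumPy's own inference standing in for the real Cython type inference: while scanning the object array, remember the widest NumPy-scalar itemsize seen, latch -1 as soon as a non-NumPy scalar appears, and afterwards cast the inferred numeric result to that width instead of keeping the 64-bit default. The name convert_objects_sketch is made up for the example.

import numpy as np

def _get_itemsize(val):
    # itemsize of a NumPy scalar, -1 for anything else (mirrors get_itemsize)
    return val.dtype.itemsize if isinstance(val, np.generic) else -1

def convert_objects_sketch(objects):
    itemsize_max = 0
    for val in objects:
        if itemsize_max != -1:
            itemsize = _get_itemsize(val)
            if itemsize > itemsize_max or itemsize == -1:
                itemsize_max = itemsize
    # stand-in for the real inference: NumPy picks a 64-bit numeric dtype
    result = np.array(objects.tolist())
    if result.dtype.kind in "iufc" and itemsize_max > 0:
        if itemsize_max != result.dtype.itemsize:
            result = result.astype(result.dtype.kind + str(itemsize_max))
    return result

arr = np.array([np.int32(1), np.int32(2)], dtype=object)
print(convert_objects_sketch(arr).dtype)  # int32 rather than int64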

23 changes: 23 additions & 0 deletions pandas/_libs/tslibs/util.pxd
@@ -1,5 +1,9 @@

cimport numpy as cnp
from cpython.object cimport PyTypeObject
from numpy cimport PyArray_DescrFromScalar

cnp.import_array()


cdef extern from *:
@@ -44,6 +48,7 @@ cdef extern from "numpy/ndarrayobject.h":

bint PyArray_IsIntegerScalar(obj) nogil
bint PyArray_Check(obj) nogil
bint PyArray_CheckScalar(obj) nogil

cdef extern from "numpy/npy_common.h":
int64_t NPY_MIN_INT64
@@ -195,6 +200,24 @@ cdef inline bint is_nan(object val):
return is_complex_object(val) and val != val


cdef inline int64_t get_itemsize(object val):
Review comment (Member): if this is only used in lib.pyx, I think it's better to put it there.

"""
Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.

Parameters
----------
val : object

Returns
-------
itemsize : int64_t
"""
if PyArray_CheckScalar(val):
return PyArray_DescrFromScalar(val).itemsize
else:
return -1


cdef inline const char* get_c_string_buf_and_size(str py_string,
Py_ssize_t *length) except NULL:
"""
2 changes: 1 addition & 1 deletion pandas/tests/frame/constructors/test_from_records.py
@@ -117,7 +117,7 @@ def test_from_records_sequencelike(self):
result = DataFrame.from_records(tuples, exclude=exclude)
result.columns = [columns[i] for i in sorted(columns_to_test)]
tm.assert_series_equal(result["C"], df["C"])
tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))
tm.assert_series_equal(result["E1"], df["E1"])

def test_from_records_sequencelike_empty(self):
# empty case
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_constructors.py
@@ -1929,7 +1929,7 @@ def test_constructor_for_list_with_dtypes(self):

df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)])
result = df.dtypes
expected = Series([np.dtype("int64")] * 5)
expected = Series([np.dtype("int32")] * 5)
tm.assert_series_equal(result, expected)

# overflow issue? (we always expected int64 upcasting here)
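
The user-visible behavior this updated expectation captures (an illustration, not code from the PR): rows supplied as int32 arrays are no longer silently upcast to int64 during object-dtype inference.

import numpy as np
import pandas as pd

df = pd.DataFrame([np.array(np.arange(5), dtype="int32") for _ in range(5)])
print(df.dtypes.unique())  # [dtype('int32')] with this change; int64 before
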
5 changes: 1 addition & 4 deletions pandas/tests/groupby/test_groupby.py
@@ -99,10 +99,7 @@ def max_value(group):

applied = df.groupby("A").apply(max_value)
result = applied.dtypes
expected = Series(
[np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
index=["A", "B", "C", "D", "value"],
)
expected = df.dtypes
tm.assert_series_equal(result, expected)


2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_coercion.py
@@ -641,7 +641,7 @@ def test_where_series_complex128(self, fill_val, exp_dtype):
values = klass([True, False, True, True])
else:
values = klass(x * fill_val for x in [5, 6, 7, 8])
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]])
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]], dtype=exp_dtype)
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

@pytest.mark.parametrize(