From 314ed7194afd4faff4f06fda27f18d5c27a6b541 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 1 Apr 2021 20:20:24 -0400 Subject: [PATCH 01/12] ENH: Make maybe_convert_object respect dtype itemsize --- pandas/_libs/lib.pyx | 67 +++++++++++-------- pandas/_libs/tslibs/util.pxd | 23 +++++++ .../frame/constructors/test_from_records.py | 2 +- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/groupby/test_groupby.py | 5 +- pandas/tests/indexing/test_coercion.py | 2 +- 6 files changed, 67 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e816bd4cd4026..d587c1491b662 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -77,6 +77,7 @@ from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, UINT64_MAX, + get_itemsize, is_nan, ) @@ -2187,7 +2188,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, Parameters ---------- - values : ndarray[object] + objects : ndarray[object] Array of object elements to convert. try_float : bool, default False If an array-like object contains only float or NaN values is @@ -2211,7 +2212,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, Array of converted object values to more specific dtypes if applicable. """ cdef: - Py_ssize_t i, n + Py_ssize_t i, n, itemsize_max = 0 ndarray[float64_t] floats ndarray[complex128_t] complexes ndarray[int64_t] ints @@ -2244,6 +2245,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, for i in range(n): val = objects[i] + if itemsize_max != -1: + itemsize = get_itemsize(val) + if itemsize == -1 or itemsize > itemsize_max: + itemsize_max = itemsize if val is None: seen.null_ = True @@ -2345,50 +2350,52 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, seen.object_ = True if not seen.object_: + result = None if not safe: if seen.null_ or seen.nan_: if seen.is_float_or_complex: if seen.complex_: - return complexes + result = complexes elif seen.float_: - return floats + result = floats elif seen.int_: if convert_to_nullable_integer: from pandas.core.arrays import IntegerArray - return IntegerArray(ints, mask) + result = IntegerArray(ints, mask) + itemsize_max = -1 else: - return floats + result = floats elif seen.nan_: - return floats + result = floats else: if not seen.bool_: if seen.datetime_: if not seen.numeric_ and not seen.timedelta_: - return datetimes + result = datetimes elif seen.timedelta_: if not seen.numeric_: - return timedeltas + result = timedeltas elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: # TODO: array full of NaT ambiguity resolve here needed pass elif convert_datetime: - return datetimes + result = datetimes elif convert_timedelta: - return timedeltas + result = timedeltas else: if seen.complex_: - return complexes + result = complexes elif seen.float_: - return floats + result = floats elif seen.int_: if seen.uint_: - return uints + result = uints else: - return ints + result = ints elif seen.is_bool: - return bools.view(np.bool_) + result = bools.view(np.bool_) else: # don't cast int to float, etc. @@ -2396,41 +2403,47 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, if seen.is_float_or_complex: if seen.complex_: if not seen.int_: - return complexes + result = complexes elif seen.float_ or seen.nan_: if not seen.int_: - return floats + result = floats else: if not seen.bool_: if seen.datetime_: if not seen.numeric_ and not seen.timedelta_: - return datetimes + result = datetimes elif seen.timedelta_: if not seen.numeric_: - return timedeltas + result = timedeltas elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: # TODO: array full of NaT ambiguity resolve here needed pass elif convert_datetime: - return datetimes + result = datetimes elif convert_timedelta: - return timedeltas + result = timedeltas else: if seen.complex_: if not seen.int_: - return complexes + result = complexes elif seen.float_ or seen.nan_: if not seen.int_: - return floats + result = floats elif seen.int_: if seen.uint_: - return uints + result = uints else: - return ints + result = ints elif seen.is_bool and not seen.nan_: - return bools.view(np.bool_) + result = bools.view(np.bool_) + if result is not None: + if itemsize > 0: + curr_itemsize = cnp.PyArray_ITEMSIZE(result) + if itemsize != curr_itemsize: + result = result.astype(result.dtype.kind + str(itemsize)) + return result return objects diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 150516aadffc6..d6bf9811fe650 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -1,5 +1,9 @@ +cimport numpy as cnp from cpython.object cimport PyTypeObject +from numpy cimport PyArray_DescrFromScalar + +cnp.import_array() cdef extern from *: @@ -44,6 +48,7 @@ cdef extern from "numpy/ndarrayobject.h": bint PyArray_IsIntegerScalar(obj) nogil bint PyArray_Check(obj) nogil + bint PyArray_CheckScalar(obj) nogil cdef extern from "numpy/npy_common.h": int64_t NPY_MIN_INT64 @@ -195,6 +200,24 @@ cdef inline bint is_nan(object val): return is_complex_object(val) and val != val +cdef inline int64_t get_itemsize(object val): + """ + Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar. + + Parameters + ---------- + val : object + + Returns + ------- + is_ndarray : bool + """ + if PyArray_CheckScalar(val): + return PyArray_DescrFromScalar(val).itemsize + else: + return -1 + + cdef inline const char* get_c_string_buf_and_size(str py_string, Py_ssize_t *length) except NULL: """ diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index e8d0a789e7cbd..35ad9f3e9693b 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -117,7 +117,7 @@ def test_from_records_sequencelike(self): result = DataFrame.from_records(tuples, exclude=exclude) result.columns = [columns[i] for i in sorted(columns_to_test)] tm.assert_series_equal(result["C"], df["C"]) - tm.assert_series_equal(result["E1"], df["E1"].astype("float64")) + tm.assert_series_equal(result["E1"], df["E1"]) def test_from_records_sequencelike_empty(self): # empty case diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ca68885fdc470..966467dd878e2 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1929,7 +1929,7 @@ def test_constructor_for_list_with_dtypes(self): df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)]) result = df.dtypes - expected = Series([np.dtype("int64")] * 5) + expected = Series([np.dtype("int32")] * 5) tm.assert_series_equal(result, expected) # overflow issue? (we always expected int64 upcasting here) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6c51e32fa9a78..fd135e16915d3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -99,10 +99,7 @@ def max_value(group): applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = Series( - [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")], - index=["A", "B", "C", "D", "value"], - ) + expected = df.dtypes tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 7642f78076dcb..2bb9b51df2285 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -641,7 +641,7 @@ def test_where_series_complex128(self, fill_val, exp_dtype): values = klass([True, False, True, True]) else: values = klass(x * fill_val for x in [5, 6, 7, 8]) - exp = klass([1 + 1j, values[1], 3 + 3j, values[3]]) + exp = klass([1 + 1j, values[1], 3 + 3j, values[3]], dtype=exp_dtype) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) @pytest.mark.parametrize( From a8008d1c2623c9036497dd52abf8ea52bd901351 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 12 Apr 2021 18:20:41 -0400 Subject: [PATCH 02/12] fixup --- pandas/_libs/lib.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d587c1491b662..1a72190c8bda7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2439,10 +2439,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, elif seen.is_bool and not seen.nan_: result = bools.view(np.bool_) if result is not None: - if itemsize > 0: + if itemsize_max > 0: curr_itemsize = cnp.PyArray_ITEMSIZE(result) - if itemsize != curr_itemsize: - result = result.astype(result.dtype.kind + str(itemsize)) + if itemsize_max != curr_itemsize: + result = result.astype(result.dtype.kind + str(itemsize_max)) return result return objects From 83c3ab5757ad8c77d7a750b9c808fb06252e884b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 12 Apr 2021 18:22:01 -0400 Subject: [PATCH 03/12] fixup --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1a72190c8bda7..9e4eb601af944 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2247,7 +2247,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, val = objects[i] if itemsize_max != -1: itemsize = get_itemsize(val) - if itemsize == -1 or itemsize > itemsize_max: + if itemsize > itemsize_max or itemsize == -1: itemsize_max = itemsize if val is None: From 64086a458dfd822d424ed83280b06ac475f96617 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 12 Apr 2021 18:45:08 -0400 Subject: [PATCH 04/12] Move get_itemsize, use dtype.itemsize, safer casting --- pandas/_libs/lib.pyx | 33 ++++++++++++++++++++++++++------- pandas/_libs/tslibs/util.pxd | 23 ----------------------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 9e4eb601af944..6eb27bc54cc10 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -68,6 +68,9 @@ cdef extern from "numpy/arrayobject.h": object fields tuple names +cdef extern from "numpy/ndarrayobject.h": + bint PyArray_CheckScalar(obj) nogil + cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 @@ -77,7 +80,6 @@ from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, UINT64_MAX, - get_itemsize, is_nan, ) @@ -210,6 +212,24 @@ def is_scalar(val: object) -> bool: or is_offset_object(val)) +cdef inline int64_t get_itemsize(object val): + """ + Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar. + + Parameters + ---------- + val : object + + Returns + ------- + is_ndarray : bool + """ + if PyArray_CheckScalar(val): + return cnp.PyArray_DescrFromScalar(val).itemsize + else: + return -1 + + def is_iterator(obj: object) -> bool: """ Check if the object is an iterator. @@ -2362,7 +2382,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, if convert_to_nullable_integer: from pandas.core.arrays import IntegerArray result = IntegerArray(ints, mask) - itemsize_max = -1 else: result = floats elif seen.nan_: @@ -2438,11 +2457,11 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, result = ints elif seen.is_bool and not seen.nan_: result = bools.view(np.bool_) - if result is not None: - if itemsize_max > 0: - curr_itemsize = cnp.PyArray_ITEMSIZE(result) - if itemsize_max != curr_itemsize: - result = result.astype(result.dtype.kind + str(itemsize_max)) + if result is uints or result is ints or result is floats or result is complexes: + if itemsize_max > 0 and itemsize_max != result.dtype.itemsize: + result = result.astype(result.dtype.kind + str(itemsize_max)) + return result + elif result is not None: return result return objects diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index d6bf9811fe650..150516aadffc6 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -1,9 +1,5 @@ -cimport numpy as cnp from cpython.object cimport PyTypeObject -from numpy cimport PyArray_DescrFromScalar - -cnp.import_array() cdef extern from *: @@ -48,7 +44,6 @@ cdef extern from "numpy/ndarrayobject.h": bint PyArray_IsIntegerScalar(obj) nogil bint PyArray_Check(obj) nogil - bint PyArray_CheckScalar(obj) nogil cdef extern from "numpy/npy_common.h": int64_t NPY_MIN_INT64 @@ -200,24 +195,6 @@ cdef inline bint is_nan(object val): return is_complex_object(val) and val != val -cdef inline int64_t get_itemsize(object val): - """ - Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar. - - Parameters - ---------- - val : object - - Returns - ------- - is_ndarray : bool - """ - if PyArray_CheckScalar(val): - return PyArray_DescrFromScalar(val).itemsize - else: - return -1 - - cdef inline const char* get_c_string_buf_and_size(str py_string, Py_ssize_t *length) except NULL: """ From 24044de54fedead17bfbe7d1630705548e05ced5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 13 Apr 2021 22:34:11 -0400 Subject: [PATCH 05/12] Fix Series.count and tests --- pandas/core/series.py | 2 +- pandas/tests/extension/test_sparse.py | 7 ------- pandas/tests/frame/test_constructors.py | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5ba68aaa5c16d..c746371cc3434 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1893,7 +1893,7 @@ def count(self, level=None): 2 """ if level is None: - return notna(self._values).sum() + return notna(self._values).sum().astype("int64") elif not isinstance(self.index, MultiIndex): raise ValueError("Series.count level is only valid with a MultiIndex") diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 759277a47f62b..f0d3fb7ff9e1b 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -16,10 +16,6 @@ import numpy as np import pytest -from pandas.compat import ( - IS64, - is_platform_windows, -) from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import is_object_dtype @@ -428,9 +424,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): ]: mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch") request.node.add_marker(mark) - elif is_platform_windows() or not IS64: - mark = pytest.mark.xfail(reason="results are int32, expected int64") - request.node.add_marker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 966467dd878e2..c565567754da0 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1924,7 +1924,7 @@ def test_constructor_for_list_with_dtypes(self): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes - expected = Series([np.dtype("int64")] * 5) + expected = Series([np.dtype("int")] * 5) tm.assert_series_equal(result, expected) df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)]) From 5950138220843131717605e1c840e0f46ae27695 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 13 Apr 2021 23:52:06 -0400 Subject: [PATCH 06/12] WIP --- pandas/tests/dtypes/test_inference.py | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 907991b97ead1..41b4974db132f 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -614,6 +614,34 @@ def test_maybe_convert_objects_bool_nan(self): out = lib.maybe_convert_objects(ind.values, safe=1) tm.assert_numpy_array_equal(out, exp) + @pytest.mark.parametrize("data0", [True, np.uint8(1), np.uint16(1), np.uint32(1), np.uint64(1), np.int8(1), np.int16(1), np.int32(1), np.int64(1), np.float16(1), np.float32(1), np.float64(1)]) + @pytest.mark.parametrize("data1", + [True, np.uint8(1), np.uint16(1), np.uint32(1), np.uint64(1), np.int8(1), np.int16(1), + np.int32(1), np.int64(1), np.float16(1), np.float32(1), np.float64(1)]) + def test_maybe_convert_objects_itemsize(self, request, data0, data1): + if hasattr(data0, "dtype") and hasattr(data1, "dtype") and (data0.dtype.kind == 'u' or data1.dtype.kind == 'u'): + if data0.dtype.kind == 'u' and data1.dtype.kind == 'u': + request.node.add_marker(pytest.mark.xfail(reason="uints not handled correctly")) + # elif data0.dtype.kind == 'i' and data0.dtype.itemsize < data1.dtype.itemsize: + # request.node.add_marker(pytest.mark.xfail(reason="uints not handled correctly")) + elif data1.dtype.kind == 'i' and data1.dtype.itemsize < data0.dtype.itemsize: + request.node.add_marker(pytest.mark.xfail(reason="uints not handled correctly")) + data = [data0, data1] + arr = np.array(data, dtype="object") + expected = np.array(data) + if data0 is True or data1 is True: + if data0 is True and data1 is True: + expected_dtype = "bool" + else: + expected_dtype = "object" + else: + expected_dtype = f'{expected.dtype.kind}{max(data0.dtype.itemsize, data1.dtype.itemsize)}' + expected = expected.astype(expected_dtype) + + result = lib.maybe_convert_objects(arr) + print(type(data0), type(data1)) + tm.assert_numpy_array_equal(result, expected) + def test_mixed_dtypes_remain_object_array(self): # GH14956 arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) From 45f5f31e703c869b1c784800b961de923b91f05d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 14 Apr 2021 00:56:45 -0400 Subject: [PATCH 07/12] Added test --- pandas/tests/dtypes/test_inference.py | 75 +++++++++++++++++++-------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 41b4974db132f..9c09e11c88d85 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -614,32 +614,65 @@ def test_maybe_convert_objects_bool_nan(self): out = lib.maybe_convert_objects(ind.values, safe=1) tm.assert_numpy_array_equal(out, exp) - @pytest.mark.parametrize("data0", [True, np.uint8(1), np.uint16(1), np.uint32(1), np.uint64(1), np.int8(1), np.int16(1), np.int32(1), np.int64(1), np.float16(1), np.float32(1), np.float64(1)]) - @pytest.mark.parametrize("data1", - [True, np.uint8(1), np.uint16(1), np.uint32(1), np.uint64(1), np.int8(1), np.int16(1), - np.int32(1), np.int64(1), np.float16(1), np.float32(1), np.float64(1)]) - def test_maybe_convert_objects_itemsize(self, request, data0, data1): - if hasattr(data0, "dtype") and hasattr(data1, "dtype") and (data0.dtype.kind == 'u' or data1.dtype.kind == 'u'): - if data0.dtype.kind == 'u' and data1.dtype.kind == 'u': - request.node.add_marker(pytest.mark.xfail(reason="uints not handled correctly")) - # elif data0.dtype.kind == 'i' and data0.dtype.itemsize < data1.dtype.itemsize: - # request.node.add_marker(pytest.mark.xfail(reason="uints not handled correctly")) - elif data1.dtype.kind == 'i' and data1.dtype.itemsize < data0.dtype.itemsize: - request.node.add_marker(pytest.mark.xfail(reason="uints not handled correctly")) + @pytest.mark.parametrize( + "data0", + [ + True, + 1, + 1.0, + np.int8(1), + np.int16(1), + np.int32(1), + np.int64(1), + np.float16(1), + np.float32(1), + np.float64(1), + np.float128(1), + np.complex64(1), + np.complex128(1), + np.complex256(1), + ], + ) + @pytest.mark.parametrize( + "data1", + [ + True, + 1, + 1.0, + np.int8(1), + np.int16(1), + np.int32(1), + np.int64(1), + np.float16(1), + np.float32(1), + np.float64(1), + np.float128(1), + np.complex64(1), + np.complex128(1), + np.complex256(1), + ], + ) + def test_maybe_convert_objects_itemsize(self, data0, data1): data = [data0, data1] arr = np.array(data, dtype="object") - expected = np.array(data) - if data0 is True or data1 is True: - if data0 is True and data1 is True: - expected_dtype = "bool" - else: - expected_dtype = "object" + + kind = np.find_common_type([type(data0), type(data1)], scalar_types=[]).kind + is_nptype0 = hasattr(data0, "dtype") + is_nptype1 = hasattr(data1, "dtype") + if is_nptype0 and is_nptype1: + itemsize = max(data0.dtype.itemsize, data1.dtype.itemsize) + elif data0 is True or data1 is True: + kind = "bool" if (data0 is True and data1 is True) else "object" + itemsize = "" + elif not is_nptype0 and is_nptype1: + itemsize = 16 if data1.dtype.kind == "c" else 8 + elif is_nptype0 and not is_nptype1: + itemsize = 16 if data0.dtype.kind == "c" else 8 else: - expected_dtype = f'{expected.dtype.kind}{max(data0.dtype.itemsize, data1.dtype.itemsize)}' - expected = expected.astype(expected_dtype) + itemsize = 8 + expected = np.array(data, dtype=f"{kind}{itemsize}") result = lib.maybe_convert_objects(arr) - print(type(data0), type(data1)) tm.assert_numpy_array_equal(result, expected) def test_mixed_dtypes_remain_object_array(self): From 8e1e7c2dcaddc3be3ffe838996913ff995a33b7e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 14 Apr 2021 23:41:38 -0400 Subject: [PATCH 08/12] Fixed replace --- pandas/tests/dtypes/test_inference.py | 29 ++++++++++++++-------- pandas/tests/frame/methods/test_replace.py | 10 +++++++- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 9c09e11c88d85..2a2875ac29fce 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -31,6 +31,7 @@ from pandas.core.dtypes.common import ( ensure_int32, is_bool, + is_complex, is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, @@ -620,6 +621,7 @@ def test_maybe_convert_objects_bool_nan(self): True, 1, 1.0, + 1.0 + 1.0j, np.int8(1), np.int16(1), np.int32(1), @@ -639,6 +641,7 @@ def test_maybe_convert_objects_bool_nan(self): True, 1, 1.0, + 1.0 + 1.0j, np.int8(1), np.int16(1), np.int32(1), @@ -653,25 +656,29 @@ def test_maybe_convert_objects_bool_nan(self): ], ) def test_maybe_convert_objects_itemsize(self, data0, data1): + # GH 40908 data = [data0, data1] arr = np.array(data, dtype="object") - kind = np.find_common_type([type(data0), type(data1)], scalar_types=[]).kind - is_nptype0 = hasattr(data0, "dtype") - is_nptype1 = hasattr(data1, "dtype") - if is_nptype0 and is_nptype1: + common_kind = np.find_common_type( + [type(data0), type(data1)], scalar_types=[] + ).kind + kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind + kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind + if kind0 != "python" and kind1 != "python": + kind = common_kind itemsize = max(data0.dtype.itemsize, data1.dtype.itemsize) - elif data0 is True or data1 is True: - kind = "bool" if (data0 is True and data1 is True) else "object" + elif is_bool(data0) or is_bool(data1): + kind = "bool" if (is_bool(data0) and is_bool(data1)) else "object" itemsize = "" - elif not is_nptype0 and is_nptype1: - itemsize = 16 if data1.dtype.kind == "c" else 8 - elif is_nptype0 and not is_nptype1: - itemsize = 16 if data0.dtype.kind == "c" else 8 + elif is_complex(data0) or is_complex(data1): + kind = common_kind + itemsize = 16 else: + kind = common_kind itemsize = 8 - expected = np.array(data, dtype=f"{kind}{itemsize}") + expected = np.array(data, dtype=f"{kind}{itemsize}") result = lib.maybe_convert_objects(arr) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index d8f93f047e74b..3400ed1e14e06 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -10,6 +10,8 @@ import numpy as np import pytest +from pandas.compat import np_version_under1p19 + import pandas as pd from pandas import ( DataFrame, @@ -1514,8 +1516,14 @@ def test_replace_commutative(self, df, to_replace, exp): np.float64(1), ], ) - def test_replace_replacer_dtype(self, replacer): + def test_replace_replacer_dtype(self, request, replacer): # GH26632 + if np.isscalar(replacer) and replacer.dtype.itemsize < 8: + request.node.add_marker( + pytest.mark.xfail( + np_version_under1p19, reason="np.putmask doesn't coerce dtype" + ) + ) df = DataFrame(["a"]) result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer]) From f13edc017a528f44e753a07625adbcb6722dccb3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 14 Apr 2021 23:57:06 -0400 Subject: [PATCH 09/12] Remove higher itemsize for earlier versions of numpy --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/tests/dtypes/test_inference.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ea7e0f88ff81e..bf4952dbae84b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -652,7 +652,7 @@ Numeric - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) - Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`) -- +- Bug in :meth:`Series.count` would result in ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`) Conversion ^^^^^^^^^^ diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 2a2875ac29fce..d1e6409307915 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -629,10 +629,8 @@ def test_maybe_convert_objects_bool_nan(self): np.float16(1), np.float32(1), np.float64(1), - np.float128(1), np.complex64(1), np.complex128(1), - np.complex256(1), ], ) @pytest.mark.parametrize( @@ -649,10 +647,8 @@ def test_maybe_convert_objects_bool_nan(self): np.float16(1), np.float32(1), np.float64(1), - np.float128(1), np.complex64(1), np.complex128(1), - np.complex256(1), ], ) def test_maybe_convert_objects_itemsize(self, data0, data1): From 5d3ffe0ade9bc0c00a0cb0e0c02c24e619f58789 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 14 Apr 2021 23:57:35 -0400 Subject: [PATCH 10/12] fixup --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index bf4952dbae84b..3b18afeadee21 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -652,7 +652,7 @@ Numeric - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) - Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`) -- Bug in :meth:`Series.count` would result in ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`) +- Bug in :meth:`Series.count` would result in an ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`) Conversion ^^^^^^^^^^ From c1288962944716a214ac4511376b075367a27fbc Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 15 Apr 2021 00:44:32 -0400 Subject: [PATCH 11/12] np_version_under1p19 -> np_version_under1p20 --- pandas/tests/frame/methods/test_replace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3400ed1e14e06..e6ed60dc2bb08 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -10,7 +10,7 @@ import numpy as np import pytest -from pandas.compat import np_version_under1p19 +from pandas.compat import np_version_under1p20 import pandas as pd from pandas import ( @@ -1521,7 +1521,7 @@ def test_replace_replacer_dtype(self, request, replacer): if np.isscalar(replacer) and replacer.dtype.itemsize < 8: request.node.add_marker( pytest.mark.xfail( - np_version_under1p19, reason="np.putmask doesn't coerce dtype" + np_version_under1p20, reason="np.putmask doesn't coerce dtype" ) ) df = DataFrame(["a"]) From c03fbf6cb2bc4ded895bae24cd2df50ebe60858d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 19 Apr 2021 16:37:11 -0400 Subject: [PATCH 12/12] whatsnew and comment --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/_libs/lib.pyx | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 3b18afeadee21..363dbcbbed76e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -219,7 +219,7 @@ Other enhancements - :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`) -- +- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6eb27bc54cc10..53fb286f611ea 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2457,7 +2457,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, result = ints elif seen.is_bool and not seen.nan_: result = bools.view(np.bool_) + if result is uints or result is ints or result is floats or result is complexes: + # cast to the largest itemsize when all values are NumPy scalars if itemsize_max > 0 and itemsize_max != result.dtype.itemsize: result = result.astype(result.dtype.kind + str(itemsize_max)) return result