ENH: Make maybe_convert_object respect dtype itemsize #40908


Merged · 14 commits · Apr 21, 2021
Changes from 3 commits
67 changes: 40 additions & 27 deletions pandas/_libs/lib.pyx
@@ -77,6 +77,7 @@ from pandas._libs.util cimport (
INT64_MAX,
INT64_MIN,
UINT64_MAX,
get_itemsize,
is_nan,
)

@@ -2187,7 +2188,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,

Parameters
----------
values : ndarray[object]
objects : ndarray[object]
Array of object elements to convert.
try_float : bool, default False
If an array-like object contains only float or NaN values is
@@ -2211,7 +2212,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
Array of converted object values to more specific dtypes if applicable.
"""
cdef:
Py_ssize_t i, n
Py_ssize_t i, n, itemsize_max = 0
ndarray[float64_t] floats
ndarray[complex128_t] complexes
ndarray[int64_t] ints
@@ -2244,6 +2245,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,

for i in range(n):
val = objects[i]
if itemsize_max != -1:
itemsize = get_itemsize(val)
if itemsize > itemsize_max or itemsize == -1:
itemsize_max = itemsize

if val is None:
seen.null_ = True
@@ -2345,92 +2350,100 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
seen.object_ = True

if not seen.object_:
result = None
if not safe:
if seen.null_ or seen.nan_:
if seen.is_float_or_complex:
if seen.complex_:
return complexes
result = complexes
elif seen.float_:
return floats
result = floats
elif seen.int_:
if convert_to_nullable_integer:
from pandas.core.arrays import IntegerArray
return IntegerArray(ints, mask)
result = IntegerArray(ints, mask)
itemsize_max = -1
else:
return floats
result = floats
elif seen.nan_:
return floats
result = floats
else:
if not seen.bool_:
if seen.datetime_:
if not seen.numeric_ and not seen.timedelta_:
return datetimes
result = datetimes
elif seen.timedelta_:
if not seen.numeric_:
return timedeltas
result = timedeltas
elif seen.nat_:
if not seen.numeric_:
if convert_datetime and convert_timedelta:
# TODO: array full of NaT ambiguity resolve here needed
pass
elif convert_datetime:
return datetimes
result = datetimes
elif convert_timedelta:
return timedeltas
result = timedeltas
else:
if seen.complex_:
return complexes
result = complexes
elif seen.float_:
return floats
result = floats
elif seen.int_:
if seen.uint_:
return uints
result = uints
else:
return ints
result = ints
elif seen.is_bool:
return bools.view(np.bool_)
result = bools.view(np.bool_)

else:
# don't cast int to float, etc.
if seen.null_:
if seen.is_float_or_complex:
if seen.complex_:
if not seen.int_:
return complexes
result = complexes
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
result = floats
else:
if not seen.bool_:
if seen.datetime_:
if not seen.numeric_ and not seen.timedelta_:
return datetimes
result = datetimes
elif seen.timedelta_:
if not seen.numeric_:
return timedeltas
result = timedeltas
elif seen.nat_:
if not seen.numeric_:
if convert_datetime and convert_timedelta:
# TODO: array full of NaT ambiguity resolve here needed
pass
elif convert_datetime:
return datetimes
result = datetimes
elif convert_timedelta:
return timedeltas
result = timedeltas
else:
if seen.complex_:
if not seen.int_:
return complexes
result = complexes
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
result = floats
elif seen.int_:
if seen.uint_:
return uints
result = uints
else:
return ints
result = ints
elif seen.is_bool and not seen.nan_:
return bools.view(np.bool_)
result = bools.view(np.bool_)
if result is not None:
Review comment (Member): should this be tightened to something like result is floats or result is uints or result is ints? i.e. exclude datetimes/timedeltas/bools

if itemsize_max > 0:
curr_itemsize = cnp.PyArray_ITEMSIZE(result)
Review comment (Member): I'd just use result.dtype.itemsize and not bother with the C API.

if itemsize_max != curr_itemsize:
result = result.astype(result.dtype.kind + str(itemsize_max))
return result

return objects
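
A rough pure-Python sketch of the idea above (an illustration, not code from the PR), with NumPy's own inference standing in for the real Cython type inference: while scanning the object array, remember the widest NumPy-scalar itemsize seen, latch -1 as soon as a non-NumPy scalar appears, and afterwards cast the inferred numeric result to that width instead of keeping the 64-bit default. The name convert_objects_sketch is made up for the example.

import numpy as np

def _get_itemsize(val):
    # itemsize of a NumPy scalar, -1 for anything else (mirrors get_itemsize)
    return val.dtype.itemsize if isinstance(val, np.generic) else -1

def convert_objects_sketch(objects):
    itemsize_max = 0
    for val in objects:
        if itemsize_max != -1:
            itemsize = _get_itemsize(val)
            if itemsize > itemsize_max or itemsize == -1:
                itemsize_max = itemsize
    # stand-in for the real inference: NumPy picks a 64-bit numeric dtype
    result = np.array(objects.tolist())
    if result.dtype.kind in "iufc" and itemsize_max > 0:
        if itemsize_max != result.dtype.itemsize:
            result = result.astype(result.dtype.kind + str(itemsize_max))
    return result

arr = np.array([np.int32(1), np.int32(2)], dtype=object)
print(convert_objects_sketch(arr).dtype)  # int32 rather than int64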

23 changes: 23 additions & 0 deletions pandas/_libs/tslibs/util.pxd
@@ -1,5 +1,9 @@

cimport numpy as cnp
from cpython.object cimport PyTypeObject
from numpy cimport PyArray_DescrFromScalar

cnp.import_array()


cdef extern from *:
@@ -44,6 +48,7 @@ cdef extern from "numpy/ndarrayobject.h":

bint PyArray_IsIntegerScalar(obj) nogil
bint PyArray_Check(obj) nogil
bint PyArray_CheckScalar(obj) nogil

cdef extern from "numpy/npy_common.h":
int64_t NPY_MIN_INT64
@@ -195,6 +200,24 @@ cdef inline bint is_nan(object val):
return is_complex_object(val) and val != val


cdef inline int64_t get_itemsize(object val):
Review comment (Member): if this is only used in lib.pyx, I think it's better to put it there.

"""
Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.

Parameters
----------
val : object

Returns
-------
itemsize : int64_t
"""
if PyArray_CheckScalar(val):
return PyArray_DescrFromScalar(val).itemsize
else:
return -1


cdef inline const char* get_c_string_buf_and_size(str py_string,
Py_ssize_t *length) except NULL:
"""
2 changes: 1 addition & 1 deletion pandas/tests/frame/constructors/test_from_records.py
@@ -117,7 +117,7 @@ def test_from_records_sequencelike(self):
result = DataFrame.from_records(tuples, exclude=exclude)
result.columns = [columns[i] for i in sorted(columns_to_test)]
tm.assert_series_equal(result["C"], df["C"])
tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))
tm.assert_series_equal(result["E1"], df["E1"])

def test_from_records_sequencelike_empty(self):
# empty case
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_constructors.py
@@ -1929,7 +1929,7 @@ def test_constructor_for_list_with_dtypes(self):

df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)])
result = df.dtypes
expected = Series([np.dtype("int64")] * 5)
expected = Series([np.dtype("int32")] * 5)
tm.assert_series_equal(result, expected)

# overflow issue? (we always expected int64 upcasting here)
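
The user-visible behavior this updated expectation captures (an illustration, not code from the PR): rows supplied as int32 arrays are no longer silently upcast to int64 during object-dtype inference.

import numpy as np
import pandas as pd

df = pd.DataFrame([np.array(np.arange(5), dtype="int32") for _ in range(5)])
print(df.dtypes.unique())  # [dtype('int32')] with this change; int64 before
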
5 changes: 1 addition & 4 deletions pandas/tests/groupby/test_groupby.py
@@ -99,10 +99,7 @@ def max_value(group):

applied = df.groupby("A").apply(max_value)
result = applied.dtypes
expected = Series(
[np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
index=["A", "B", "C", "D", "value"],
)
expected = df.dtypes
tm.assert_series_equal(result, expected)


2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_coercion.py
@@ -641,7 +641,7 @@ def test_where_series_complex128(self, fill_val, exp_dtype):
values = klass([True, False, True, True])
else:
values = klass(x * fill_val for x in [5, 6, 7, 8])
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]])
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]], dtype=exp_dtype)
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

@pytest.mark.parametrize(