
String-based ExtensionArrays force cast to numpy in list based construction #27673

Closed
@xhochy

Description


Code Sample, a copy-pastable example if possible

pd.Series(['E'], dtype=FletcherDType(pa.string()))

Problem description

This raises the traceback below instead of creating a Series backed by the string-based extension type.


    def test_constructor_sanitation():
>       data = pd.Series(["E"], index=["F"], dtype=ArrowStringDtype())

pandas/tests/extension/arrow/test_string.py:11:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pandas/core/series.py:310: in __init__
    data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

data = ['E'], index = Index(['F'], dtype='object'), dtype = <pandas.tests.extension.arrow.arrays.ArrowStringDtype object at 0x120913cf8>, copy = False, raise_cast_failure = True

    def sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False):
        """
        Sanitize input data to an ndarray, copy if specified, coerce to the
        dtype if specified.
        """
        if dtype is not None:
            dtype = pandas_dtype(dtype)

        if isinstance(data, ma.MaskedArray):
            mask = ma.getmaskarray(data)
            if mask.any():
                data, fill_value = maybe_upcast(data, copy=True)
                data.soften_mask()  # set hardmask False if it was True
                data[mask] = fill_value
            else:
                data = data.copy()

        # extract ndarray or ExtensionArray, ensure we have no PandasArray
        data = extract_array(data, extract_numpy=True)

        # GH#846
        if isinstance(data, np.ndarray):

            if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
                # possibility of nan -> garbage
                try:
                    subarr = _try_cast(data, dtype, copy, True)
                except ValueError:
                    if copy:
                        subarr = data.copy()
                    else:
                        subarr = np.array(data, copy=False)
            else:
                # we will try to copy be-definition here
                subarr = _try_cast(data, dtype, copy, raise_cast_failure)

        elif isinstance(data, ABCExtensionArray):
            # it is already ensured above this is not a PandasArray
            subarr = data

            if dtype is not None:
                subarr = subarr.astype(dtype, copy=copy)
            elif copy:
                subarr = subarr.copy()
            return subarr

        elif isinstance(data, (list, tuple)) and len(data) > 0:
            if dtype is not None:
                try:
                    subarr = _try_cast(data, dtype, copy, raise_cast_failure)
                except Exception:
                    if raise_cast_failure:  # pragma: no cover
                        raise
                    subarr = np.array(data, dtype=object, copy=copy)
                    subarr = lib.maybe_convert_objects(subarr)

            else:
                subarr = maybe_convert_platform(data)

            subarr = maybe_cast_to_datetime(subarr, dtype)

        elif isinstance(data, range):
            # GH#16804
            arr = np.arange(data.start, data.stop, data.step, dtype="int64")
            subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
        else:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

        # scalar like, GH
        if getattr(subarr, "ndim", 0) == 0:
            if isinstance(data, list):  # pragma: no cover
                subarr = np.array(data, dtype=object)
            elif index is not None:
                value = data

                # figure out the dtype from the value (upcast if necessary)
                if dtype is None:
                    dtype, value = infer_dtype_from_scalar(value)
                else:
                    # need to possibly convert the value here
                    value = maybe_cast_to_datetime(value, dtype)

                subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)

            else:
                return subarr.item()

        # the result that we want
        elif subarr.ndim == 1:
            if index is not None:

                # a 1-element ndarray
                if len(subarr) != len(index) and len(subarr) == 1:
                    subarr = construct_1d_arraylike_from_scalar(
                        subarr[0], len(index), subarr.dtype
                    )

        elif subarr.ndim > 1:
            if isinstance(data, np.ndarray):
                raise Exception("Data must be 1-dimensional")
            else:
                subarr = com.asarray_tuplesafe(data, dtype=dtype)

        # This is to prevent mixed-type Series getting all casted to
        # NumPy string type, e.g. NaN --> '-1#IND'.
        if issubclass(subarr.dtype.type, str):
            import ipdb; ipdb.set_trace()
            # GH#16605
            # If not empty convert the data to dtype
            # GH#19853: If data is a scalar, subarr has already the result
>           if not lib.is_scalar(data):
E           TypeError: data type not understood

pandas/core/construction.py:478: TypeError
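
As far as I can tell, the list branch hands the values to _try_cast and the result then hits the trailing issubclass(subarr.dtype.type, str) guard: a string-backed ExtensionDtype also reports dtype.type == str, so the guard tries to re-cast the data against the extension dtype through NumPy, which fails with "data type not understood". A minimal sketch of one way to tighten that guard (hypothetical helper, not the actual patch):

    from pandas.api.types import is_extension_array_dtype

    def needs_string_recast(subarr, dtype):
        # Hypothetical helper: only apply the "mixed types -> NumPy string"
        # protection when neither the sanitized result nor the requested
        # dtype is an extension dtype, so a string-backed ExtensionArray is
        # passed through untouched instead of going back through np.array.
        if is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype):
            return False
        return issubclass(subarr.dtype.type, str)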

The problem exists in 0.25 and on master; I'm working on a fix and will submit a PR later.
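
As a stop-gap on affected versions, constructing the ExtensionArray up front and handing it to the Series constructor sidesteps the list branch entirely; the ABCExtensionArray branch of sanitize_array shown above returns it unchanged. A minimal sketch, assuming the ArrowStringDtype from the test module referenced in the traceback is importable (requires pyarrow); any string-backed ExtensionDtype such as FletcherDType(pa.string()) should behave the same:

    import pandas as pd
    from pandas.tests.extension.arrow.arrays import ArrowStringDtype

    dtype = ArrowStringDtype()

    # pd.array dispatches to dtype.construct_array_type()._from_sequence(...),
    # so the values never round-trip through a NumPy string/object array.
    arr = pd.array(["E"], dtype=dtype)
    s = pd.Series(arr, index=["F"])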
