From 81275a43764f0245710649e27a68191c08206fcd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 May 2023 10:19:35 -0700 Subject: [PATCH 1/2] Backport PR #53391: BUG: read_csv with dtype=bool[pyarrow] --- doc/source/whatsnew/v2.0.3.rst | 2 +- pandas/_libs/parsers.pyx | 6 +++++- pandas/io/parsers/base_parser.py | 3 ++- pandas/tests/extension/test_arrow.py | 15 ++++++++------- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.0.3.rst b/doc/source/whatsnew/v2.0.3.rst index 73779d7e4cc74..a25999d8085cb 100644 --- a/doc/source/whatsnew/v2.0.3.rst +++ b/doc/source/whatsnew/v2.0.3.rst @@ -20,7 +20,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`) .. --------------------------------------------------------------------------- .. _whatsnew_203.other: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2839730ca46bd..0bd0597f32ad0 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -117,6 +117,8 @@ from pandas.core.dtypes.common import ( from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.inference import is_dict_like +from pandas.core.arrays.boolean import BooleanDtype + cdef: float64_t INF = np.inf float64_t NEGINF = -INF @@ -1167,7 +1169,9 @@ cdef class TextReader: array_type = dtype.construct_array_type() try: # use _from_sequence_of_strings if the class defines it - if is_bool_dtype(dtype): + if isinstance(dtype, BooleanDtype): + # xref GH 47534: BooleanArray._from_sequence_of_strings has extra + # kwargs true_values = [x.decode() for x in self.true_values] false_values = [x.decode() for x in self.false_values] result = array_type._from_sequence_of_strings( diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 2db759719fcb4..22ab8607e060a 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -80,6 +80,7 @@ FloatingArray, IntegerArray, ) +from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -800,7 +801,7 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi elif isinstance(cast_type, ExtensionDtype): array_type = cast_type.construct_array_type() try: - if is_bool_dtype(cast_type): + if isinstance(cast_type, BooleanDtype): # error: Unexpected keyword argument "true_values" for # "_from_sequence_of_strings" of "ExtensionArray" return array_type._from_sequence_of_strings( # type: ignore[call-arg] # noqa:E501 diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8907137c71844..5d38a6b0d0031 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -45,6 +45,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.extensions import no_default from pandas.api.types import ( is_bool_dtype, is_float_dtype, @@ -738,14 +739,11 @@ def test_setitem_preserves_views(self, data): class TestBaseParsing(base.BaseParsingTests): + @pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default]) @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data, request): + def test_EA_types(self, engine, data, dtype_backend, request): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_boolean(pa_dtype): - request.node.add_marker( - pytest.mark.xfail(raises=TypeError, reason="GH 47534") - ) - elif pa.types.is_decimal(pa_dtype): + if pa.types.is_decimal(pa_dtype): request.node.add_marker( pytest.mark.xfail( raises=NotImplementedError, @@ -770,7 +768,10 @@ def test_EA_types(self, engine, data, request): else: csv_output = StringIO(csv_output) result = pd.read_csv( - csv_output, dtype={"with_dtype": str(data.dtype)}, engine=engine + csv_output, + dtype={"with_dtype": str(data.dtype)}, + engine=engine, + dtype_backend=dtype_backend, ) expected = df self.assert_frame_equal(result, expected) From c578723472c9ddcb1a6a50f2f2572b79855d5d82 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 31 May 2023 12:57:47 -0700 Subject: [PATCH 2/2] Add xfail --- pandas/tests/extension/test_arrow.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5d38a6b0d0031..3efb59fc6afce 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -761,6 +761,17 @@ def test_EA_types(self, engine, data, dtype_backend, request): request.node.add_marker( pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) + elif ( + pa.types.is_duration(pa_dtype) + and dtype_backend == "pyarrow" + and engine == "python" + ): + request.node.add_marker( + pytest.mark.xfail( + raises=TypeError, + reason="Invalid type for timedelta scalar: NAType", + ) + ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) csv_output = df.to_csv(index=False, na_rep=np.nan) if pa.types.is_binary(pa_dtype):