From 0828f6192e0744b4e4602b6cae54ba4f00ddb97a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 May 2023 14:05:34 -0700 Subject: [PATCH 1/4] BUG: read_csv with dtype=bool[pyarrow] --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/_libs/parsers.pyx | 6 +++++- pandas/io/parsers/base_parser.py | 3 ++- pandas/tests/io/parser/dtypes/test_dtypes_basic.py | 14 ++++++++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 33139f0c1bacf..5a9b6ddee5bde 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -31,6 +31,7 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) - Bug in :func:`read_csv` raising ``OverflowError`` for ``engine="pyarrow"`` and ``parse_dates`` set (:issue:`53295`) +- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` (:issue:`53390`) - Bug in :func:`to_datetime` was inferring format to contain ``"%H"`` instead of ``"%I"`` if date contained "AM" / "PM" tokens (:issue:`53147`) - Bug in :func:`to_timedelta` was raising ``ValueError`` with ``pandas.NA`` (:issue:`52909`) - Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a45299c8ba896..8b23c80afee31 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -111,6 +111,8 @@ from pandas.core.dtypes.dtypes import ( ) from pandas.core.dtypes.inference import is_dict_like +from pandas.core.arrays.boolean import BooleanDtype + cdef: float64_t INF = np.inf float64_t NEGINF = -INF @@ -1194,7 +1196,9 @@ cdef class TextReader: array_type = dtype.construct_array_type() try: # use _from_sequence_of_strings if the class defines it - if dtype.kind == "b": + if isinstance(dtype, BooleanDtype): + # xref GH 47534: BooleanArray._from_sequence_of_strings has extra + # kwargs true_values = [x.decode() for x in self.true_values] false_values = [x.decode() for x in self.false_values] result = array_type._from_sequence_of_strings( diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index cb8cccebb6978..f79b7ebe7efef 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -75,6 +75,7 @@ FloatingArray, IntegerArray, ) +from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexes.api import ( Index, MultiIndex, @@ -809,7 +810,7 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi elif isinstance(cast_type, ExtensionDtype): array_type = cast_type.construct_array_type() try: - if is_bool_dtype(cast_type): + if isinstance(cast_type, BooleanDtype): # error: Unexpected keyword argument "true_values" for # "_from_sequence_of_strings" of "ExtensionArray" return array_type._from_sequence_of_strings( # type: ignore[call-arg] # noqa: E501 diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 915cc9a9a1f95..873dddc9a4d10 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -538,3 +538,17 @@ def test_ea_int_avoid_overflow(all_parsers): } ) tm.assert_frame_equal(result, expected) + + +def test_dtype_bool_pyarrow(all_parsers): + # GH 53390 + pytest.importorskip("pyarrow") + parser = all_parsers + data = """col +True +False +True +""" + result = parser.read_csv(StringIO(data), dtype={"col": "bool[pyarrow]"}) + expected = DataFrame({"col": [True, False, True]}, dtype="bool[pyarrow]") + tm.assert_frame_equal(result, expected) From 7c71bd5cc0fd3fb3040e1199d88fb5f89d9074fb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 May 2023 15:59:40 -0700 Subject: [PATCH 2/4] Use existing test instead --- pandas/tests/extension/test_arrow.py | 15 ++++++++------- .../tests/io/parser/dtypes/test_dtypes_basic.py | 14 -------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9129e84700a55..a0d15c70b5720 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -47,6 +47,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.extensions import no_default from pandas.api.types import ( is_bool_dtype, is_float_dtype, @@ -723,14 +724,11 @@ def test_setitem_preserves_views(self, data): class TestBaseParsing(base.BaseParsingTests): + @pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default]) @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data, request): + def test_EA_types(self, engine, data, dtype_backend, request): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_boolean(pa_dtype): - request.node.add_marker( - pytest.mark.xfail(raises=TypeError, reason="GH 47534") - ) - elif pa.types.is_decimal(pa_dtype): + if pa.types.is_decimal(pa_dtype): request.node.add_marker( pytest.mark.xfail( raises=NotImplementedError, @@ -755,7 +753,10 @@ def test_EA_types(self, engine, data, request): else: csv_output = StringIO(csv_output) result = pd.read_csv( - csv_output, dtype={"with_dtype": str(data.dtype)}, engine=engine + csv_output, + dtype={"with_dtype": str(data.dtype)}, + engine=engine, + dtype_backend=dtype_backend, ) expected = df self.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 873dddc9a4d10..915cc9a9a1f95 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -538,17 +538,3 @@ def test_ea_int_avoid_overflow(all_parsers): } ) tm.assert_frame_equal(result, expected) - - -def test_dtype_bool_pyarrow(all_parsers): - # GH 53390 - pytest.importorskip("pyarrow") - parser = all_parsers - data = """col -True -False -True -""" - result = parser.read_csv(StringIO(data), dtype={"col": "bool[pyarrow]"}) - expected = DataFrame({"col": [True, False, True]}, dtype="bool[pyarrow]") - tm.assert_frame_equal(result, expected) From 1d47b80874c02be979748b89e163119cfd583688 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 25 May 2023 16:00:24 -0700 Subject: [PATCH 3/4] Clarify whatsnew --- doc/source/whatsnew/v2.0.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 5a9b6ddee5bde..319cf2df62c28 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -31,7 +31,7 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) - Bug in :func:`read_csv` raising ``OverflowError`` for ``engine="pyarrow"`` and ``parse_dates`` set (:issue:`53295`) -- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` (:issue:`53390`) +- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`) - Bug in :func:`to_datetime` was inferring format to contain ``"%H"`` instead of ``"%I"`` if date contained "AM" / "PM" tokens (:issue:`53147`) - Bug in :func:`to_timedelta` was raising ``ValueError`` with ``pandas.NA`` (:issue:`52909`) - Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) From c312dbe49fd46f992d064dea5f5c30bb91e4c0b6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 May 2023 13:29:47 -0700 Subject: [PATCH 4/4] Move to 2.0.3 --- doc/source/whatsnew/v2.0.2.rst | 1 - doc/source/whatsnew/v2.0.3.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 3d04d60032501..559078d501a00 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -31,7 +31,6 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) - Bug in :func:`read_csv` raising ``OverflowError`` for ``engine="pyarrow"`` and ``parse_dates`` set (:issue:`53295`) -- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`) - Bug in :func:`to_datetime` was inferring format to contain ``"%H"`` instead of ``"%I"`` if date contained "AM" / "PM" tokens (:issue:`53147`) - Bug in :func:`to_timedelta` was raising ``ValueError`` with ``pandas.NA`` (:issue:`52909`) - Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) diff --git a/doc/source/whatsnew/v2.0.3.rst b/doc/source/whatsnew/v2.0.3.rst index 73779d7e4cc74..a25999d8085cb 100644 --- a/doc/source/whatsnew/v2.0.3.rst +++ b/doc/source/whatsnew/v2.0.3.rst @@ -20,7 +20,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :func:`read_csv` when defining ``dtype`` with ``bool[pyarrow]`` for the ``"c"`` and ``"python"`` engines (:issue:`53390`) .. --------------------------------------------------------------------------- .. _whatsnew_203.other: