Skip to content

Commit 54e7fe9

Browse files
authored
Backport PR #53391: BUG: read_csv with dtype=bool[pyarrow] (#53472)
* Backport PR #53391: BUG: read_csv with dtype=bool[pyarrow] * Add xfail
1 parent 8bc5245 commit 54e7fe9

File tree

4 files changed

+27
-10
lines changed

4 files changed

+27
-10
lines changed

doc/source/whatsnew/v2.0.3.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Fixed regressions
2020

2121
Bug fixes
2222
~~~~~~~~~
23-
-
23+
- Bug in :func:`read_csv` raising ``TypeError`` when ``dtype`` is specified as ``bool[pyarrow]`` with the ``"c"`` or ``"python"`` engine (:issue:`53390`)
2424

2525
.. ---------------------------------------------------------------------------
2626
.. _whatsnew_203.other:

pandas/_libs/parsers.pyx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ from pandas.core.dtypes.common import (
117117
from pandas.core.dtypes.dtypes import CategoricalDtype
118118
from pandas.core.dtypes.inference import is_dict_like
119119

120+
from pandas.core.arrays.boolean import BooleanDtype
121+
120122
cdef:
121123
float64_t INF = <float64_t>np.inf
122124
float64_t NEGINF = -INF
@@ -1167,7 +1169,9 @@ cdef class TextReader:
11671169
array_type = dtype.construct_array_type()
11681170
try:
11691171
# use _from_sequence_of_strings if the class defines it
1170-
if is_bool_dtype(dtype):
1172+
if isinstance(dtype, BooleanDtype):
1173+
# xref GH 47534: BooleanArray._from_sequence_of_strings has extra
1174+
# kwargs
11711175
true_values = [x.decode() for x in self.true_values]
11721176
false_values = [x.decode() for x in self.false_values]
11731177
result = array_type._from_sequence_of_strings(

pandas/io/parsers/base_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
FloatingArray,
8181
IntegerArray,
8282
)
83+
from pandas.core.arrays.boolean import BooleanDtype
8384
from pandas.core.indexes.api import (
8485
Index,
8586
MultiIndex,
@@ -800,7 +801,7 @@ def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLi
800801
elif isinstance(cast_type, ExtensionDtype):
801802
array_type = cast_type.construct_array_type()
802803
try:
803-
if is_bool_dtype(cast_type):
804+
if isinstance(cast_type, BooleanDtype):
804805
# error: Unexpected keyword argument "true_values" for
805806
# "_from_sequence_of_strings" of "ExtensionArray"
806807
return array_type._from_sequence_of_strings( # type: ignore[call-arg] # noqa:E501

pandas/tests/extension/test_arrow.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545

4646
import pandas as pd
4747
import pandas._testing as tm
48+
from pandas.api.extensions import no_default
4849
from pandas.api.types import (
4950
is_bool_dtype,
5051
is_float_dtype,
@@ -738,14 +739,11 @@ def test_setitem_preserves_views(self, data):
738739

739740

740741
class TestBaseParsing(base.BaseParsingTests):
742+
@pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default])
741743
@pytest.mark.parametrize("engine", ["c", "python"])
742-
def test_EA_types(self, engine, data, request):
744+
def test_EA_types(self, engine, data, dtype_backend, request):
743745
pa_dtype = data.dtype.pyarrow_dtype
744-
if pa.types.is_boolean(pa_dtype):
745-
request.node.add_marker(
746-
pytest.mark.xfail(raises=TypeError, reason="GH 47534")
747-
)
748-
elif pa.types.is_decimal(pa_dtype):
746+
if pa.types.is_decimal(pa_dtype):
749747
request.node.add_marker(
750748
pytest.mark.xfail(
751749
raises=NotImplementedError,
@@ -763,14 +761,28 @@ def test_EA_types(self, engine, data, request):
763761
request.node.add_marker(
764762
pytest.mark.xfail(reason="CSV parsers don't correctly handle binary")
765763
)
764+
elif (
765+
pa.types.is_duration(pa_dtype)
766+
and dtype_backend == "pyarrow"
767+
and engine == "python"
768+
):
769+
request.node.add_marker(
770+
pytest.mark.xfail(
771+
raises=TypeError,
772+
reason="Invalid type for timedelta scalar: NAType",
773+
)
774+
)
766775
df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
767776
csv_output = df.to_csv(index=False, na_rep=np.nan)
768777
if pa.types.is_binary(pa_dtype):
769778
csv_output = BytesIO(csv_output)
770779
else:
771780
csv_output = StringIO(csv_output)
772781
result = pd.read_csv(
773-
csv_output, dtype={"with_dtype": str(data.dtype)}, engine=engine
782+
csv_output,
783+
dtype={"with_dtype": str(data.dtype)},
784+
engine=engine,
785+
dtype_backend=dtype_backend,
774786
)
775787
expected = df
776788
self.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)