From 88a19f35d10a81d7446d34bc6d704e001b40abc7 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Wed, 15 Dec 2021 15:15:14 +0100
Subject: [PATCH 1/2] BUG: read_csv converting nans to 1 when casting bools to
 float

---
 doc/source/whatsnew/v1.4.0.rst           |  1 +
 pandas/_libs/parsers.pyx                 | 21 +++++++++++-
 pandas/tests/io/parser/test_na_values.py | 43 ++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 372f991d96a22..13f5d223bef9a 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -753,6 +753,7 @@ I/O
 - Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
 - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`)
 - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
+- Bug in :func:`read_csv` where reading a mixed column of booleans and missing values to a float type results in the missing values becoming 1.0 rather than NaN (:issue:`42808`, :issue:`34120`)
 - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)
 - Bug in :func:`read_csv` silently ignoring errors when failling to create a memory-mapped file (:issue:`44766`)
 - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index c76bfab51aacd..632676b4cd11b 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1093,8 +1093,27 @@ cdef class TextReader:
                     break
 
         # we had a fallback parse on the dtype, so now try to cast
-        # only allow safe casts, eg. with a nan you cannot safely cast to int
         if col_res is not None and col_dtype is not None:
+            # If col_res is bool, it might actually be a bool array mixed with NaNs
+            # (see _try_bool_flex()). Usually this would be taken care of using
+            # _maybe_upcast(), but if col_dtype is a floating type we should just
+            # take care of that cast here.
+            if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
+                mask = col_res.view(np.uint8) == na_values[np.uint8]
+                col_res = col_res.astype(col_dtype)
+                np.putmask(col_res, mask, np.nan)
+                return col_res, na_count
+
+            # NaNs are already cast to True here, so can not use astype
+            if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
+                if na_count > 0:
+                    raise ValueError(
+                        f"cannot safely convert passed user dtype of "
+                        f"{col_dtype} for {np.bool_} dtyped data in "
+                        f"column {i} due to NA values"
+                    )
+
+            # only allow safe casts, eg. with a nan you cannot safely cast to int
             try:
                 col_res = col_res.astype(col_dtype, casting='safe')
             except TypeError:
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 101d3b565712d..75762fc57bff3 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -17,6 +17,7 @@
 import pandas._testing as tm
 
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
+xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
 
 
 @skip_pyarrow
@@ -615,3 +616,45 @@ def test_nan_multi_index(all_parsers):
     )
 
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+def test_bool_and_nan_to_bool(all_parsers):
+    # GH#42808
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    with pytest.raises(ValueError, match="NA values"):
+        parser.read_csv(StringIO(data), dtype="bool")
+
+
+def test_bool_and_nan_to_int(all_parsers):
+    # GH#42808
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    if parser.engine == "pyarrow":
+        with pytest.raises(TypeError, match="not 'NoneType'"):
+            parser.read_csv(StringIO(data), dtype="int")
+    else:
+        with pytest.raises(ValueError, match="convert"):
+            parser.read_csv(StringIO(data), dtype="int")
+
+
+def test_bool_and_nan_to_float(all_parsers):
+    # GH#42808
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    result = parser.read_csv(StringIO(data), dtype="float")
+    expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
+    tm.assert_frame_equal(result, expected)

From e2e160f166139b98b154ea066389547a9d0abb3f Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 17 Dec 2021 00:18:52 +0100
Subject: [PATCH 2/2] Move fix

---
 pandas/io/parsers/arrow_parser_wrapper.py | 6 +++++-
 pandas/tests/io/parser/test_na_values.py  | 8 ++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 98d1315c6212c..96f7f9b1738b8 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -130,7 +130,11 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
                 frame.index.names = [None] * len(frame.index.names)
 
         if self.kwds.get("dtype") is not None:
-            frame = frame.astype(self.kwds.get("dtype"))
+            try:
+                frame = frame.astype(self.kwds.get("dtype"))
+            except TypeError as e:
+                # GH#44901 reraise to keep api consistent
+                raise ValueError(e)
         return frame
 
     def read(self) -> DataFrame:
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 75762fc57bff3..f9356dfc7d0e3 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -639,12 +639,8 @@ def test_bool_and_nan_to_int(all_parsers):
 True
 False
 """
-    if parser.engine == "pyarrow":
-        with pytest.raises(TypeError, match="not 'NoneType'"):
-            parser.read_csv(StringIO(data), dtype="int")
-    else:
-        with pytest.raises(ValueError, match="convert"):
-            parser.read_csv(StringIO(data), dtype="int")
+    with pytest.raises(ValueError, match="convert|NoneType"):
+        parser.read_csv(StringIO(data), dtype="int")
 
 
 def test_bool_and_nan_to_float(all_parsers):