From 1db0f62a516dd9c42c702bbefd22bcf5e99ada58 Mon Sep 17 00:00:00 2001
From: Joel Gibson <joel@jgibson.id.au>
Date: Mon, 9 Aug 2021 16:25:22 +1000
Subject: [PATCH 1/2] BUG: read_csv with mixed bools and NaNs sometimes reads
 NaNs as 1.0 (#42808)

---
 doc/source/whatsnew/v1.4.0.rst           |  1 +
 pandas/_libs/parsers.pyx                 | 10 ++++++++++
 pandas/tests/io/parser/test_na_values.py | 14 ++++++++++++++
 3 files changed, 25 insertions(+)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index f86c45ae8a86c..8559f5e6f627a 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -260,6 +260,7 @@ I/O
 - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
 - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
 - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
+- Bug in :func:`read_csv` where reading a mixed column of booleans and missing values to a float type results in the missing values becoming 1.0 rather than NaN (:issue:`42808`)
 -
 
 Period
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 25028b06f7bad..1d5715595b7ae 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1091,6 +1091,16 @@ cdef class TextReader:
         # we had a fallback parse on the dtype, so now try to cast
         # only allow safe casts, eg. with a nan you cannot safely cast to int
         if col_res is not None and col_dtype is not None:
+            # If col_res is bool, it might actually be a bool array mixed with NaNs
+            # (see _try_bool_flex()). Usually this would be taken care of using
+            # _maybe_upcast(), but if col_dtype is a floating type we should just
+            # take care of that cast here.
+            if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
+                mask = col_res.view(np.uint8) == na_values[np.uint8]
+                col_res = col_res.astype(col_dtype)
+                np.putmask(col_res, mask, np.nan)
+                return col_res, na_count
+
             try:
                 col_res = col_res.astype(col_dtype, casting='safe')
             except TypeError:
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 2880bf8690b46..faf4ce0430d80 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -590,3 +590,17 @@ def test_nan_multi_index(all_parsers):
     )
 
     tm.assert_frame_equal(result, expected)
+
+
+def test_bool_to_float_with_nans(all_parsers):
+    # GH 42808: Ensure that when reading a file of mixed-bools-and-nans to a
+    # float dtype, we get back the correct result.
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    result = parser.read_csv(StringIO(data), dtype="float")
+    expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
+    tm.assert_frame_equal(result, expected)

From da459d15e3e8af4cab584e36b00802bb8a2ae48a Mon Sep 17 00:00:00 2001
From: Joel Gibson <joel@jgibson.id.au>
Date: Tue, 10 Aug 2021 15:23:52 +1000
Subject: [PATCH 2/2] Updated for the mixed-bool => int case.

---
 pandas/_libs/parsers.pyx                 | 15 +++++++++++-
 pandas/tests/io/parser/test_na_values.py | 29 +++++++++++++++++++++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 1d5715595b7ae..3f0d2b7249950 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1089,7 +1089,6 @@ cdef class TextReader:
                     break
 
         # we had a fallback parse on the dtype, so now try to cast
-        # only allow safe casts, eg. with a nan you cannot safely cast to int
         if col_res is not None and col_dtype is not None:
             # If col_res is bool, it might actually be a bool array mixed with NaNs
             # (see _try_bool_flex()). Usually this would be taken care of using
@@ -1101,6 +1100,20 @@ cdef class TextReader:
                 np.putmask(col_res, mask, np.nan)
                 return col_res, na_count
 
+            # Similar special case for bool => int.
+            if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
+                # Must throw if there were NaNs.
+                if na_count > 0:
+                    raise ValueError(
+                        f"cannot safely convert passed user dtype of "
+                        f"{col_dtype} for {np.bool_} dtyped data in "
+                        f"column {i} due to NA values"
+                    )
+
+                # Falls through to safe cast below.
+                pass
+
+            # only allow safe casts, eg. with a nan you cannot safely cast to int
             try:
                 col_res = col_res.astype(col_dtype, casting='safe')
             except TypeError:
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index faf4ce0430d80..28e468e539132 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -592,9 +592,32 @@ def test_nan_multi_index(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-def test_bool_to_float_with_nans(all_parsers):
-    # GH 42808: Ensure that when reading a file of mixed-bools-and-nans to a
-    # float dtype, we get back the correct result.
+def test_bool_and_nan_to_bool(all_parsers):
+    # GH 42808: (bool | NaN) => bool must raise ValueError mentioning "NA values".
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    with pytest.raises(ValueError, match="NA values"):
+        parser.read_csv(StringIO(data), dtype="bool")
+
+
+def test_bool_and_nan_to_int(all_parsers):
+    # GH 42808: (bool | NaN) => int must raise ValueError; NaN cannot be cast to int.
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    with pytest.raises(ValueError, match="convert"):
+        parser.read_csv(StringIO(data), dtype="int")
+
+
+def test_bool_and_nan_to_float(all_parsers):
+    # GH 42808: (bool | NaN) => float should return 0.0/1.0/NaN.
     parser = all_parsers
     data = """0
 NaN