diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 8754286ee7d11..441fac0a6d5a1 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -260,6 +260,7 @@ I/O
 - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`)
 - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`)
 - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`)
+- Bug in :func:`read_csv` where reading a mixed column of booleans and missing values to a float type results in the missing values becoming 1.0 rather than NaN (:issue:`42808`)
 -
 
 Period
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 25028b06f7bad..3f0d2b7249950 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1089,8 +1089,30 @@ cdef class TextReader:
                     break
 
         # we had a fallback parse on the dtype, so now try to cast
-        # only allow safe casts, eg. with a nan you cannot safely cast to int
         if col_res is not None and col_dtype is not None:
+            # If col_res is bool, it might actually be a bool array mixed with NaNs
+            # (see _try_bool_flex()). Usually this would be taken care of using
+            # _maybe_upcast(), but if col_dtype is a floating type we should just
+            # take care of that cast here.
+            if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
+                mask = col_res.view(np.uint8) == na_values[np.uint8]
+                col_res = col_res.astype(col_dtype)
+                np.putmask(col_res, mask, np.nan)
+                return col_res, na_count
+
+            # Similar special case for bool => int.
+            if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
+                # Must throw if there were NaNs.
+                if na_count > 0:
+                    raise ValueError(
+                        f"cannot safely convert passed user dtype of "
+                        f"{col_dtype} for {np.bool_} dtyped data in "
+                        f"column {i} due to NA values"
+                    )
+
+                # No NA values: fall through to the safe cast below.
+
+            # only allow safe casts, eg. with a nan you cannot safely cast to int
             try:
                 col_res = col_res.astype(col_dtype, casting='safe')
             except TypeError:
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 2880bf8690b46..28e468e539132 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -590,3 +590,40 @@ def test_nan_multi_index(all_parsers):
     )
 
     tm.assert_frame_equal(result, expected)
+
+
+def test_bool_and_nan_to_bool(all_parsers):
+    # GH 42808: (bool | NaN) => bool should error.
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    with pytest.raises(ValueError, match="NA values"):
+        parser.read_csv(StringIO(data), dtype="bool")
+
+
+def test_bool_and_nan_to_int(all_parsers):
+    # GH 42808: (bool | NaN) => int should error.
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    with pytest.raises(ValueError, match="convert"):
+        parser.read_csv(StringIO(data), dtype="int")
+
+
+def test_bool_and_nan_to_float(all_parsers):
+    # GH 42808: (bool | NaN) => float should return 0.0/1.0/NaN.
+    parser = all_parsers
+    data = """0
+NaN
+True
+False
+"""
+    result = parser.read_csv(StringIO(data), dtype="float")
+    expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})
+    tm.assert_frame_equal(result, expected)