From 1db0f62a516dd9c42c702bbefd22bcf5e99ada58 Mon Sep 17 00:00:00 2001 From: Joel Gibson Date: Mon, 9 Aug 2021 16:25:22 +1000 Subject: [PATCH 1/2] BUG: read_csv with mixed bools and NaNs sometimes reads NaNs as 1.0 (#42808) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/_libs/parsers.pyx | 10 ++++++++++ pandas/tests/io/parser/test_na_values.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f86c45ae8a86c..8559f5e6f627a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -260,6 +260,7 @@ I/O - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) +- Bug in :func:`read_csv` where reading a mixed column of booleans and missing values to a float type results in the missing values becoming 1.0 rather than NaN (:issue:`42808`) - Period diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 25028b06f7bad..1d5715595b7ae 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1091,6 +1091,16 @@ cdef class TextReader: # we had a fallback parse on the dtype, so now try to cast # only allow safe casts, eg. with a nan you cannot safely cast to int if col_res is not None and col_dtype is not None: + # If col_res is bool, it might actually be a bool array mixed with NaNs + # (see _try_bool_flex()). Usually this would be taken care of using + # _maybe_upcast(), but if col_dtype is a floating type we should just + # take care of that cast here. + if col_res.dtype == np.bool_ and is_float_dtype(col_dtype): + mask = col_res.view(np.uint8) == na_values[np.uint8] + col_res = col_res.astype(col_dtype) + np.putmask(col_res, mask, np.nan) + return col_res, na_count + try: col_res = col_res.astype(col_dtype, casting='safe') except TypeError: diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 2880bf8690b46..faf4ce0430d80 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -590,3 +590,17 @@ def test_nan_multi_index(all_parsers): ) tm.assert_frame_equal(result, expected) + + +def test_bool_to_float_with_nans(all_parsers): + # GH 42808: Ensure that when reading a file of mixed-bools-and-nans to a + # float dtype, we get back the correct result. + parser = all_parsers + data = """0 +NaN +True +False +""" + result = parser.read_csv(StringIO(data), dtype="float") + expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) + tm.assert_frame_equal(result, expected) From da459d15e3e8af4cab584e36b00802bb8a2ae48a Mon Sep 17 00:00:00 2001 From: Joel Gibson Date: Tue, 10 Aug 2021 15:23:52 +1000 Subject: [PATCH 2/2] Updated for the mixed-bool => int case. --- pandas/_libs/parsers.pyx | 15 +++++++++++- pandas/tests/io/parser/test_na_values.py | 29 +++++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1d5715595b7ae..3f0d2b7249950 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1089,7 +1089,6 @@ cdef class TextReader: break # we had a fallback parse on the dtype, so now try to cast - # only allow safe casts, eg. with a nan you cannot safely cast to int if col_res is not None and col_dtype is not None: # If col_res is bool, it might actually be a bool array mixed with NaNs # (see _try_bool_flex()). Usually this would be taken care of using @@ -1101,6 +1100,20 @@ cdef class TextReader: np.putmask(col_res, mask, np.nan) return col_res, na_count + # Similar special case for bool => int. + if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype): + # Must throw if there were NaNs. + if na_count > 0: + raise ValueError( + f"cannot safely convert passed user dtype of " + f"{col_dtype} for {np.bool_} dtyped data in " + f"column {i} due to NA values" + ) + + # Falls through to safe cast below. + pass + + # only allow safe casts, eg. with a nan you cannot safely cast to int try: col_res = col_res.astype(col_dtype, casting='safe') except TypeError: diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index faf4ce0430d80..28e468e539132 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -592,9 +592,32 @@ def test_nan_multi_index(all_parsers): tm.assert_frame_equal(result, expected) -def test_bool_to_float_with_nans(all_parsers): - # GH 42808: Ensure that when reading a file of mixed-bools-and-nans to a - # float dtype, we get back the correct result. +def test_bool_and_nan_to_bool(all_parsers): + # GH 42808: (bool | NaN) => bool should error. + parser = all_parsers + data = """0 +NaN +True +False +""" + with pytest.raises(ValueError, match="NA values"): + parser.read_csv(StringIO(data), dtype="bool") + + +def test_bool_and_nan_to_int(all_parsers): + # GH 42808: (bool | NaN) => int should error. + parser = all_parsers + data = """0 +NaN +True +False +""" + with pytest.raises(ValueError, match="convert"): + print(parser.read_csv(StringIO(data), dtype="int")) + + +def test_bool_and_nan_to_float(all_parsers): + # GH 42808: (bool | NaN) => float should return 0.0/1.0/NaN. parser = all_parsers data = """0 NaN