From 88a19f35d10a81d7446d34bc6d704e001b40abc7 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 15 Dec 2021 15:15:14 +0100 Subject: [PATCH 1/2] BUG: read_csv converting nans to 1 when casting bools to float --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/_libs/parsers.pyx | 21 +++++++++++- pandas/tests/io/parser/test_na_values.py | 43 ++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 372f991d96a22..13f5d223bef9a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -753,6 +753,7 @@ I/O - Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`) - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`) +- Bug in :func:`read_csv` where reading a mixed column of booleans and missing values to a float type results in the missing values becoming 1.0 rather than NaN (:issue:`42808`, :issue:`34120`) - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`) - Bug in :func:`read_csv` silently ignoring errors when failling to create a memory-mapped file (:issue:`44766`) - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c76bfab51aacd..632676b4cd11b 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1093,8 +1093,27 @@ cdef class TextReader: break # we had a fallback parse on the dtype, so now try to cast - # only allow safe casts, eg. with a nan you cannot safely cast to int if col_res is not None and col_dtype is not None: + # If col_res is bool, it might actually be a bool array mixed with NaNs + # (see _try_bool_flex()). Usually this would be taken care of using + # _maybe_upcast(), but if col_dtype is a floating type we should just + # take care of that cast here. + if col_res.dtype == np.bool_ and is_float_dtype(col_dtype): + mask = col_res.view(np.uint8) == na_values[np.uint8] + col_res = col_res.astype(col_dtype) + np.putmask(col_res, mask, np.nan) + return col_res, na_count + + # NaNs are already cast to True here, so can not use astype + if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype): + if na_count > 0: + raise ValueError( + f"cannot safely convert passed user dtype of " + f"{col_dtype} for {np.bool_} dtyped data in " + f"column {i} due to NA values" + ) + + # only allow safe casts, eg. with a nan you cannot safely cast to int try: col_res = col_res.astype(col_dtype, casting='safe') except TypeError: diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 101d3b565712d..75762fc57bff3 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -17,6 +17,7 @@ import pandas._testing as tm skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @skip_pyarrow @@ -615,3 +616,45 @@ def test_nan_multi_index(all_parsers): ) tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_bool_and_nan_to_bool(all_parsers): + # GH#42808 + parser = all_parsers + data = """0 +NaN +True +False +""" + with pytest.raises(ValueError, match="NA values"): + parser.read_csv(StringIO(data), dtype="bool") + + +def test_bool_and_nan_to_int(all_parsers): + # GH#42808 + parser = all_parsers + data = """0 +NaN +True +False +""" + if parser.engine == "pyarrow": + with pytest.raises(TypeError, match="not 'NoneType'"): + parser.read_csv(StringIO(data), dtype="int") + else: + with pytest.raises(ValueError, match="convert"): + parser.read_csv(StringIO(data), dtype="int") + + +def test_bool_and_nan_to_float(all_parsers): + # GH#42808 + parser = all_parsers + data = """0 +NaN +True +False +""" + result = parser.read_csv(StringIO(data), dtype="float") + expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) + tm.assert_frame_equal(result, expected) From e2e160f166139b98b154ea066389547a9d0abb3f Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 17 Dec 2021 00:18:52 +0100 Subject: [PATCH 2/2] Move fix --- pandas/io/parsers/arrow_parser_wrapper.py | 6 +++++- pandas/tests/io/parser/test_na_values.py | 8 ++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 98d1315c6212c..96f7f9b1738b8 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -130,7 +130,11 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: frame.index.names = [None] * len(frame.index.names) if self.kwds.get("dtype") is not None: - frame = frame.astype(self.kwds.get("dtype")) + try: + frame = frame.astype(self.kwds.get("dtype")) + except TypeError as e: + # GH#44901 reraise to keep api consistent + raise ValueError(e) return frame def read(self) -> DataFrame: diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 75762fc57bff3..f9356dfc7d0e3 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -639,12 +639,8 @@ def test_bool_and_nan_to_int(all_parsers): True False """ - if parser.engine == "pyarrow": - with pytest.raises(TypeError, match="not 'NoneType'"): - parser.read_csv(StringIO(data), dtype="int") - else: - with pytest.raises(ValueError, match="convert"): - parser.read_csv(StringIO(data), dtype="int") + with pytest.raises(ValueError, match="convert|NoneType"): + parser.read_csv(StringIO(data), dtype="int") def test_bool_and_nan_to_float(all_parsers):