Skip to content

BUG: ValueError: 1.0 and 0.0 cannot be cast to bool when using pd.read_csv() #34859

Closed
@aauss

Description

@aauss
  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • (optional) I have confirmed this bug exists on the master branch of pandas.


Code Sample

import io
import pandas as pd

example_csv = """
boolean_column, string_column
1.0, a
0.0, b
,c"""

csv = io.StringIO(example_csv)
df = pd.read_csv(csv, dtype={"boolean_column": "boolean"})
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-2-c698a4bcbc2d> in <module>
      1 csv = io.StringIO(example_csv)
----> 2 df = pd.read_csv(csv, dtype={"boolean_column": "boolean"})

c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    674         )
    675 
--> 676         return _read(filepath_or_buffer, kwds)
    677 
    678     parser_f.__name__ = name

c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    452 
    453     try:
--> 454         data = parser.read(nrows)
    455     finally:
    456         parser.close()

c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
   1131     def read(self, nrows=None):
   1132         nrows = _validate_integer("nrows", nrows)
-> 1133         ret = self._engine.read(nrows)
   1134 
   1135         # May alter columns / col_dict

c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
   2035     def read(self, nrows=None):
   2036         try:
-> 2037             data = self._reader.read(nrows)
   2038         except StopIteration:
   2039             if self._first_chunk:

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype()

c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\core\arrays\boolean.py in _from_sequence_of_strings(cls, strings, dtype, copy)
    302                 raise ValueError(f"{s} cannot be cast to bool")
    303 
--> 304         scalars = [map_string(x) for x in strings]
    305         return cls._from_sequence(scalars, dtype, copy)
    306 

c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\core\arrays\boolean.py in <listcomp>(.0)
    302                 raise ValueError(f"{s} cannot be cast to bool")
    303 
--> 304         scalars = [map_string(x) for x in strings]
    305         return cls._from_sequence(scalars, dtype, copy)
    306 

c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\core\arrays\boolean.py in map_string(s)
    300                 return False
    301             else:
--> 302                 raise ValueError(f"{s} cannot be cast to bool")
    303 
    304         scalars = [map_string(x) for x in strings]

ValueError: 1.0 cannot be cast to bool

Problem description

I would expect that 1.0, 0.0 and missing values (empty fields) can be cast to the nullable "boolean" dtype. When the same CSV is read with pd.read_csv() without a dtype argument, the column is inferred as float64. Why does dtype "boolean" not work for it?

Expected Output

>>> print(df)
   boolean_column  string_column
0            True              a
1           False              b
2            <NA>              c

Output of pd.show_versions()

INSTALLED VERSIONS

commit : None
python : 3.7.6.final.0
python-bits : 64
OS : Windows
OS-release : 10
machine : AMD64
processor : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : None.None

pandas : 1.0.4
numpy : 1.18.1
pytz : 2020.1
dateutil : 2.8.1
pip : 20.1.1
setuptools : 47.3.0.post20200616
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.15.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.2.1
numexpr : None
odfpy : None
openpyxl : 3.0.3
pandas_gbq : None
pyarrow : None
pytables : None
pytest : None
pyxlsb : None
s3fs : None
scipy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
xlsxwriter : None
numba : None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions