Description
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- (optional) I have confirmed this bug exists on the master branch of pandas.
Code Sample
import io
import pandas as pd
example_csv = """
boolean_column, string_column
1.0, a
0.0, b
,c"""
csv = io.StringIO(example_csv)
df = pd.read_csv(csv, dtype={"boolean_column": "boolean"})
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-c698a4bcbc2d> in <module>
1 csv = io.StringIO(example_csv)
----> 2 df = pd.read_csv(csv, dtype={"boolean_column": "boolean"})
c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
674 )
675
--> 676 return _read(filepath_or_buffer, kwds)
677
678 parser_f.__name__ = name
c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
452
453 try:
--> 454 data = parser.read(nrows)
455 finally:
456 parser.close()
c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1131 def read(self, nrows=None):
1132 nrows = _validate_integer("nrows", nrows)
-> 1133 ret = self._engine.read(nrows)
1134
1135 # May alter columns / col_dict
c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
2035 def read(self, nrows=None):
2036 try:
-> 2037 data = self._reader.read(nrows)
2038 except StopIteration:
2039 if self._first_chunk:
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype()
c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\core\arrays\boolean.py in _from_sequence_of_strings(cls, strings, dtype, copy)
302 raise ValueError(f"{s} cannot be cast to bool")
303
--> 304 scalars = [map_string(x) for x in strings]
305 return cls._from_sequence(scalars, dtype, copy)
306
c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\core\arrays\boolean.py in <listcomp>(.0)
302 raise ValueError(f"{s} cannot be cast to bool")
303
--> 304 scalars = [map_string(x) for x in strings]
305 return cls._from_sequence(scalars, dtype, copy)
306
c:\users\abbooda\appdata\local\continuum\miniconda3\envs\divi\lib\site-packages\pandas\core\arrays\boolean.py in map_string(s)
300 return False
301 else:
--> 302 raise ValueError(f"{s} cannot be cast to bool")
303
304 scalars = [map_string(x) for x in strings]
ValueError: 1.0 cannot be cast to bool
Problem description
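I would expect the values 1.0 and 0.0 and a missing value (the empty field in the last row) to be castable to the nullable "boolean" dtype, giving True, False and <NA>.
When pd.read_csv() is called without a dtype argument, the same column is read as float64, so the values themselves parse fine. Why does dtype="boolean" raise a ValueError instead of converting them?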
Expected Output
print(df)
>>> boolean_column string_column
0 True a
1 False b
2 <NA> c
Output of pd.show_versions()
INSTALLED VERSIONS
commit : None
python : 3.7.6.final.0
python-bits : 64
OS : Windows
OS-release : 10
machine : AMD64
processor : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : None.None
pandas : 1.0.4
numpy : 1.18.1
pytz : 2020.1
dateutil : 2.8.1
pip : 20.1.1
setuptools : 47.3.0.post20200616
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.15.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.2.1
numexpr : None
odfpy : None
openpyxl : 3.0.3
pandas_gbq : None
pyarrow : None
pytables : None
pytest : None
pyxlsb : None
s3fs : None
scipy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
xlsxwriter : None
numba : None