Description
Hello, I've found a corner case where specifying a `category` dtype in `pd.read_csv` raises an error when it ought to return an empty DataFrame.
A small, complete example of the issue
pd.read_csv(StringIO(''), names=['a'], dtype={'a': 'object'}) #works
pd.read_csv(StringIO(''), names=['a'], dtype={'a': 'category'}) #breaks
Expected Output: an empty DataFrame whose column `a` has the `category` dtype. Actual output (traceback):
---------------------------------------------------------------------------
StopIteration Traceback (most recent call last)
/Users/dspitz/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
1506 try:
-> 1507 data = self._reader.read(nrows)
1508 except StopIteration:
pandas/parser.pyx in pandas.parser.TextReader.read (pandas/parser.c:10364)()
pandas/parser.pyx in pandas.parser.TextReader._read_low_memory (pandas/parser.c:11033)()
StopIteration:
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-13-819a0c4e04cb> in <module>()
----> 1 pd.read_csv(StringIO(''), header=None, names=['a'], dtype={'a': 'category'})
/Users/dspitz/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
643 skip_blank_lines=skip_blank_lines)
644
--> 645 return _read(filepath_or_buffer, kwds)
646
647 parser_f.__name__ = name
/Users/dspitz/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
398 return parser
399
--> 400 data = parser.read()
401 parser.close()
402 return data
/Users/dspitz/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
936 raise ValueError('skipfooter not supported for iteration')
937
--> 938 ret = self._engine.read(nrows)
939
940 if self.options.get('as_recarray'):
/Users/dspitz/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
1513 index, columns, col_dict = _get_empty_meta(
1514 names, self.index_col, self.index_names,
-> 1515 dtype=self.kwds.get('dtype'))
1516
1517 if self.usecols is not None:
/Users/dspitz/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in _get_empty_meta(columns, index_col, index_names, dtype)
2803 col_dict = dict((col_name,
2804 np.empty(0, dtype=dtype.get(col_name, np.object)))
-> 2805 for col_name in columns)
2806
2807 return index, columns, col_dict
/Users/dspitz/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in <genexpr>(.0)
2803 col_dict = dict((col_name,
2804 np.empty(0, dtype=dtype.get(col_name, np.object)))
-> 2805 for col_name in columns)
2806
2807 return index, columns, col_dict
TypeError: data type "category" not understood
It appears the problem is in `_get_empty_meta()`, where the `category` dtype is passed along to `np.empty`, which does not understand it. I don't know the idiomatic way to construct an empty Categorical series, but that is what needs to happen.
Output of pd.show_versions()
pandas: 0.19.1
nose: None
pip: 8.1.1
setuptools: 20.3
Cython: 0.24.1
numpy: 1.11.2
scipy: 0.18.1
statsmodels: None
xarray: None
IPython: 5.1.0
sphinx: None
patsy: None
dateutil: 2.5.3
pytz: 2016.3
blosc: None
bottleneck: None
tables: 3.2.2
numexpr: 2.6.1
matplotlib: 1.5.1
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: None
httplib2: None
apiclient: None
sqlalchemy: 1.0.12
pymysql: 0.6.7.None
psycopg2: 2.6.1 (dt dec pq3 ext)
jinja2: 2.8
boto: None
pandas_datareader: None