Skip to content

Commit 1254768

Browse files
committed
doc fixups; addl tests
1 parent 2490949 commit 1254768

File tree

4 files changed

+47
-19
lines changed

4 files changed

+47
-19
lines changed

doc/source/io.rst

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -500,12 +500,14 @@ worth trying.
500500
data that was read in. It is important to note that the overall column will be
501501
marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes.
502502

503+
.. _io.categorical:
504+
503505
Specifying Categorical dtype
504506
''''''''''''''''''''''''''''
505507

506508
.. versionadded:: 0.19.0
507509

508-
`Categorical` columns can be parsed directly by specifying `dtype='category'`
510+
``Categorical`` columns can be parsed directly by specifying ``dtype='category'``
509511

510512
.. ipython :: python
511513
@@ -515,25 +517,26 @@ Specifying Categorical dtype
515517
pd.read_csv(StringIO(data)).dtypes
516518
pd.read_csv(StringIO(data), dtype='category').dtypes
517519
518-
Individual columns can be parsed as a `Categorical` using a dict specification
520+
Individual columns can be parsed as a ``Categorical`` using a dict specification
519521

520-
.. ipython :: python
522+
.. ipython:: python
521523
522524
pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
523525
524526
.. note::
525527

526528
The resulting categories will always be parsed as string (object dtype).
527-
Numeric categories can be converted using the :func:`pd.to_numeric` function.
529+
If the categories are numeric they can be converted using the
530+
:func:`pd.to_numeric` function or, as appropriate, another converter
531+
such as :func:`pd.to_datetime`.
528532

529-
.. ipython :: python
533+
.. ipython:: python
530534
531535
df = pd.read_csv(StringIO(data), dtype='category')
532536
df.dtypes
533537
df['col3']
534538
df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
535539
df['col3']
536-
>>>>>>> undo type inference add docs and asv
537540
538541
539542
Naming and Using Columns

doc/source/whatsnew/v0.19.0.txt

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Highlights include:
1212
- :func:`merge_asof` for asof-style time-series joining, see :ref:`here <whatsnew_0190.enhancements.asof_merge>`
1313
- ``.rolling()`` is now time-series aware, see :ref:`here <whatsnew_0190.enhancements.rolling_ts>`
1414
- pandas development api, see :ref:`here <whatsnew_0190.dev_api>`
15+
- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here <whatsnew_0190.enhancements.read_csv_categorical>`
1516

1617
.. contents:: What's new in v0.19.0
1718
:local:
@@ -233,34 +234,36 @@ New behaviour:
233234

234235
.. _whatsnew_0190.enhancements.read_csv_categorical:
235236

236-
:func:`read_csv` supports parsing `Categorical` directly
237+
:func:`read_csv` supports parsing ``Categorical`` directly
237238
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
238239

239-
The :func:`read_csv` function now supports parsing a `Categorical` column when
240+
The :func:`read_csv` function now supports parsing a ``Categorical`` column when
240241
specified as a dtype (:issue:`10153`). Depending on the structure of the data,
241-
this can result in a faster parse time and lower memory usage, compared to
242-
converting to `Categorical` after parsing.
242+
this can result in a faster parse time and lower memory usage compared to
243+
converting to ``Categorical`` after parsing. See the io :ref:`docs here <io.categorical>`.
243244

244-
.. ipython :: python
245+
.. ipython:: python
245246

246247
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
247248

248249
pd.read_csv(StringIO(data))
249250
pd.read_csv(StringIO(data)).dtypes
250251
pd.read_csv(StringIO(data), dtype='category').dtypes
251252

252-
Individual columns can be parsed as a `Categorical` using a dict specification
253+
Individual columns can be parsed as a ``Categorical`` using a dict specification
253254

254-
.. ipython :: python
255+
.. ipython:: python
255256

256257
pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
257258

258259
.. note::
259260

260261
The resulting categories will always be parsed as string (object dtype).
261-
Numeric categories can be converted using the :func:`pd.to_numeric` function.
262+
If the categories are numeric they can be converted using the
263+
:func:`pd.to_numeric` function or, as appropriate, another converter
264+
such as :func:`pd.to_datetime`.
262265

263-
.. ipython :: python
266+
.. ipython:: python
264267

265268
df = pd.read_csv(StringIO(data), dtype='category')
266269
df.dtypes

pandas/io/tests/parser/c_parser_only.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,11 @@ def test_passing_dtype(self):
136136
dtype={'A': 'timedelta64', 'B': 'float64'},
137137
index_col=0)
138138

139+
# valid but unsupported - fixed width unicode string
140+
self.assertRaises(TypeError, self.read_csv, path,
141+
dtype={'A': 'U8'},
142+
index_col=0)
143+
139144
# see gh-12048: empty frame
140145
actual = self.read_csv(StringIO('A,B'), dtype=str)
141146
expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
@@ -255,6 +260,23 @@ def test_categorical_dtype_encoding(self):
255260
actual = actual.apply(lambda x: x.astype(object))
256261
tm.assert_frame_equal(actual, expected)
257262

263+
def test_categorical_dtype_chunksize(self):
264+
# GH 10153
265+
data = """a,b
266+
1,a
267+
1,b
268+
1,b
269+
2,c"""
270+
expecteds = [pd.DataFrame({'a': [1, 1],
271+
'b': Categorical(['a', 'b'])}),
272+
pd.DataFrame({'a': [1, 2],
273+
'b': Categorical(['b', 'c'])})]
274+
actuals = self.read_csv(StringIO(data), dtype={'b':'category'},
275+
chunksize=2)
276+
277+
for actual, expected in zip(actuals, expecteds):
278+
tm.assert_frame_equal(actual, expected)
279+
258280
def test_pass_dtype_as_recarray(self):
259281
if compat.is_platform_windows() and self.low_memory:
260282
raise nose.SkipTest(

pandas/parser.pyx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1173,7 +1173,7 @@ cdef class TextReader:
11731173
elif dtype.kind == 'U':
11741174
width = dtype.itemsize
11751175
if width > 0:
1176-
raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)
1176+
raise TypeError("the dtype %s is not supported for parsing" % dtype)
11771177

11781178
# unicode variable width
11791179
return self._string_convert(i, start, end, na_filter,
@@ -1187,10 +1187,10 @@ cdef class TextReader:
11871187
elif is_object_dtype(dtype):
11881188
return self._string_convert(i, start, end, na_filter,
11891189
na_hashset)
1190+
elif is_datetime64_dtype(dtype):
1191+
raise TypeError("the dtype %s is not supported for parsing, "
1192+
"pass this column using parse_dates instead" % dtype)
11901193
else:
1191-
if is_datetime64_dtype(dtype):
1192-
raise TypeError("the dtype %s is not supported for parsing, "
1193-
"pass this column using parse_dates instead" % dtype)
11941194
raise TypeError("the dtype %s is not supported for parsing" % dtype)
11951195

11961196
cdef _string_convert(self, Py_ssize_t i, int start, int end,

0 commit comments

Comments
 (0)