doc fixups; addl tests

chris-b1 · chris-b1 · commit 12547687bc16 · 2016-08-04T18:09:20.000-05:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -500,12 +500,14 @@ worth trying.
    data that was read in. It is important to note that the overall column will be
    marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes.
 
+.. _io.categorical:
+
 Specifying Categorical dtype
 ''''''''''''''''''''''''''''
 
 .. versionadded:: 0.19.0
 
-`Categorical` columns can be parsed directly by specifying `dtype='category'`
+``Categorical`` columns can be parsed directly by specifying ``dtype='category'``
 
 .. ipython :: python
 
@@ -515,25 +517,26 @@ Specifying Categorical dtype
    pd.read_csv(StringIO(data)).dtypes
    pd.read_csv(StringIO(data), dtype='category').dtypes
 
-Individual columns can be parsed as a `Categorical` using a dict specification
+Individual columns can be parsed as a ``Categorical`` using a dict specification
 
-.. ipython :: python
+.. ipython:: python
 
    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
 
 .. note::
 
    The resulting categories will always be parsed as string (object dtype).
-   Numeric categories can be converted using the :func:`pd.to_numeric` function.
+   If the categories are numeric they can be converted using the
+   :func:`to_numeric` function, or, as appropriate, another converter
+   such as :func:`to_datetime`.
 
-   .. ipython :: python
+   .. ipython:: python
 
       df = pd.read_csv(StringIO(data), dtype='category')
       df.dtypes
       df['col3']
       df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
       df['col3']
->>>>>>> undo type inference add docs and asv
 
 
 Naming and Using Columns
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -12,6 +12,7 @@ Highlights include:
 - :func:`merge_asof` for asof-style time-series joining, see :ref:`here <whatsnew_0190.enhancements.asof_merge>`
 - ``.rolling()`` are now time-series aware, see :ref:`here <whatsnew_0190.enhancements.rolling_ts>`
 - pandas development api, see :ref:`here <whatsnew_0190.dev_api>`
+- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here <whatsnew_0190.enhancements.read_csv_categorical>`
 
 .. contents:: What's new in v0.19.0
     :local:
@@ -233,34 +234,36 @@ New behaviour:
 
 .. _whatsnew_0190.enhancements.read_csv_categorical:
 
-:func:`read_csv` supports parsing `Categorical` directly
+:func:`read_csv` supports parsing ``Categorical`` directly
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The :func:`read_csv` function now supports parsing a `Categorical` column when
+The :func:`read_csv` function now supports parsing a ``Categorical`` column when
 specified as a dtype (:issue:`10153`).  Depending on the structure of the data,
-this can result in a faster parse time and lower memory usage, compared to
-converting to `Categorical` after parsing.
+this can result in a faster parse time and lower memory usage compared to
+converting to ``Categorical`` after parsing.  See the io :ref:`docs here <io.categorical>`.
 
-.. ipython :: python
+.. ipython:: python
 
    data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
 
    pd.read_csv(StringIO(data))
    pd.read_csv(StringIO(data)).dtypes
    pd.read_csv(StringIO(data), dtype='category').dtypes
 
-Individual columns can be parsed as a `Categorical` using a dict specification
+Individual columns can be parsed as a ``Categorical`` using a dict specification
 
-.. ipython :: python
+.. ipython:: python
 
    pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
 
 .. note::
 
    The resulting categories will always be parsed as string (object dtype).
-   Numeric categories can be converted using the :func:`pd.to_numeric` function.
+   If the categories are numeric they can be converted using the
+   :func:`to_numeric` function, or, as appropriate, another converter
+   such as :func:`to_datetime`.
 
-   .. ipython :: python
+   .. ipython:: python
 
       df = pd.read_csv(StringIO(data), dtype='category')
       df.dtypes
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -136,6 +136,11 @@ def test_passing_dtype(self):
                               dtype={'A': 'timedelta64', 'B': 'float64'},
                               index_col=0)
 
+            # valid but unsupported - fixed width unicode string
+            self.assertRaises(TypeError, self.read_csv, path,
+                              dtype={'A': 'U8'},
+                              index_col=0)
+
         # see gh-12048: empty frame
         actual = self.read_csv(StringIO('A,B'), dtype=str)
         expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str)
@@ -255,6 +260,23 @@ def test_categorical_dtype_encoding(self):
         actual = actual.apply(lambda x: x.astype(object))
         tm.assert_frame_equal(actual, expected)
 
+    def test_categorical_dtype_chunksize(self):
+        # GH 10153: chunks keep their row labels from the overall index
+        data = """a,b
+1,a
+1,b
+1,b
+2,c"""
+        expecteds = [pd.DataFrame({'a': [1, 1],
+                                   'b': Categorical(['a', 'b'])}),
+                     pd.DataFrame({'a': [1, 2],
+                                   'b': Categorical(['b', 'c'])},
+                                  index=[2, 3])]
+        actuals = self.read_csv(StringIO(data), dtype={'b': 'category'},
+                                chunksize=2)
+        for actual, expected in zip(actuals, expecteds):
+            tm.assert_frame_equal(actual, expected)
+
     def test_pass_dtype_as_recarray(self):
         if compat.is_platform_windows() and self.low_memory:
             raise nose.SkipTest(
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -1173,7 +1173,7 @@ cdef class TextReader:
         elif dtype.kind == 'U':
             width = dtype.itemsize
             if width > 0:
-                raise NotImplementedError("the dtype %s is not supported for parsing" % dtype)
+                raise TypeError("the dtype %s is not supported for parsing" % dtype)
 
             # unicode variable width
             return self._string_convert(i, start, end, na_filter,
@@ -1187,10 +1187,10 @@ cdef class TextReader:
         elif is_object_dtype(dtype):
             return self._string_convert(i, start, end, na_filter,
                                         na_hashset)
+        elif is_datetime64_dtype(dtype):
+            raise TypeError("the dtype %s is not supported for parsing, "
+                            "pass this column using parse_dates instead" % dtype)
         else:
-            if is_datetime64_dtype(dtype):
-                 raise TypeError("the dtype %s is not supported for parsing, "
-                                 "pass this column using parse_dates instead" % dtype)
             raise TypeError("the dtype %s is not supported for parsing" % dtype)
 
     cdef _string_convert(self, Py_ssize_t i, int start, int end,